Compare commits

...

2371 Commits

Author SHA1 Message Date
Linus Torvalds
559e608c46 Merge tag 'ntfs3_for_6.19' of https://github.com/Paragon-Software-Group/linux-ntfs3
Pull ntfs3 updates from Konstantin Komarov:
 "New code:
   - support timestamps prior to epoch
   - do not overwrite uptodate pages
   - disable readahead for compressed files
   - setting of dummy blocksize to read boot_block when mounting
   - the run_lock initialization when loading $Extend
   - initialization of allocated memory before use
   - support for the NTFS3_IOC_SHUTDOWN ioctl
   - check for minimum alignment when performing direct I/O reads
   - check for shutdown in fsync

  Fixes:
   - mount failure for sparse runs in run_unpack()
   - use-after-free of sbi->options in cmp_fnames
   - KMSAN uninit bug after failed mi_read in mi_format_new
   - uninit error after buffer allocation by __getname()
   - KMSAN uninit-value in ni_create_attr_list
   - double free of sbi->options->nls and ownership of fc->fs_private
   - incorrect vcn adjustments in attr_collapse_range()
   - mode update when ACL can be reduced to mode
   - memory leaks in add sub record

  Changes:
   - refactor code, updated terminology, spelling
   - do not kmap pages in (de)compression code
   - after ntfs_look_free_mft(), code that fails must put mft_inode
   - default mount options for "acl" and "prealloc"

  Replaced:
   - use unsafe_memcpy() to avoid memcpy size warning
   - ntfs_bio_pages with page cache for compressed files"

* tag 'ntfs3_for_6.19' of https://github.com/Paragon-Software-Group/linux-ntfs3: (26 commits)
  fs/ntfs3: check for shutdown in fsync
  fs/ntfs3: change the default mount options for "acl" and "prealloc"
  fs/ntfs3: Prevent memory leaks in add sub record
  fs/ntfs3: out1 also needs to put mi
  fs/ntfs3: Fix spelling mistake "recommened" -> "recommended"
  fs/ntfs3: update mode in xattr when ACL can be reduced to mode
  fs/ntfs3: check minimum alignment for direct I/O
  fs/ntfs3: implement NTFS3_IOC_SHUTDOWN ioctl
  fs/ntfs3: correct attr_collapse_range when file is too fragmented
  ntfs3: fix double free of sbi->options->nls and clarify ownership of fc->fs_private
  fs/ntfs3: Initialize allocated memory before use
  fs/ntfs3: remove ntfs_bio_pages and use page cache for compressed I/O
  ntfs3: avoid memcpy size warning
  fs/ntfs3: fix KMSAN uninit-value in ni_create_attr_list
  ntfs3: init run lock for extend inode
  ntfs: set dummy blocksize to read boot_block when mounting
  fs/ntfs3: disable readahead for compressed files
  ntfs3: Fix uninit buffer allocated by __getname()
  ntfs3: fix uninit memory after failed mi_read in mi_format_new
  ntfs3: fix use-after-free of sbi->options in cmp_fnames
  ...
2025-12-03 20:45:43 -08:00
Linus Torvalds
fbeea4db51 Merge tag 'ext4_for_linus-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
 "New features and improvements for the ext4 file system:
   - Optimize online defragmentation by using folios instead of
     individual buffer heads
   - Improve error codes stored in the superblock when the journal
     aborts
   - Minor cleanups and clarifications in ext4_map_blocks()
   - Add documentation of the casefold and encrypt flags
   - Add support for file systems with a blocksize greater than the
     pagesize
   - Improve performance by caching the fact that an inode does not
     have a POSIX ACL

  Various Bug Fixes:
   - Fix false positive complaints from smatch
   - Fix the error code returned by ext4fs_dirhash() when SipHash is
     used without the encryption key
   - Fix races when writing to inline data files which could trigger a
     BUG
   - Fix potential NULL dereference when there is a corrupt file system
     with an extended attribute value stored in an inode
   - Fix false positive lockdep report when syzbot uses ext4 and ocfs2
     together
   - Fix false positive reported by DEPT by adjusting lock annotation
   - Avoid a potential BUG_ON in jbd2 when a file system is massively
     corrupted
   - Fix a WARN_ON when the superblock is corrupted with a non-NULL
     terminated mount options field
   - Add a check for userspace passing a non-NULL terminated mount
     options field to EXT4_IOC_SET_TUNE_SB_PARAM
   - Fix a potential journal checksum failure when a file system is
     copied while it is mounted read-only
   - Fix a potential orphan file tracking error which only showed up on
     32-bit systems
   - Fix assertion checks in mballoc (which have to be explicitly
     enabled by manually enabling AGGRESSIVE_CHECKS and recompiling)
   - Avoid complaining about overly large orphan files created by mke2fs
     for file systems with a 64k block size"

* tag 'ext4_for_linus-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (58 commits)
  ext4: mark inodes without acls in __ext4_iget()
  ext4: enable block size larger than page size
  ext4: add checks for large folio incompatibilities when BS > PS
  ext4: support verifying data from large folios with fs-verity
  ext4: make data=journal support large block size
  ext4: support large block size in __ext4_block_zero_page_range()
  ext4: support large block size in mpage_prepare_extent_to_map()
  ext4: support large block size in mpage_map_and_submit_buffers()
  ext4: support large block size in ext4_block_write_begin()
  ext4: support large block size in ext4_mpage_readpages()
  ext4: rename 'page' references to 'folio' in multi-block allocator
  ext4: prepare buddy cache inode for BS > PS with large folios
  ext4: support large block size in ext4_mb_init_cache()
  ext4: support large block size in ext4_mb_get_buddy_page_lock()
  ext4: support large block size in ext4_mb_load_buddy_gfp()
  ext4: add EXT4_LBLK_TO_PG and EXT4_PG_TO_LBLK for block/page conversion
  ext4: add EXT4_LBLK_TO_B macro for logical block to bytes conversion
  ext4: support large block size in ext4_readdir()
  ext4: support large block size in ext4_calculate_overhead()
  ext4: introduce s_min_folio_order for future BS > PS support
  ...
2025-12-03 20:37:15 -08:00
Linus Torvalds
afcbce74f3 Merge tag 'gfs2-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2
Pull gfs2 updates from Andreas Gruenbacher:

 - Major withdraw / error handling overhaul based on dlm's new
   DLM_RELEASE_RECOVER feature: this allows gfs2 to treat withdraws like
   node failures and makes withdraws asynchronous

 - Fix a bug in commit e4a8b5481c that caused 'df' to remain out of
   sync. ('df' is still allowed to go slightly out of sync for short
   periods of time)

 - Prevent recursive memory reclaim in gfs2_unstuff_dinode()

 - Clean up SDF_JOURNAL_LIVE flag handling

 - Fix remote evict for read-only filesystems

 - Fix a misuse of bio_chain()

 - Various other minor cleanups

* tag 'gfs2-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2: (35 commits)
  gfs2: Fix use of bio_chain
  gfs2: Clean up SDF_JOURNAL_LIVE flag handling
  gfs2: No longer thaw filesystems during a withdraw
  gfs2: Withdraw immediately in gfs2_trans_add_meta
  gfs2: New gfs2_withdraw_helper
  gfs2: Clean up properly during a withdraw
  gfs2: Rename gfs2_{gl_dq_holders => withdraw_glocks}
  Revert "gfs2: fix infinite loop when checking ail item count before go_inval"
  Revert "gfs2: Allow some glocks to be used during withdraw"
  Revert "gfs2: Check for log write errors before telling dlm to unlock"
  Revert "gfs2: fix a deadlock on withdraw-during-mount"
  Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (6/6)
  Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (5/6)
  Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (4/6)
  Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (3/6)
  Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (2/6)
  Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (1/6)
  Revert "gfs2: don't stop reads while withdraw in progress"
  gfs2: Rename LM_FLAG_{NOEXP -> RECOVER}
  gfs2: Kill gfs2_io_error_bh_wd
  ...
2025-12-03 20:28:50 -08:00
Linus Torvalds
869737543b Merge tag 'v6.19-rc-smb-fixes' of git://git.samba.org/ksmbd
Pull smb client and server updates from Steve French:

 - server fixes:
     - IPC use after free locking fix
     - fix locking bug in delete paths
     - fix use after free in disconnect
     - fix underflow in locking check
     - error mapping improvement
     - socket listening improvement
     - return code mapping fixes
     - crypto improvements (use default libraries)

 - cleanup patches:
     - netfs
     - client checkpatch cleanup
     - server cleanup
     - move server/client duplicate code to common code
     - fix some defines to better match protocol specification

  - smbdirect (RDMA) fixes

  - client debugging improvements for leases

* tag 'v6.19-rc-smb-fixes' of git://git.samba.org/ksmbd: (44 commits)
  cifs: Use netfs_alloc/free_folioq_buffer()
  smb: client: show smb lease key in open_dirs output
  smb: client: show smb lease key in open_files output
  ksmbd: ipc: fix use-after-free in ipc_msg_send_request
  smb: client: relax WARN_ON_ONCE(SMBDIRECT_SOCKET_*) checks in recv_done() and smbd_conn_upcall()
  smb: server: relax WARN_ON_ONCE(SMBDIRECT_SOCKET_*) checks in recv_done() and smb_direct_cm_handler()
  smb: smbdirect: introduce SMBDIRECT_CHECK_STATUS_{WARN,DISCONNECT}()
  smb: smbdirect: introduce SMBDIRECT_DEBUG_ERR_PTR() helper
  ksmbd: vfs: fix race on m_flags in vfs_cache
  ksmbd: Replace strcpy + strcat to improve convert_to_nt_pathname
  smb: move FILE_SYSTEM_ATTRIBUTE_INFO to common/fscc.h
  ksmbd: implement error handling for STATUS_INFO_LENGTH_MISMATCH in smb server
  ksmbd: fix use-after-free in ksmbd_tree_connect_put under concurrency
  ksmbd: server: avoid busy polling in accept loop
  smb: move create_durable_reconn to common/smb2pdu.h
  smb: fix some warnings reported by scripts/checkpatch.pl
  smb: do some cleanups
  smb: move FILE_SYSTEM_SIZE_INFO to common/fscc.h
  smb: move some duplicate struct definitions to common/fscc.h
  smb: move list of FileSystemAttributes to common/fscc.h
  ...
2025-12-03 20:23:41 -08:00
Linus Torvalds
3ed1c68307 Merge tag 'xfs-merge-6.19' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs updates from Carlos Maiolino:
 "There are no major changes in xfs. This contains mostly some code
  cleanups, a few bug fixes, and a documentation update. Highlights are:

   - Quota locking cleanup

   - Getting rid of old xlog_in_core_2_t type"

* tag 'xfs-merge-6.19' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (33 commits)
  docs: remove obsolete links in the xfs online repair documentation
  xfs: move some code out of xfs_iget_recycle
  xfs: use zi more in xfs_zone_gc_mount
  xfs: remove the unused bv field in struct xfs_gc_bio
  xfs: remove xarray mark for reclaimable zones
  xfs: remove the xlog_in_core_t typedef
  xfs: remove l_iclog_heads
  xfs: remove the xlog_rec_header_t typedef
  xfs: remove xlog_in_core_2_t
  xfs: remove a very outdated comment from xlog_alloc_log
  xfs: cleanup xlog_alloc_log a bit
  xfs: don't use xlog_in_core_2_t in struct xlog_in_core
  xfs: add a on-disk log header cycle array accessor
  xfs: add a XLOG_CYCLE_DATA_SIZE constant
  xfs: reduce ilock roundtrips in xfs_qm_vop_dqalloc
  xfs: move xfs_dquot_tree calls into xfs_qm_dqget_cache_{lookup,insert}
  xfs: move quota locking into xrep_quota_item
  xfs: move quota locking into xqcheck_commit_dquot
  xfs: move q_qlock locking into xqcheck_compare_dquot
  xfs: move q_qlock locking into xchk_quota_item
  ...
2025-12-03 20:19:38 -08:00
Linus Torvalds
477e31fd1e Merge tag 'erofs-for-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs
Pull erofs updates from Gao Xiang:

 - Fix a WARNING caused by a recent FSDAX misdetection regression

 - Fix the filesystem stacking limit for file-backed mounts

 - Print more informative diagnostics on decompression errors

 - Switch the on-disk definition `erofs_fs.h` to the MIT license

 - Minor cleanups

* tag 'erofs-for-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs:
  erofs: switch on-disk header `erofs_fs.h` to MIT license
  erofs: get rid of raw bi_end_io() usage
  erofs: enable error reporting for z_erofs_fixup_insize()
  erofs: enable error reporting for z_erofs_stream_switch_bufs()
  erofs: improve Zstd, LZMA and DEFLATE error strings
  erofs: improve decompression error reporting
  erofs: tidy up z_erofs_lz4_handle_overlap()
  erofs: limit the level of fs stacking for file-backed mounts
  erofs: correct FSDAX detection
2025-12-03 20:14:44 -08:00
Linus Torvalds
ca010e2ef6 Merge tag 'hfs-v6.19-tag1' of git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs
Pull hfs/hfsplus updates from Viacheslav Dubeyko:
 "Several fixes for syzbot reported issues, HFS/HFS+ fixes of xfstests
  failures, Kunit-based unit-tests introduction, and code cleanup:

    - Dan Carpenter fixed a potential use-after-free issue in the
      hfs_correct_next_unused_CNID() method. Tetsuo Handa made a nice
      fix for a syzbot-reported issue related to incorrect inode->i_mode
      management when the volume has been corrupted. Yang Chenzhi made a
      really good fix for a potential race condition in the
      __hfs_bnode_create() method for the HFS+ file system.

    - Several fixes for xfstests failures. In particular, the
      generic/070, generic/073, and generic/101 test cases now pass for
      the HFS+ file system.

   - HFS and HFS+ drivers share multiple structures of on-disk layout
     declarations. Some structures are used without any change. However,
     we had two independent declarations of the same structures in HFS
     and HFS+ drivers.

     The on-disk layout declarations have been moved into
      include/linux/hfs_common.h with the goal of eliminating the
      duplicated declarations and keeping the HFS/HFS+ on-disk layout
     declarations in one place.

      Also, this patch prepares the basis for creating an hfslib that
      can aggregate common functionality without the need to duplicate
      the same code in the HFS and HFS+ drivers.

    - HFS/HFS+ really need unit tests because of the multiple xfstests
      failures. The first two patches introduce KUnit-based unit tests
      for the case string operations in the HFS/HFS+ file system drivers"

* tag 'hfs-v6.19-tag1' of git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs:
  hfs/hfsplus: move on-disk layout declarations into hfs_common.h
  hfsplus: fix volume corruption issue for generic/101
  hfsplus: introduce KUnit tests for HFS+ string operations
  hfs: introduce KUnit tests for HFS string operations
  hfsplus: fix volume corruption issue for generic/073
  hfsplus: Verify inode mode when loading from disk
  hfsplus: fix volume corruption issue for generic/070
  hfs/hfsplus: prevent getting negative values of offset/length
  hfsplus: fix missing hfs_bnode_get() in __hfs_bnode_create
  hfs: fix potential use after free in hfs_correct_next_unused_CNID()
2025-12-03 20:08:32 -08:00
Linus Torvalds
7696286034 Merge tag 'for-6.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
 "Features:

   - shutdown ioctl support (needs CONFIG_BTRFS_EXPERIMENTAL for now):
      - set filesystem state as being shut down (also named going down
        in other filesystems), where all active operations return EIO
        and this cannot be changed until unmount
       - an attempt is made to finish pending operations, but error
         messages may still show up depending on where exactly the
         shutdown happened

   - scrub (and device replace) vs suspend/hibernate:
      - a running scrub will prevent suspend, which can be annoying as
        suspend is an immediate request and scrub is not critical
      - filesystem freezing before suspend was not sufficient as the
        problem was in process freezing
      - behaviour change: on suspend scrub and device replace are
        cancelled, where scrub can record the last state and continue
        from there; the device replace has to be restarted from the
        beginning

   - zone stats exported in sysfs, from the perspective of the
     filesystem this includes active, reclaimable, relocation etc zones

  Performance:

   - improvements when processing space reservation tickets by
     optimizing locking and shrinking critical sections, cumulative
     improvements in lockstat numbers show +15%

  Notable fixes:

   - use vmalloc fallback when allocating bios as high order allocations
     can happen with wide checksums (like sha256)

   - scrub will always track the last position of progress so it's not
     starting from zero after an error

  Core:

    - under experimental config, checksum calculations are offloaded to
      process context, which simplifies locking and allows removing the
      compression write worker kthread(s):
      - speed improvement in direct IO throughput with buffered IO
        fallback is +15% when not offloaded but this is more related to
        internal crypto subsystem improvements
       - this will probably become the default in the future, removing
         the sysfs tunable

   - (experimental) block size > page size updates:
      - support more operations when not using large folios (encoded
        read/write and send)
      - raid56

   - more preparations for fscrypt support

  Other:

   - more conversions to auto-cleaned variables

   - parameter cleanups and removals

   - extended warning fixes

   - improved printing of structured values like keys

   - lots of other cleanups and refactoring"

* tag 'for-6.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (147 commits)
  btrfs: remove unnecessary inode key in btrfs_log_all_parents()
  btrfs: remove redundant zero/NULL initializations in btrfs_alloc_root()
  btrfs: remaining BTRFS_PATH_AUTO_FREE conversions
  btrfs: send: do not allocate memory for xattr data when checking it exists
  btrfs: send: add unlikely to all unexpected overflow checks
  btrfs: reduce arguments to btrfs_del_inode_ref_in_log()
  btrfs: remove root argument from btrfs_del_dir_entries_in_log()
  btrfs: use test_and_set_bit() in btrfs_delayed_delete_inode_ref()
  btrfs: don't search back for dir inode item in INO_LOOKUP_USER
  btrfs: don't rewrite ret from inode_permission
  btrfs: add orig_logical to btrfs_bio for encryption
  btrfs: disable verity on encrypted inodes
  btrfs: disable various operations on encrypted inodes
  btrfs: remove redundant level reset in btrfs_del_items()
  btrfs: simplify leaf traversal after path release in btrfs_next_old_leaf()
  btrfs: optimize balance_level() path reference handling
  btrfs: factor out root promotion logic into promote_child_to_root()
  btrfs: raid56: remove the "_step" infix
  btrfs: raid56: enable bs > ps support
  btrfs: raid56: prepare finish_parity_scrub() to support bs > ps cases
  ...
2025-12-03 20:03:46 -08:00
Linus Torvalds
cc25df3e2e Merge tag 'for-6.19/block-20251201' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe:

 - Fix head insertion for mq-deadline, a regression from when priority
   support was added

 - Series simplifying and improving the ublk user copy code

 - Various ublk related cleanups

 - Fixup REQ_NOWAIT handling in loop/zloop, clearing NOWAIT when the
   request is punted to a thread for handling

 - Merge and then later revert loop dio nowait support, as it ended up
   causing excessive stack usage when the inline issue code needs to
   dip back into the full file system code

 - Improve auto integrity code, making it less deadlock prone

 - Speed up polled IO handling by manually managing the hctx lookups

 - Fixes for blk-throttle for SSD devices

 - Small series with fixes for the S390 dasd driver

 - Add support for caching zones, avoiding unnecessary report zone
   queries

 - MD pull requests via Yu:
      - fix null-ptr-dereference regression for dm-raid0
      - fix IO hang for raid5 when array is broken with IO inflight
      - remove legacy 1s delay to speed up system shutdown
      - change maintainer's email address
      - data can be lost if an array is created with devices of
        different lbs; fix this problem and record the lbs of the array
        in metadata
      - fix rcu protection for md_thread
      - fix mddev kobject lifetime regression
      - enable atomic writes for md-linear
      - some cleanups

 - bcache updates via Coly
      - remove useless discard and cache device code
      - improve usage of per-cpu workqueues

 - Reorganize the IO scheduler switching code, fixing some lockdep
   reports as well

 - Improve the block layer P2P DMA support

 - Add support to the block tracing code for zoned devices

 - Segment calculation improvements, and memory alignment flexibility
   improvements

 - Set of prep and cleanup patches for ublk batching support. The
   actual batching hasn't been added yet, but this helps shrink down the
   workload of getting that patchset ready for 6.20

 - Fix for how the ps3 block driver handles segment offsets

 - Improve how block plugging handles batch tag allocations

 - nbd fixes for use-after-free of the configuration on device clear/put

 - Set of improvements and fixes for zloop

 - Add Damien as maintainer of the block zoned device code handling

 - Various other fixes and cleanups

* tag 'for-6.19/block-20251201' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (162 commits)
  block/rnbd: correct all kernel-doc complaints
  blk-mq: use queue_hctx in blk_mq_map_queue_type
  md: remove legacy 1s delay in md_notify_reboot
  md/raid5: fix IO hang when array is broken with IO inflight
  md: warn about updating super block failure
  md/raid0: fix NULL pointer dereference in create_strip_zones() for dm-raid
  sbitmap: fix all kernel-doc warnings
  ublk: add helper of __ublk_fetch()
  ublk: pass const pointer to ublk_queue_is_zoned()
  ublk: refactor auto buffer register in ublk_dispatch_req()
  ublk: add `union ublk_io_buf` with improved naming
  ublk: add parameter `struct io_uring_cmd *` to ublk_prep_auto_buf_reg()
  kfifo: add kfifo_alloc_node() helper for NUMA awareness
  blk-mq: fix potential uaf for 'queue_hw_ctx'
  blk-mq: use array manage hctx map instead of xarray
  ublk: prevent invalid access with DEBUG
  s390/dasd: Use scnprintf() instead of sprintf()
  s390/dasd: Move device name formatting into separate function
  s390/dasd: Remove unnecessary debugfs_create() return checks
  s390/dasd: Fix gendisk parent after copy pair swap
  ...
2025-12-03 19:26:18 -08:00
Linus Torvalds
0abcfd8983 Merge tag 'for-6.19/io_uring-20251201' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe:

 - Unify how task_work cancelations are detected, placing it in the
   task_work running state rather than needing to check the task state

 - Series cleaning up and moving the cancelation code to where it
   belongs, in cancel.c

 - Cleanup of waitid and futex argument handling

 - Add support for mixed sized SQEs. 6.18 added support for mixed sized
   CQEs, improving flexibility and efficiency of workloads that need big
   CQEs. This adds similar support for SQEs, where the occasional need
   for a 128b SQE doesn't necessitate having all SQEs be 128b in size

 - Introduce zcrx and SQ/CQ layout queries. The former returns what zcrx
   features are available. And both return the ring size information to
   help with allocation size calculation for user provided rings like
   IORING_SETUP_NO_MMAP and IORING_MEM_REGION_TYPE_USER

 - Zcrx updates for 6.19. It includes a bunch of small patches,
   IORING_REGISTER_ZCRX_CTRL and RQ flushing and David's work on sharing
   zcrx b/w multiple io_uring instances

 - Series cleaning up ring initializations, notably deduplicating ring
   size and offset calculations. It also moves most of the checking
   before doing any allocations, making the code simpler

 - Add support for getsockname and getpeername, which is mostly a
   trivial hookup after a bit of refactoring on the networking side

 - Various fixes and cleanups

* tag 'for-6.19/io_uring-20251201' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (68 commits)
  io_uring: Introduce getsockname io_uring cmd
  socket: Split out a getsockname helper for io_uring
  socket: Unify getsockname and getpeername implementation
  io_uring/query: drop unused io_handle_query_entry() ctx arg
  io_uring/kbuf: remove obsolete buf_nr_pages and update comments
  io_uring/register: use correct location for io_rings_layout
  io_uring/zcrx: share an ifq between rings
  io_uring/zcrx: add io_fill_zcrx_offsets()
  io_uring/zcrx: export zcrx via a file
  io_uring/zcrx: move io_zcrx_scrub() and dependencies up
  io_uring/zcrx: count zcrx users
  io_uring/zcrx: add sync refill queue flushing
  io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL
  io_uring/zcrx: elide passing msg flags
  io_uring/zcrx: use folio_nr_pages() instead of shift operation
  io_uring/zcrx: convert to use netmem_desc
  io_uring/query: introduce rings info query
  io_uring/query: introduce zcrx query
  io_uring: move cq/sq user offset init around
  io_uring: pre-calculate scq layout
  ...
2025-12-03 18:58:57 -08:00
Linus Torvalds
8f7aa3d3c7 Merge tag 'net-next-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski:
 "Core & protocols:

   - Replace busylock at the Tx queuing layer with a lockless list.

     Resulting in a 300% (4x) improvement on heavy TX workloads, sending
      twice the number of packets per second, for half the CPU cycles.

   - Allow constantly busy flows to migrate to a more suitable CPU/NIC
     queue.

     Normally we perform queue re-selection when flow comes out of idle,
     but under extreme circumstances the flows may be constantly busy.

     Add sysctl to allow periodic rehashing even if it'd risk packet
     reordering.

   - Optimize the NAPI skb cache, make it larger, use it in more paths.

   - Attempt returning Tx skbs to the originating CPU (like we already
     did for Rx skbs).

   - Various data structure layout and prefetch optimizations from Eric.

    - Remove ktime_get() from the recvmsg() fast path; ktime_get() is
     sadly quite expensive on recent AMD machines.

   - Extend threaded NAPI polling to allow the kthread busy poll for
     packets.

   - Make MPTCP use Rx backlog processing. This lowers the lock
     pressure, improving the Rx performance.

   - Support memcg accounting of MPTCP socket memory.

   - Allow admin to opt sockets out of global protocol memory accounting
     (using a sysctl or BPF-based policy). The global limits are a poor
     fit for modern container workloads, where limits are imposed using
     cgroups.

   - Improve heuristics for when to kick off AF_UNIX garbage collection.

   - Allow users to control TCP SACK compression, and default to 33% of
     RTT.

   - Add tcp_rcvbuf_low_rtt sysctl to let datacenter users avoid
      unnecessarily aggressive rcvbuf growth and overshoot when the
     connection RTT is low.

   - Preserve skb metadata space across skb_push / skb_pull operations.

   - Support for IPIP encapsulation in the nftables flowtable offload.

   - Support appending IP interface information to ICMP messages (RFC
     5837).

   - Support setting max record size in TLS (RFC 8449).

   - Remove taking rtnl_lock from RTM_GETNEIGHTBL and RTM_SETNEIGHTBL.

   - Use a dedicated lock (and RCU) in MPLS, instead of rtnl_lock.

   - Let users configure the number of write buffers in SMC.

   - Add new struct sockaddr_unsized for sockaddr of unknown length,
     from Kees.

   - Some conversions away from the crypto_ahash API, from Eric Biggers.

   - Some preparations for slimming down struct page.

   - YAML Netlink protocol spec for WireGuard.

   - Add a tool on top of YAML Netlink specs/lib for reporting commonly
     computed derived statistics and summarized system state.

  Driver API:

   - Add CAN XL support to the CAN Netlink interface.

   - Add uAPI for reporting PHY Mean Square Error (MSE) diagnostics, as
     defined by the OPEN Alliance's "Advanced diagnostic features for
     100BASE-T1 automotive Ethernet PHYs" specification.

   - Add DPLL phase-adjust-gran pin attribute (and implement it in
     zl3073x).

   - Refactor xfrm_input lock to reduce contention when NIC offloads
     IPsec and performs RSS.

   - Add info to devlink params whether the current setting is the
     default or a user override. Allow resetting back to default.

   - Add standard device stats for PSP crypto offload.

   - Leverage DSA frame broadcast to implement simple HSR frame
     duplication for a lot of switches without dedicated HSR offload.

   - Add uAPI defines for 1.6Tbps link modes.

  Device drivers:

   - Add Motorcomm YT921x gigabit Ethernet switch support.

   - Add MUCSE driver for N500/N210 1GbE NIC series.

   - Convert drivers to support dedicated ops for timestamping control,
     and away from the direct IOCTL handling. While at it support GET
     operations for PHY timestamping.

   - Add (and convert most drivers to) a dedicated ethtool callback for
     reading the Rx ring count.

   - Significant refactoring efforts in the STMMAC driver, which
     supports Synopsys turn-key MAC IP integrated into a ton of SoCs.

   - Ethernet high-speed NICs:
      - Broadcom (bnxt):
         - support PPS in/out on all pins
      - Intel (100G, ice, idpf):
         - ice: implement standard ethtool and timestamping stats
         - i40e: support setting the max number of MAC addresses per VF
         - iavf: support RSS of GTP tunnels for 5G and LTE deployments
      - nVidia/Mellanox (mlx5):
         - reduce downtime on interface reconfiguration
         - disable being an XDP redirect target by default (same as
           other drivers) to avoid wasting resources if feature is
           unused
      - Meta (fbnic):
         - add support for Linux-managed PCS on 25G, 50G, and 100G links
      - Wangxun:
         - support Rx descriptor merge, and Tx head writeback
         - support Rx coalescing offload
          - support 25G SFP and 40G QSFP modules

   - Ethernet virtual:
      - Google (gve):
         - allow ethtool to configure rx_buf_len
         - implement XDP HW RX Timestamping support for DQ descriptor
           format
      - Microsoft vNIC (mana):
         - support HW link state events
         - handle hardware recovery events when probing the device

   - Ethernet NICs consumer, and embedded:
      - usbnet: add support for Byte Queue Limits (BQL)
      - AMD (amd-xgbe):
         - add device selftests
      - NXP (enetc):
         - add i.MX94 support
      - Broadcom integrated MACs (bcmgenet, bcmasp):
         - bcmasp: add support for PHY-based Wake-on-LAN
      - Broadcom switches (b53):
         - support port isolation
         - support BCM5389/97/98 and BCM63XX ARL formats
      - Lantiq/MaxLinear switches:
         - support bridge FDB entries on the CPU port
         - use regmap for register access
         - allow user to enable/disable learning
         - support Energy Efficient Ethernet
         - support configuring RMII clock delays
         - add tagging driver for MaxLinear GSW1xx switches
      - Synopsys (stmmac):
         - support using the HW clock in free running mode
         - add Eswin EIC7700 support
         - add Rockchip RK3506 support
         - add Altera Agilex5 support
      - Cadence (macb):
         - cleanup and consolidate descriptor and DMA address handling
         - add EyeQ5 support
      - TI:
         - icssg-prueth: support AF_XDP
      - Airoha access points:
         - add missing Ethernet stats and link state callback
         - add AN7583 support
         - support out-of-order Tx completion processing
      - Power over Ethernet:
         - pd692x0: preserve PSE configuration across reboots
         - add support for TPS23881B devices

   - Ethernet PHYs:
      - Open Alliance OATC14 10BASE-T1S PHY cable diagnostic support
      - Support 50G SerDes and 100G interfaces in Linux-managed PHYs
      - micrel:
         - support for non PTP SKUs of lan8814
         - enable in-band auto-negotiation on lan8814
      - realtek:
         - cable testing support on RTL8224
         - interrupt support on RTL8221B
      - motorcomm: support for PHY LEDs on YT853
      - microchip: support for LAN867X Rev.D0 PHYs w/ SQI and cable diag
      - mscc: support for PHY LED control

   - CAN drivers:
      - m_can: add support for optional reset and system wake up
      - remove can_change_mtu() obsoleted by core handling
      - mcp251xfd: support GPIO controller functionality

   - Bluetooth:
       - add initial support for PAST

   - WiFi:
      - split ieee80211.h file, it's way too big
      - improvements in VHT radiotap reporting, S1G, Channel Switch
        Announcement handling, rate tracking in mesh networks
      - improve multi-radio monitor mode support, and add a cfg80211
        debugfs interface for it
      - HT action frame handling on 6 GHz
      - initial chanctx work towards NAN
      - MU-MIMO sniffer improvements

   - WiFi drivers:
      - RealTek (rtw89):
         - support USB devices RTL8852AU and RTL8852CU
         - initial work for RTL8922DE
         - improved injection support
      - Intel:
         - iwlwifi: new sniffer API support
      - MediaTek (mt76):
         - WED support for >32-bit DMA
         - airoha NPU support
         - regdomain improvements
         - continued WiFi7/MLO work
      - Qualcomm/Atheros:
         - ath10k: factory test support
         - ath11k: TX power insertion support
         - ath12k: BSS color change support
         - ath12k: statistics improvements
      - brcmfmac: Acer A1 840 tablet quirk
      - rtl8xxxu: 40 MHz connection fixes/support"

* tag 'net-next-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1381 commits)
  net: page_pool: sanitise allocation order
  net: page pool: xa init with destroy on pp init
  net/mlx5e: Support XDP target xmit with dummy program
  net/mlx5e: Update XDP features in switch channels
  selftests/tc-testing: Test CAKE scheduler when enqueue drops packets
  net/sched: sch_cake: Fix incorrect qlen reduction in cake_drop
  wireguard: netlink: generate netlink code
  wireguard: uapi: generate header with ynl-gen
  wireguard: uapi: move flag enums
  wireguard: uapi: move enum wg_cmd
  wireguard: netlink: add YNL specification
  selftests: drv-net: Fix tolerance calculation in devlink_rate_tc_bw.py
  selftests: drv-net: Fix and clarify TC bandwidth split in devlink_rate_tc_bw.py
  selftests: drv-net: Set shell=True for sysfs writes in devlink_rate_tc_bw.py
  selftests: drv-net: Use Iperf3Runner in devlink_rate_tc_bw.py
  selftests: drv-net: introduce Iperf3Runner for measurement use cases
  selftests: drv-net: Add devlink_rate_tc_bw.py to TEST_PROGS
  net: ps3_gelic_net: Use napi_alloc_skb() and napi_gro_receive()
  Documentation: net: dsa: mention simple HSR offload helpers
  Documentation: net: dsa: mention availability of RedBox
  ...
2025-12-03 17:24:33 -08:00
Linus Torvalds
015e7b0b0e Merge tag 'bpf-next-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf updates from Alexei Starovoitov:

 - Convert selftests/bpf/test_tc_edt and test_tc_tunnel from .sh to
   test_progs runner (Alexis Lothoré)

 - Convert selftests/bpf/test_xsk to test_progs runner (Bastien
   Curutchet)

 - Replace bpf memory allocator with kmalloc_nolock() in
   bpf_local_storage (Amery Hung), and in bpf streams and range tree
   (Puranjay Mohan)

 - Introduce support for indirect jumps in BPF verifier and x86 JIT
   (Anton Protopopov) and arm64 JIT (Puranjay Mohan)

 - Remove runqslower bpf tool (Hoyeon Lee)

 - Fix corner cases in the verifier to close several syzbot reports
   (Eduard Zingerman, KaFai Wan)

 - Several improvements in deadlock detection in rqspinlock (Kumar
   Kartikeya Dwivedi)

 - Implement "jmp" mode for BPF trampoline and corresponding
   DYNAMIC_FTRACE_WITH_JMP. It improves "fexit" program type performance
   from 80 M/s to 136 M/s. With Steven's Ack. (Menglong Dong)

 - Add ability to test non-linear skbs in BPF_PROG_TEST_RUN (Paul
   Chaignon)

 - Do not let BPF_PROG_TEST_RUN emit invalid GSO types to stack (Daniel
   Borkmann)

 - Generalize buildid reader into bpf_dynptr (Mykyta Yatsenko)

 - Optimize bpf_map_update_elem() for map-in-map types (Ritesh
   Oedayrajsingh Varma)

 - Introduce overwrite mode for BPF ring buffer (Xu Kuohai)

* tag 'bpf-next-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (169 commits)
  bpf: optimize bpf_map_update_elem() for map-in-map types
  bpf: make kprobe_multi_link_prog_run always_inline
  selftests/bpf: do not hardcode target rate in test_tc_edt BPF program
  selftests/bpf: remove test_tc_edt.sh
  selftests/bpf: integrate test_tc_edt into test_progs
  selftests/bpf: rename test_tc_edt.bpf.c section to expose program type
  selftests/bpf: Add success stats to rqspinlock stress test
  rqspinlock: Precede non-head waiter queueing with AA check
  rqspinlock: Disable spinning for trylock fallback
  rqspinlock: Use trylock fallback when per-CPU rqnode is busy
  rqspinlock: Perform AA checks immediately
  rqspinlock: Enclose lock/unlock within lock entry acquisitions
  bpf: Remove runqslower tool
  selftests/bpf: Remove usage of lsm/file_alloc_security in selftest
  bpf: Disable file_alloc_security hook
  bpf: check for insn arrays in check_ptr_alignment
  bpf: force BPF_F_RDONLY_PROG on insn array creation
  bpf: Fix exclusive map memory leak
  selftests/bpf: Make CS length configurable for rqspinlock stress test
  selftests/bpf: Add lock wait time stats to rqspinlock stress test
  ...
2025-12-03 16:54:54 -08:00
Jakub Kicinski
4de4454299 Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Merge in late fixes in preparation for the net-next PR.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-02 15:37:53 -08:00
Pavel Begunkov
9954464d73 net: page_pool: sanitise allocation order
We're going to give more control over rx buffer sizes to user space, and
since we can't always rely on driver validation, let's sanitise it in
page_pool_init() as well. Note that we only need to reject over
MAX_PAGE_ORDER allocations for normal page pools, as current memory
providers don't need to use the buddy allocator and must check the order
on init.

Suggested-by: Stanislav Fomichev <stfomichev@gmail.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://patch.msgid.link/77ad83c1aec66cbd00e7b3952f74bc3b7a988150.1764542851.git.asml.silence@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-02 11:08:39 -08:00
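
A minimal sketch of the init-time check described in the commit above, assuming an
illustrative helper name and a boolean standing in for "a memory provider is attached";
the real struct page_pool layout may differ:

    #include <linux/mm.h>                 /* MAX_PAGE_ORDER */
    #include <net/page_pool/types.h>      /* struct page_pool_params */

    /* Reject orders the buddy allocator cannot satisfy for normal page
     * pools; memory providers validate their own order at init time.
     */
    static int pp_order_valid(const struct page_pool_params *params,
                              bool has_memory_provider)
    {
            if (!has_memory_provider && params->order > MAX_PAGE_ORDER)
                    return -EINVAL;
            return 0;
    }
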
Pavel Begunkov
854858848b net: page pool: xa init with destroy on pp init
The free_ptr_ring label path initialises ->dma_mapped xarray but doesn't
destroy it in case of an error. That's not a real problem since init
itself doesn't do anything requiring destruction, but still match it
with xa_destroy() to silence warnings.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://patch.msgid.link/02904c6d83dbe5cc1c671106a5c97bd93ab31006.1764542851.git.asml.silence@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-02 11:08:39 -08:00
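
A small sketch of the init/teardown pairing described in the commit above, with
illustrative struct and label names rather than the actual page_pool code:

    #include <linux/ptr_ring.h>
    #include <linux/xarray.h>

    struct demo_pool {
            struct xarray dma_mapped;
            struct ptr_ring ring;
    };

    static int demo_pool_init(struct demo_pool *pool, int ring_size)
    {
            xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1);

            if (ptr_ring_init(&pool->ring, ring_size, GFP_KERNEL) < 0)
                    goto err_xa;
            return 0;

    err_xa:
            /* an empty xarray has nothing to free, but pairing this call
             * with xa_init_flags() keeps the unwind path obviously correct
             */
            xa_destroy(&pool->dma_mapped);
            return -ENOMEM;
    }
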
Andreas Gruenbacher
8a157e0a0a gfs2: Fix use of bio_chain
In gfs2_chain_bio(), the call to bio_chain() has its arguments swapped.
The result is leaked bios and incorrect synchronization (only the last
bio will actually be waited for).  This code is only used during mount
and filesystem thaw, so the bug normally won't be noticeable.

Reported-by: Stephen Zhang <starzhangzsd@gmail.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-12-02 16:44:54 +00:00
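
For reference, a generic sketch of the intended bio_chain() argument order (the same
pattern the block layer's chain-and-submit helpers use); this is illustrative and not
the actual gfs2 change:

    #include <linux/bio.h>

    /* Hand off 'prev' and keep building on 'new': the bio being
     * submitted goes first, and the bio that must stay pending until it
     * completes (the parent) goes second.
     */
    static struct bio *demo_chain_and_submit(struct bio *prev, struct bio *new)
    {
            if (prev) {
                    bio_chain(prev, new);   /* 'new' waits for 'prev' */
                    submit_bio(prev);
            }
            return new;
    }
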
Paolo Abeni
3c4159b301 Merge branch 'net-mlx5e-disable-egress-xdp-redirect-in-default'
Tariq Toukan says:

====================
net/mlx5e: Disable egress xdp-redirect in default

This small series disables the egress xdp-redirect feature by default.
It can still be enabled by loading a dummy XDP program.

Patches were previously submitted as part of [1].

This reduces the default number of SQs in each channel from 4 to 3, and
saves resources in device and host memory.

This also improves the latency of channel configuration operations, like
interface up (create channels), interface down (destroy channels), and
channels reconfiguration (create new set, destroy old one).

Perf numbers:
NIC: Connect-X7.
Setup: 248 channels, default mtu and rx/tx ring sizes.

Interface up + down:
Before: 2.246 secs
After:  1.798 secs (-0.448 sec)

Saves ~1.8 msec per channel.

[1]
https://lore.kernel.org/all/1762939749-1165658-1-git-send-email-tariqt@nvidia.com/
====================

Link: https://patch.msgid.link/1764497617-1326331-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-12-02 15:20:42 +01:00
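
For context, the "dummy XDP program" mentioned above is typically just a pass-through
program. A minimal sketch (not from the mlx5 tree) that could be loaded to bring the
redirect-target SQ back:

    // SPDX-License-Identifier: GPL-2.0
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("xdp")
    int xdp_pass_all(struct xdp_md *ctx)
    {
            return XDP_PASS;        /* accept every packet unchanged */
    }

    char _license[] SEC("license") = "GPL";

Once compiled with clang for the bpf target, it can be attached with something like
"ip link set dev <ifname> xdp obj xdp_pass.o sec xdp".
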
Tariq Toukan
d4aa0cc9bd net/mlx5e: Support XDP target xmit with dummy program
Save per-channel resources by default, in device and host memory.

As no better API exists, make the XDP-redirect-target SQ available by
loading a dummy XDP program.

This improves the latency of interface up/down operations when the
feature is disabled.

Perf numbers:
NIC: Connect-X7.
Setup: 248 channels, default mtu and rx/tx ring sizes.

Interface up + down:
Before: 2.246 secs
After:  1.798 secs (-0.448 sec)

Saves ~1.8 msec per channel.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: William Tu <witu@nvidia.com>
Link: https://patch.msgid.link/1764497617-1326331-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-12-02 15:20:41 +01:00
Tariq Toukan
96a8395061 net/mlx5e: Update XDP features in switch channels
The XDP features state might depend on the state of other features, like
HW-LRO / HW-GRO.

In general, move the re-evaluation announcement of the XDP features
(xdp_set_features_flag_locked) into the flow where configuration gets
changed. There's no point in updating them elsewhere.

This is a more appropriate place, as this modifies the announced
features while channels are inactive, which avoids the small interval
between channel activation and the proper setting of the XDP features.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: William Tu <witu@nvidia.com>
Link: https://patch.msgid.link/1764497617-1326331-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-12-02 15:20:41 +01:00
Xiang Mei
108f9405ce selftests/tc-testing: Test CAKE scheduler when enqueue drops packets
Add tests that trigger packet drops in cake_enqueue(): "CAKE with QFQ
Parent - CAKE enqueue with packets dropping". It forces CAKE_enqueue to
return NET_XMIT_CN after dropping the packets when it has a QFQ parent.

Signed-off-by: Xiang Mei <xmei5@asu.edu>
Reviewed-by: Toke Høiland-Jørgensen <toke@toke.dk>
Link: https://patch.msgid.link/20251128001415.377823-3-xmei5@asu.edu
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-12-02 13:28:00 +01:00
Xiang Mei
9fefc78f7f net/sched: sch_cake: Fix incorrect qlen reduction in cake_drop
In cake_drop(), qdisc_tree_reduce_backlog() is used to update the qlen
and backlog of the qdisc hierarchy. Its caller, cake_enqueue(), assumes
that the parent qdisc will enqueue the current packet. However, this
assumption breaks when cake_enqueue() returns NET_XMIT_CN: the parent
qdisc stops enqueuing the current packet, leaving the tree qlen/backlog
accounting inconsistent. This mismatch can lead to a NULL dereference
(e.g., when the parent Qdisc is qfq_qdisc).

This patch computes the qlen/backlog delta in a more robust way by
observing the difference before and after the series of cake_drop()
calls, and then compensates the qdisc tree accounting if cake_enqueue()
returns NET_XMIT_CN.

To ensure correct compensation when ACK thinning is enabled, a new
variable is introduced to keep qlen unchanged.

Fixes: 15de71d06a ("net/sched: Make cake_enqueue return NET_XMIT_CN when past buffer_limit")
Signed-off-by: Xiang Mei <xmei5@asu.edu>
Reviewed-by: Toke Høiland-Jørgensen <toke@toke.dk>
Link: https://patch.msgid.link/20251128001415.377823-1-xmei5@asu.edu
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-12-02 13:28:00 +01:00
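
A hypothetical sketch of the before/after accounting idea described in the commit above
(not the actual patch; the helper names and the NET_XMIT_CN decision are illustrative):

    #include <net/sch_generic.h>

    struct demo_snapshot {
            unsigned int qlen;
            unsigned int backlog;
    };

    static void demo_take_snapshot(const struct Qdisc *sch, struct demo_snapshot *s)
    {
            s->qlen = sch->q.qlen;
            s->backlog = sch->qstats.backlog;
    }

    /* After the drop loop, charge the ancestors only for what was
     * actually removed; when the enqueue result will be NET_XMIT_CN the
     * current packet is excluded, since the parent never accounted for it.
     */
    static void demo_compensate(struct Qdisc *sch,
                                const struct demo_snapshot *s,
                                const struct sk_buff *skb, bool returning_cn)
    {
            int dropped = s->qlen - sch->q.qlen;
            int dropped_len = s->backlog - sch->qstats.backlog;

            if (returning_cn) {
                    dropped--;
                    dropped_len -= qdisc_pkt_len(skb);
            }
            qdisc_tree_reduce_backlog(sch, dropped, dropped_len);
    }
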
Jakub Kicinski
31a3ed492d Merge tag 'wireguard-6.19-rc1-for-jakub' of https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/wireguard-linux
Jason A. Donenfeld says:

====================
WireGuard updates for Linux 6.19-rc1.

Please find here Asbjørn's ynl series. This has been sitting in my
testing for the last week or so, since he sent out the latest series.
I've dropped the yml sample code, as he found an issue in it at the last
minute, but otherwise, we've sat on this code for long enough, so
let's see how it goes.

* tag 'wireguard-6.19-rc1-for-jakub' of https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/wireguard-linux:
  wireguard: netlink: generate netlink code
  wireguard: uapi: generate header with ynl-gen
  wireguard: uapi: move flag enums
  wireguard: uapi: move enum wg_cmd
  wireguard: netlink: add YNL specification
  wireguard: netlink: lower .maxattr for WG_CMD_GET_DEVICE
  wireguard: netlink: convert to split ops
  wireguard: netlink: use WG_KEY_LEN in policies
  wireguard: netlink: validate nested arrays in policy
  wireguard: netlink: enable strict genetlink validation
====================

Link: https://patch.msgid.link/
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 20:33:07 -08:00
Asbjørn Sloth Tønnesen
3fd2f3d2f4 wireguard: netlink: generate netlink code
This patch adopts netlink policies and command definitions
generated by ynl-gen, thus completing the conversion to YNL.

Given that the old and new policies are functionally identical
and have just been moved to a new file, it serves to verify
that the policies generated from the spec are identical to the
previous policy code.

The following functions are renamed:
  wg_get_device_dump() -> wg_get_device_dumpit()
  wg_set_device()      -> wg_set_device_doit()

The new files are covered by the existing drivers/net/wireguard/
pattern in MAINTAINERS.

No behavioural changes intended.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2025-12-02 04:12:49 +01:00
Asbjørn Sloth Tønnesen
88cedad45b wireguard: uapi: generate header with ynl-gen
Use ynl-gen to generate the UAPI header for WireGuard.

The cosmetic changes in this patch confirm that the spec is aligned
with the implementation. Using the generated version ensures that they
stay in sync.

Changes in the generated header:
* Trivial header guard rename.
* Trivial white space changes.
* Trivial comment changes.
* Precompute bitflags in ynl-gen (see [1]).
* Drop __*_F_ALL constants (see [1]).

[1] https://lore.kernel.org/r/20251014123201.6ecfd146@kernel.org/

No behavioural changes intended.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2025-12-02 04:12:49 +01:00
Asbjørn Sloth Tønnesen
8d974872ab wireguard: uapi: move flag enums
Move the wg*_flag enums, so they are defined above the attribute set
enums, where ynl-gen would place them.

This is an incremental step towards adopting a UAPI header generated
by ynl-gen. This is split out to keep the patches readable.

This is a trivial patch with no behavioural changes intended.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2025-12-02 04:12:49 +01:00
Asbjørn Sloth Tønnesen
b5c5a82bf5 wireguard: uapi: move enum wg_cmd
This patch moves enum wg_cmd to the end of the file, where ynl-gen
would generate it.

This is an incremental step towards adopting a UAPI header generated
by ynl-gen. This is split out to keep the patches readable.

This is a trivial patch with no behavioural changes intended.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2025-12-02 04:12:49 +01:00
Asbjørn Sloth Tønnesen
6b0f4ca079 wireguard: netlink: add YNL specification
This patch adds a near[1]-complete YNL specification for WireGuard,
documenting the protocol in a machine-readable format rather than as
comments in wireguard.h, and eases usage from C and non-C programming
languages alike.

The generated C library will be featured in a later patch, so in
this patch I will use the in-kernel python client for examples.

This makes the documentation in the UAPI header redundant; it is
therefore removed. The in-line documentation in the spec is based
on the existing comment in wireguard.h, and once released it will
be available in the kernel documentation at:
  https://docs.kernel.org/netlink/specs/wireguard.html
  (until then run: make htmldocs)

Generate wireguard.rst from this spec:
$ make -C tools/net/ynl/generated/ wireguard.rst

Query wireguard interface through pyynl:
$ sudo ./tools/net/ynl/pyynl/cli.py --family wireguard \
                                    --dump get-device \
                                    --json '{"ifindex":3}'
[{'fwmark': 0,
  'ifindex': 3,
  'ifname': 'wg-test',
  'listen-port': 54318,
  'peers': [{0: {'allowedips': [{0: {'cidr-mask': 0,
                                     'family': 2,
                                     'ipaddr': '0.0.0.0'}},
                                {0: {'cidr-mask': 0,
                                     'family': 10,
                                     'ipaddr': '::'}}],
                 'endpoint': b'[...]',
                 'last-handshake-time': {'nsec': 42, 'sec': 42},
                 'persistent-keepalive-interval': 42,
                 'preshared-key': '[...]',
                 'protocol-version': 1,
                 'public-key': '[...]',
                 'rx-bytes': 42,
                 'tx-bytes': 42}}],
  'private-key': '[...]',
  'public-key': '[...]'}]

Add another allowed IP prefix:
$ sudo ./tools/net/ynl/pyynl/cli.py --family wireguard \
  --do set-device --json '{"ifindex":3,"peers":[
    {"public-key":"6a df b1 83 a4 ..","allowedips":[
      {"cidr-mask":0,"family":10,"ipaddr":"::"}]}]}'

[1] As can be seen above, the "endpoint" is only dumped as binary data,
    as it can't be fully described in YNL. It's either a struct
    sockaddr_in or struct sockaddr_in6 depending on the attribute length.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2025-12-02 04:12:19 +01:00
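
A small illustration of the length-based discrimination that footnote [1] above refers
to (a userspace-style sketch, not code from the kernel or any existing tool):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <string.h>

    /* WGPEER_A_ENDPOINT carries either a struct sockaddr_in or a
     * struct sockaddr_in6; the attribute length is the only way to tell.
     */
    static int parse_endpoint(const void *data, size_t len,
                              struct sockaddr_in *v4, struct sockaddr_in6 *v6)
    {
            if (len == sizeof(*v4)) {
                    memcpy(v4, data, len);  /* AF_INET endpoint */
                    return AF_INET;
            }
            if (len == sizeof(*v6)) {
                    memcpy(v6, data, len);  /* AF_INET6 endpoint */
                    return AF_INET6;
            }
            return -1;                      /* unexpected length */
    }
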
Jakub Kicinski
cbc19b3229 Merge branch 'selftests-drv-net-fix-issues-in-devlink_rate_tc_bw-py'
Carolina Jubran says:

====================
selftests: drv-net: Fix issues in devlink_rate_tc_bw.py

This series fixes issues in the devlink_rate_tc_bw.py selftest and
introduces a new Iperf3Runner that helps with measurement handling.
====================

Link: https://patch.msgid.link/20251130091938.4109055-1-cjubran@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 17:18:43 -08:00
Carolina Jubran
5cc1bddcfe selftests: drv-net: Fix tolerance calculation in devlink_rate_tc_bw.py
Currently, tolerance is computed against the TC’s expected percentage,
making TC3 (20%) validation overly strict and TC4 (80%) overly loose.

Update BandwidthValidator to take a dict of shares and compute bounds
relative to the overall total, so that all shares are validated
consistently.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Link: https://patch.msgid.link/20251130091938.4109055-7-cjubran@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 17:18:41 -08:00
Carolina Jubran
9ecd05a2c8 selftests: drv-net: Fix and clarify TC bandwidth split in devlink_rate_tc_bw.py
Correct the documented bandwidth distribution between TC3 and TC4
from 80/20 to 20/80. Update test descriptions and printed messages
to consistently reflect the intended split.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Link: https://patch.msgid.link/20251130091938.4109055-6-cjubran@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 17:18:41 -08:00
Carolina Jubran
3796e549e3 selftests: drv-net: Set shell=True for sysfs writes in devlink_rate_tc_bw.py
Commit 7c32f7a2d3db ("selftests: net: py: don't default to shell=True")
changed the cmd() helper to avoid spawning a shell unless explicitly
requested.

The devlink_rate_tc_bw test enables SR-IOV by writing to the
sriov_numvfs sysfs attribute using redirection. Without shell=True the
redirection is not interpreted and the VF device never appears,
causing the test to fail.

Fix by explicitly passing shell=True in the two places that update
sriov_numvfs.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Link: https://patch.msgid.link/20251130091938.4109055-5-cjubran@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 17:18:41 -08:00
Carolina Jubran
cb1acbd30a selftests: drv-net: Use Iperf3Runner in devlink_rate_tc_bw.py
Replace the inline iperf3 subprocess and JSON parsing with
Iperf3Runner.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Link: https://patch.msgid.link/20251130091938.4109055-4-cjubran@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 17:18:41 -08:00
Carolina Jubran
2a60ce94c6 selftests: drv-net: introduce Iperf3Runner for measurement use cases
GenerateTraffic was added to spin up long-running iperf3 load, mainly
to drive high PPS background traffic. It was never meant to provide
stable throughput numbers, and trying to repurpose it for measurement
does not make sense.

Introduce Iperf3Runner to allow tests to split out server/client
configuration, control start/stop, and collect JSON output for
analysis. This makes it possible to measure bandwidth directly when
validating egress shaping.

GenerateTraffic stays as the background load generator, reusing the
common iperf3 helpers under the hood.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Link: https://patch.msgid.link/20251130091938.4109055-3-cjubran@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 17:18:41 -08:00
Carolina Jubran
a8658f7bb6 selftests: drv-net: Add devlink_rate_tc_bw.py to TEST_PROGS
This adds devlink_rate_tc_bw.py to TEST_PROGS in the Makefile in the
same directory.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Link: https://patch.msgid.link/20251130091938.4109055-2-cjubran@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 17:18:40 -08:00
Jakub Kicinski
4a18b6cd7c Merge tag 'for-net-next-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next
Luiz Augusto von Dentz says:

====================
bluetooth-next pull request for net-next:

core:

 - HCI: Add initial support for PAST
 - hci_core: Introduce HCI_CONN_FLAG_PAST
 - ISO: Add support to bind to trigger PAST
 - HCI: Always use the identity address when initializing a connection
 - ISO: Attempt to resolve broadcast address
 - MGMT: Allow use of Set Device Flags without Add Device
 - ISO: Fix not updating BIS sender source address
 - HCI: Add support for LL Extended Feature Set

 driver:

 - btusb: Add new VID/PID 2b89/6275 for RTL8761BUV
 - btusb: MT7920: Add VID/PID 0489/e135
 - btusb: MT7922: Add VID/PID 0489/e170
 - btusb: Add new VID/PID 13d3/3533 for RTL8821CE
 - btusb: Add new VID/PID 0x0489/0xE12F for RTL8852BE-VT
 - btusb: Add new VID/PID 0x13d3/0x3618 for RTL8852BE-VT
 - btusb: Add new VID/PID 0x13d3/0x3619 for RTL8852BE-VT
 - btusb: Reclassify Qualcomm WCN6855 debug packets
 - btintel_pcie: Introduce HCI Driver protocol
 - btintel_pcie: Support for S4 (Hibernate)
 - btintel_pcie: Suspend/Resume: Controller doorbell interrupt handling
 - dt-bindings: net: Convert Marvell 8897/8997 bindings to DT schema
 - btbcm: Use kmalloc_array() to prevent overflow
 - btrtl: Add the support for RTL8761CUV
 - hci_h5: avoid sending two SYNC messages
 - hci_h5: implement CRC data integrity

MAINTAINERS:

 - Add Bartosz Golaszewski as Qualcomm hci_qca maintainer

* tag 'for-net-next-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next: (29 commits)
  Bluetooth: btusb: Add new VID/PID 13d3/3533 for RTL8821CE
  Bluetooth: HCI: Add support for LL Extended Feature Set
  drivers/bluetooth: btbcm: Use kmalloc_array() to prevent overflow
  Bluetooth: btintel_pcie: Introduce HCI Driver protocol
  Bluetooth: btusb: add new custom firmwares
  Bluetooth: btusb: Add new VID/PID 0x13d3/0x3619 for RTL8852BE-VT
  Bluetooth: btusb: Add new VID/PID 0x13d3/0x3618 for RTL8852BE-VT
  Bluetooth: btusb: Add new VID/PID 0x0489/0xE12F for RTL8852BE-VT
  Bluetooth: iso: fix socket matching ambiguity between BIS and CIS
  Bluetooth: MAINTAINERS: Add Bartosz Golaszewski as Qualcomm hci_qca maintainer
  Bluetooth: btrtl: Add the support for RTL8761CUV
  Bluetooth: Remove redundant pm_runtime_mark_last_busy() calls
  dt-bindings: net: Convert Marvell 8897/8997 bindings to DT schema
  Bluetooth: btusb: Reclassify Qualcomm WCN6855 debug packets
  Bluetooth: btusb: Add new VID/PID 2b89/6275 for RTL8761BUV
  Bluetooth: btintel_pcie: Suspend/Resume: Controller doorbell interrupt handling
  Bluetooth: btintel_pcie: Support for S4 (Hibernate)
  Bluetooth: btusb: MT7922: Add VID/PID 0489/e170
  Bluetooth: btusb: MT7920: Add VID/PID 0489/e135
  Bluetooth: ISO: Fix not updating BIS sender source address
  ...
====================

Link: https://patch.msgid.link/20251201213818.97249-1-luiz.dentz@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 17:10:52 -08:00
Florian Fuchs
d8e08149a5 net: ps3_gelic_net: Use napi_alloc_skb() and napi_gro_receive()
Use the napi functions napi_alloc_skb() and napi_gro_receive() instead
of netdev_alloc_skb() and netif_receive_skb() for more efficient packet
receiving. The switch to napi aware functions increases the RX
throughput, reduces the occurrence of retransmissions and improves the
resilience against SKB allocation failures.
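
A minimal sketch of the substitution in the RX poll path (the "card"
variable and the surrounding fragment are illustrative, not the literal
driver code):

  /* allocate from the per-NAPI cache instead of netdev_alloc_skb() */
  skb = napi_alloc_skb(&card->napi, bufsize);
  if (!skb)
          return -ENOMEM;         /* caller requeues the descriptor */

  /* ... fill the skb from the RX descriptor ... */

  /* hand the frame to GRO instead of netif_receive_skb() */
  napi_gro_receive(&card->napi, skb);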

Signed-off-by: Florian Fuchs <fuchsfl@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251130194155.1950980-1-fuchsfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:54:53 -08:00
Jakub Kicinski
3101f3e116 Merge branch 'dsa-simple-hsr-offload'
Vladimir Oltean says:

====================
DSA simple HSR offload

Provide a "simple" form of HSR offload for 8 DSA drivers (just the
NETIF_F_HW_HSR_DUP feature) based on the fact that their taggers use the
dsa_xmit_port_mask() function. This is in patches 6-13/15.

The helpers per se are introduced in patch 5/15, and documented in patch
15/15. Patch 14/15 is another small (and related) documentation update.

For HSR interlink ports the offloading rules are not quite so clear, and
for now we completely reject the offload. We can revise that once we see
a full offload implementation and understand what is needed.

To reject the offload, we need to know the port type, and patch 2/15
helps with that.

xrs700x is another driver which should have rejected offload based on
port type (patch 4/15). This is a bug fix submitted through net-next due
to the extra API required to fix it. If necessary, it could also be
picked up separately for backporting.

There is also patch 3/15, which makes the HSR offload like the others
supported by DSA: if we fall back to the software implementation, don't
call port_hsr_leave(), because by definition there won't be anything to
do.

A slightly unrelated change is patch 1/15, but I noticed this along the
way, and if I were to submit it separately, it would conflict with this
work (it would appear in patch 12/15's context).

Most of the driver additions are trivial. By far the most complex was
ocelot (which I could test). Microchip ksz (which I cannot test, and did
not patch) would also have some complexity. Essentially, ksz_hsr_join()
could fall back to a partial offload through the simple helpers, if the
full offload is not possible. But keeping track of which offload kind
was used is necessary later in ksz_hsr_leave(). This is left as homework
for interested developers.

With this patch set, one can observe a 50% reduction in transmitted
traffic over HSR interfaces.
====================

Link: https://patch.msgid.link/20251130131657.65080-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:51:56 -08:00
Vladimir Oltean
4e4c00f34d Documentation: net: dsa: mention simple HSR offload helpers
Keep the documentation up to date.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251130131657.65080-16-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:51:55 -08:00
Vladimir Oltean
977839161f Documentation: net: dsa: mention availability of RedBox
Since commit 5055cccfc2 ("net: hsr: Provide RedBox support
(HSR-SAN)"), RedBox is available (including for offload in DSA).

Update the DSA documentation that states it isn't.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251130131657.65080-15-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:51:49 -08:00
Vladimir Oltean
7271d4a08c net: dsa: a5psw: use simple HSR offload helpers
The "a5psw" tagging protocol uses dsa_xmit_port_mask(), which means
we can offload HSR packet duplication on transmit. Enable that feature.

Cc: "Clément Léger" <clement.leger@bootlin.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251130131657.65080-14-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:08 -08:00
Vladimir Oltean
585943b7ad net: dsa: mt7530: use simple HSR offload helpers
The "mtk" tagging protocol uses dsa_xmit_port_mask(), which means we can
offload HSR packet duplication on transmit. Enable that feature.

Cc: Daniel Golle <daniel@makrotopia.org>
Cc: DENG Qingfang <dqfext@gmail.com>
Cc: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Chester A. Unal <chester.a.unal@arinc9.com>
Link: https://patch.msgid.link/20251130131657.65080-13-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:08 -08:00
Vladimir Oltean
017bcff732 net: dsa: hellcreek: use simple HSR offload helpers
The "hellcreek" tagging protocol uses dsa_xmit_port_mask(), which means
we can offload HSR packet duplication on transmit. Enable that feature.

Cc: Kurt Kanzenbach <kurt@linutronix.de>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251130131657.65080-12-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:08 -08:00
Vladimir Oltean
4af9fa2ba6 net: dsa: mv88e6060: use simple HSR offload helpers
The "trailer" tagging protocol uses dsa_xmit_port_mask(), which means we
can offload HSR packet duplication on transmit. Enable that feature.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251130131657.65080-11-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:08 -08:00
Vladimir Oltean
b6ad21ef28 net: dsa: lantiq_gswip: use simple HSR offload helpers
Both the "gswip" and "gsw1xx" protocols use dsa_xmit_port_mask(), so
they are compatible with accelerating TX packet duplication for HSR
rings.

Enable that feature by setting the port_hsr_join() and port_hsr_leave()
operations to the simple helpers provided by DSA.

Cc: Hauke Mehrtens <hauke@hauke-m.de>
Cc: Daniel Golle <daniel@makrotopia.org>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251130131657.65080-10-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:07 -08:00
Vladimir Oltean
6db31942e3 net: dsa: realtek: use simple HSR offload helpers
All known Realtek protocols: "rtl4a", "rtl8_4" and "rtl8_4t" use
dsa_xmit_port_mask(), so they are compatible with accelerating TX packet
duplication for HSR rings.

Enable that feature by setting the port_hsr_join() and port_hsr_leave()
operations to the simple helpers provided by DSA.

Cc: "Alvin Šipraga" <alsi@bang-olufsen.dk>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Linus Walleij <linusw@kernel.org>
Link: https://patch.msgid.link/20251130131657.65080-9-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:07 -08:00
Vladimir Oltean
4b65d44555 net: dsa: ocelot: use simple HSR offload helpers
Accelerate TX packet duplication with HSR rings.

This is only possible with the NPI-based "ocelot" tagging protocol, not
with "ocelot-8021q", because the latter does not use dsa_xmit_port_mask().

This has 2 implications:
- Depending on tagging protocol, we should set (or not set) the offload
  feature flags. Switching tagging protocols is done with ports down, by
  design. Additional calls to dsa_port_simple_hsr_join() can be put in
  the ds->ops->change_tag_protocol() path, as I had originally tried,
  but this would not work: dsa_user_setup_tagger() would later clear
  the feature flag that we just set. So the additional call to
  dsa_port_simple_hsr_join() should sit in the ds->ops->port_enable()
  call.

- When joining a HSR ring and we are currently using "ocelot-8021q",
  there are cases when we should return -EOPNOTSUPP (pessimistic) and
  cases when we shouldn't (optimistic). In the pessimistic case, it is a
  configuration that the port won't support even with the right tagging
  protocol. Distinguishing between these 2 cases matters because if we
  just return -EOPNOTSUPP regardless, we lose the dp->hsr_dev pointer
  and can no longer replay the offload later for the optimistic case,
  from felix_port_enable().

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251130131657.65080-8-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:07 -08:00
Vladimir Oltean
42e63b1373 net: dsa: yt921x: use simple HSR offloading helpers
Accelerate TX packet duplication with HSR rings.

Cc: David Yang <mmyangfl@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251130131657.65080-7-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:07 -08:00
Vladimir Oltean
0e75bfe340 net: dsa: add simple HSR offload helpers
It turns out that HSR offloads are so fine-grained that many DSA
switches can do a small part even though they weren't specifically
designed for the protocols supported by the hsr driver (HSR and PRP).

Specifically NETIF_F_HW_HSR_DUP: it is simple packet duplication on
transmit, towards all (i.e. both) member ports of the HSR device.

For many DSA switches, we know how to duplicate a packet, even though we
never typically use that feature. The transmit port mask from the
tagging protocol can have multiple bits set, and the switch should send
the packet once to every port with a bit set from that mask.

Nonetheless, not all tagging protocols are like this, and sometimes the
port is a single numeric value rather than a bit mask. For that reason,
and also because switches can sometimes switch between tagging
protocols, we need to make the HSR offload helpers opt-in.

For devices that can do nothing else HSR-specific, we introduce
dsa_port_simple_hsr_join() and dsa_port_simple_hsr_leave(). These
functions monitor when two user ports of the same switch are part of the
same HSR device, and when that condition is true, they toggle the
NETIF_F_HW_HSR_DUP feature flag of both net devices.

Normally only dsa_port_simple_hsr_join() and dsa_port_simple_hsr_leave()
are needed. The dsa_port_simple_hsr_validate() helper is just to see
what kind of configuration could be offloadable using the generic
helpers. This is used by switch drivers which are not currently using
the right tagging protocol to offload this HSR ring, but could in
principle offload it after changing the tagger.
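
As a rough sketch (helper and op names as described above; the exact
signatures are in the patch itself), a compatible driver opts in by
pointing its HSR ops at the generic helpers:

  static const struct dsa_switch_ops foo_switch_ops = {
          /* ... existing ops ... */
          .port_hsr_join   = dsa_port_simple_hsr_join,
          .port_hsr_leave  = dsa_port_simple_hsr_leave,
  };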

Suggested-by: David Yang <mmyangfl@gmail.com>
Cc: "Alvin Šipraga" <alsi@bang-olufsen.dk>
Cc: "Chester A. Unal" <chester.a.unal@arinc9.com>
Cc: "Clément Léger" <clement.leger@bootlin.com>
Cc: Daniel Golle <daniel@makrotopia.org>
Cc: DENG Qingfang <dqfext@gmail.com>
Cc: Florian Fainelli <florian.fainelli@broadcom.com>
Cc: George McCollister <george.mccollister@gmail.com>
Cc: Hauke Mehrtens <hauke@hauke-m.de>
Cc: Jonas Gorski <jonas.gorski@gmail.com>
Cc: Kurt Kanzenbach <kurt@linutronix.de>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Sean Wang <sean.wang@mediatek.com>
Cc: UNGLinuxDriver@microchip.com
Cc: Woojung Huh <woojung.huh@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251130131657.65080-6-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:07 -08:00
Vladimir Oltean
30296ac764 net: dsa: xrs700x: reject unsupported HSR configurations
As discussed here:
https://lore.kernel.org/netdev/20240620090210.drop6jwh7e5qw556@skbuf/

the fact is that the xrs700x.c driver only supports offloading
HSR_PT_SLAVE_A and HSR_PT_SLAVE_B (which were the only port types at the
time the offload was written, _for this driver_).

Up until now, the API did not explicitly tell offloading drivers what
port has what role. So xrs700x can get confused and think that it can
support a configuration which it actually can't. There was a table in
the attached link which gave an example:

$ ip link add name hsr0 type hsr slave1 swp0 slave2 swp1 \
	interlink swp2 supervision 45 version 1

        HSR_PT_SLAVE_A    HSR_PT_SLAVE_B      HSR_PT_INTERLINK
 ----------------------------------------------------------------
 user
 space        0                 1                   2
 requests
 ----------------------------------------------------------------
 XRS700X
 driver       1                 2                   -
 understands

The switch would act as if the ring ports were swp1 and swp2.

Now that we have an explicit hsr_get_port_type() API, let's use it to
work around the unintended semantic changes of the offloading API
brought by the introduction of interlink ports in HSR.

Fixes: 5055cccfc2 ("net: hsr: Provide RedBox support (HSR-SAN)")
Cc: Lukasz Majewski <lukma@denx.de>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: George McCollister <george.mccollister@gmail.com>
Link: https://patch.msgid.link/20251130131657.65080-5-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:07 -08:00
Vladimir Oltean
bed59a86e9 net: dsa: avoid calling ds->ops->port_hsr_leave() when unoffloaded
This mirrors what we do in dsa_port_lag_leave() and
dsa_port_bridge_leave(): when ds->ops->port_hsr_join() returns
-EOPNOTSUPP, we fall back to a software implementation where dp->hsr_dev
is NULL, and the unoffloaded port is no longer bothered with calls from
the HSR layer.

This helps, for example, with interlink ports which current DSA drivers
don't know how to offload. We have to check only in port_hsr_join() for
the port type, then in port_hsr_leave() we are sure we're dealing only
with known port types.
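
A hedged sketch of what the core-side check described above amounts to
(not the literal patch):

  void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr)
  {
          /* join returned -EOPNOTSUPP and we fell back to software HSR,
           * so there is nothing for the driver to undo
           */
          if (!dp->hsr_dev)
                  return;

          dp->hsr_dev = NULL;
          dp->ds->ops->port_hsr_leave(dp->ds, dp->index, hsr);
  }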

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251130131657.65080-4-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:06 -08:00
Xiaoliang Yang
a0244e7621 net: hsr: create an API to get hsr port type
Since the introduction of HSR_PT_INTERLINK in commit 5055cccfc2 ("net:
hsr: Provide RedBox support (HSR-SAN)"), we see that different port
types require different settings for hardware offload, which was not the
case before when we only had HSR_PT_SLAVE_A and HSR_PT_SLAVE_B. But
there is currently no way to know which port is which type, so create
the hsr_get_port_type() API function and export it.

When hsr_get_port_type() is called from the device driver, the port
must be found in the HSR port list. An important use case is for this
function to work from offloading drivers' NETDEV_CHANGEUPPER handler,
which is triggered by hsr_portdev_setup() -> netdev_master_upper_dev_link().
Therefore, we need to move the addition of the hsr_port to the HSR port
list prior to calling hsr_portdev_setup(). This makes the error
restoration path also more similar to hsr_del_port(), where
kfree_rcu(port) is already used.
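
A hedged sketch of how an offloading driver might consume the new API
from its join path (the exact signature, the "dp->user" field and the
"foo" names are assumptions based on this description):

  static int foo_port_hsr_join(struct dsa_switch *ds, int port,
                               struct net_device *hsr,
                               struct netlink_ext_ack *extack)
  {
          struct dsa_port *dp = dsa_to_port(ds, port);
          enum hsr_port_type pt = hsr_get_port_type(hsr, dp->user);

          /* this hardware only understands the two ring ports */
          if (pt != HSR_PT_SLAVE_A && pt != HSR_PT_SLAVE_B)
                  return -EOPNOTSUPP;

          /* ... program TX duplication / forwarding for "port" ... */
          return 0;
  }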

Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Lukasz Majewski <lukma@denx.de>
Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Łukasz Majewski <lukma@nabladev.com>
Link: https://patch.msgid.link/20251130131657.65080-3-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:06 -08:00
Vladimir Oltean
3b87e60d21 net: dsa: mt7530: unexport mt7530_switch_ops
Commit cb675afcdd ("net: dsa: mt7530: introduce separate MDIO driver")
exported mt7530_switch_ops for use from mt7530-mmio.c. Later in the
patch set, mt7530-mmio.c used mt7530_probe_common() to access the
mt7530_switch_ops still from mt7530.c - see commit 110c18bfed ("net:
dsa: mt7530: introduce driver for MT7988 built-in switch").

This proves that exporting mt7530_switch_ops was unnecessary, so
unexport it again.

Cc: DENG Qingfang <dqfext@gmail.com>
Cc: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Daniel Golle <daniel@makrotopia.org>
Acked-by: Daniel Golle <daniel@makrotopia.org>
Acked-by: Chester A. Unal <chester.a.unal@arinc9.com>
Link: https://patch.msgid.link/20251130131657.65080-2-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 16:45:06 -08:00
Jakub Kicinski
b4dcaeea5e Merge branch 'net-dsa-yt921x-add-stp-mst-support'
David Yang says:

====================
net: dsa: yt921x: Add STP/MST support

Support for these features was deferred from the initial submission of the
driver.

v3: https://lore.kernel.org/20251126093240.2853294-1-mmyangfl@gmail.com
v2: https://lore.kernel.org/20251025170606.1937327-1-mmyangfl@gmail.com
v1: https://lore.kernel.org/20251024033237.1336249-1-mmyangfl@gmail.com
====================

Link: https://patch.msgid.link/20251201094232.3155105-1-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 15:10:15 -08:00
David Yang
633b1d010c net: dsa: yt921x: Add STP/MST support
Support for STP/MST was deferred from the initial submission of the
driver.

Signed-off-by: David Yang <mmyangfl@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251201094232.3155105-3-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 15:10:13 -08:00
David Yang
d973ac83ad net: dsa: yt921x: Use *_ULL bitfield macros for VLAN_CTRL
VLAN_CTRL should be treated as a 64-bit register. GENMASK and BIT
macros use unsigned long as the underlying type, which will result in a
build error on architectures where sizeof(long) == 4.

Replace them with unsigned long long variants.
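
Illustrative example of the kind of change (the field names are made up,
not the actual yt921x definitions):

  /* before: breaks on 32-bit, since GENMASK()/BIT() are unsigned long */
  #define YT921X_VLAN_CTRL_EXAMPLE_FID    GENMASK(47, 36)
  #define YT921X_VLAN_CTRL_EXAMPLE_EN     BIT(40)

  /* after: 64-bit safe regardless of sizeof(long) */
  #define YT921X_VLAN_CTRL_EXAMPLE_FID    GENMASK_ULL(47, 36)
  #define YT921X_VLAN_CTRL_EXAMPLE_EN     BIT_ULL(40)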

Signed-off-by: David Yang <mmyangfl@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251201094232.3155105-2-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 15:10:13 -08:00
Jakub Kicinski
a2027019e9 Merge branch 'add-sqi-and-sqi-support-for-oatc14-10base-t1s-phys-and-microchip-t1s-driver'
Parthiban Veerasooran says:

====================
Add SQI and SQI+ support for OATC14 10Base-T1S PHYs and Microchip T1S driver

This patch series adds Signal Quality Indicator (SQI) and enhanced SQI+
support for OATC14 10Base-T1S PHYs, along with integration into the
Microchip T1S PHY driver. This enables ethtool to report the SQI value for
OATC14 10Base-T1S PHYs.
====================

Link: https://patch.msgid.link/20251201032346.6699-1-parthiban.veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 15:08:26 -08:00
Parthiban Veerasooran
16416c8352 net: phy: microchip_t1s: add SQI support for LAN867x Rev.D0 PHYs
Add support for Signal Quality Index (SQI) reporting in the
Microchip T1S PHY driver for LAN867x Rev.D0 (OATC14-compliant) PHYs.

This patch registers the following callbacks in the microchip_t1s driver
structure:

- .get_sqi      - returns the current SQI value
- .get_sqi_max  - returns the maximum SQI value

This enables ethtool to report the SQI value for LAN867x Rev.D0 PHYs.

Signed-off-by: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251201032346.6699-3-parthiban.veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 15:08:24 -08:00
Parthiban Veerasooran
5e1bf5ae5e net: phy: phy-c45: add SQI and SQI+ support for OATC14 10Base-T1S PHYs
Add support for reading Signal Quality Indicator (SQI) and enhanced SQI+
from OATC14 10Base-T1S PHYs.

- Introduce MDIO register definitions for DCQ_SQI and DCQ_SQIPLUS.
- Add `genphy_c45_oatc14_get_sqi_max()` to return the maximum supported
  SQI/SQI+ level.
- Add `genphy_c45_oatc14_get_sqi()` to return the current SQI or SQI+
  value.
- Update `include/linux/phy.h` to expose the new APIs.

SQI+ capability is read from the Advanced Diagnostic Features Capability
register (ADFCAP). If SQI+ is supported, the driver calculates the value
from the MSBs of the DCQ_SQIPLUS register; otherwise, it falls back to
basic SQI (0-7 levels). This enables ethtool to report the SQI value for
OATC14 10Base-T1S PHYs.

Open Alliance TC14 10BASE-T1S Advanced Diagnostic PHY Features
Specification ref:
https://opensig.org/wp-content/uploads/2025/06/OPEN_Alliance_10BASE-T1S_Advanced_PHY_features_for-automotive_Ethernet_V2.1b.pdf

Signed-off-by: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251201032346.6699-2-parthiban.veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 15:08:24 -08:00
Jakub Kicinski
8d537e333e Merge branch 'net-mlx5e-enhance-dcbnl-get-set-maxrate-code'
Tariq Toukan says:

====================
net/mlx5e: Enhance DCBNL get/set maxrate code

This series by Gal introduces multiple small code quality improvements
for the DCBNL operations mlx5e_dcbnl_ieee_[gs]etmaxrate().

No functional change.
====================

Link: https://patch.msgid.link/1764498334-1327918-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:56:31 -08:00
Gal Pressman
87a5112bfc net/mlx5e: Use standard unit definitions for bandwidth conversion
MLX5E_100MB and MLX5E_1GB defines are confusing: MLX5E_100MB is not
equal to 100 * MEGA, and MLX5E_1GB is not equal to GIGA, as they hide
the Kbps rate conversion required for ieee_maxrate.

Replace hardcoded bandwidth conversion values with standard unit
definitions from linux/units.h. Rename MLX5E_100MB/MLX5E_1GB to
MLX5E_100MB_TO_KB/MLX5E_1GB_TO_KB to clarify these are conversion
factors to Kbps, not absolute bandwidth values.
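
Illustrative sketch of the naming idea, assuming ieee_maxrate values are
expressed in Kbps (the exact definitions are in the driver, the macro
names here are examples only):

  #include <linux/units.h>

  /* conversion factors to Kbps, not absolute bandwidth values */
  #define EXAMPLE_100MB_TO_KB     (100 * MEGA / KILO)   /* 100 Mbps in Kbps */
  #define EXAMPLE_1GB_TO_KB       (GIGA / KILO)         /* 1 Gbps in Kbps */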

Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1764498334-1327918-5-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:56:28 -08:00
Gal Pressman
53f7a77128 net/mlx5e: Use U8_MAX instead of hard coded magic number
Replace hard coded 255 magic number with U8_MAX (the register field is 8
bits).

Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1764498334-1327918-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:56:27 -08:00
Gal Pressman
e1098bb02f net/mlx5e: Rename upper_limit_mbps to upper_limit_100mbps
Clarify that the limit represents the threshold for using 100 Mbps
units rather than a general Mbps limit.

Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1764498334-1327918-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:56:27 -08:00
Gal Pressman
e1de33c377 net/mlx5e: Use u64 instead of __u64 in ieee_setmaxrate
Change upper_limit_mbps/gbps from __u64 to u64 to follow kernel coding
conventions.

Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1764498334-1327918-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:56:27 -08:00
Jakub Kicinski
596c696301 Revert "r8169: add DASH support for RTL8127AP"
This reverts commit 17e9f841dd.

Nathan reports error messages appearing in dmesg since the commit
under Fixes:

  [    3.844125] r8169 0000:01:00.0 (unnamed net_device) (uninitialized): rtl_eriar_cond == 0 (loop: 100, delay: 100).
  [    3.864844] r8169 0000:01:00.0 eth0: rtl_eriar_cond == 1 (loop: 100, delay: 100).
  [    3.878825] r8169 0000:01:00.0 eth0: rtl_eriar_cond == 1 (loop: 100, delay: 100).
  [    3.892632] r8169 0000:01:00.0 eth0: rtl_eriar_cond == 1 (loop: 100, delay: 100).
  [    5.002551] r8169 0000:01:00.0 eth0: rtl_eriar_cond == 1 (loop: 100, delay: 100).
  [    5.016286] r8169 0000:01:00.0 eth0: rtl_eriar_cond == 1 (loop: 100, delay: 100).
  [    5.030027] r8169 0000:01:00.0 eth0: rtl_eriar_cond == 1 (loop: 100, delay: 100).

Let's drop the bad change and revisit in the next release cycle.

Reported-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/20251201224238.GA604467@ax162
Fixes: 17e9f841dd ("r8169: add DASH support for RTL8127AP")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:53:55 -08:00
Jakub Kicinski
8d92057c4a Merge branch 'net-dsa-b53-fix-arl-accesses-for-bcm5325-65-and-allow-vid-0'
Jonas Gorski says:

====================
net: dsa: b53: fix ARL accesses for BCM5325/65 and allow VID 0

ARL entries on BCM5325 and BCM5365 were broken significantly in two
ways:

- Entries for the CPU port were using the wrong port id, pointing to a
  non-existent port.
- Setting the VLAN ID for entries was not done, adding them all to VLAN
  0 instead.

While the former technically broke any communication to the CPU port,
with the latter the entries were added to the currently unused VID 0,
so they never became effective. Presumably the default PVID was set to
1 because VID 0 was broken due to these issues (and the root cause was
not found).

So fix writing and reading entries on BCM5325/65 by first fixing the CPU
port entries, then fixing the setting of the VLAN ID for entries.

Finally, re-allow VID 0 for BCM5325/65 to allow the whole 1-15 VLAN ID
range to be available to users, and align VLAN handling with all other
switch chips.
====================

Link: https://patch.msgid.link/20251128080625.27181-1-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:46:41 -08:00
Jonas Gorski
0b2b270586 net: dsa: b53: allow VID 0 for BCM5325/65
Now that writing ARL entries works properly, we can actually use VID 0
as the default untagged VLAN for BCM5325 and BCM5365 as well.

So use 0 as default PVID for all chips and do not reject VLAN 0 anymore,
which we ignored since commit 45e9d59d39 ("net: dsa: b53: do not allow
to configure VLAN 0") anyway.

Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251128080625.27181-8-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:46:39 -08:00
Jonas Gorski
d39514e6a2 net: dsa: b53: fix BCM5325/65 ARL entry VIDs
BCM5325/65's ARL entry registers do not contain the VID, only the search
result register does. ARL entries have a separate VID entry register for
the index into the VLAN table.

So make ARL entry accessors use the VID entry registers instead, and
move the VLAN ID field definition to the search register definition.

Fixes: c45655386e ("net: dsa: b53: add support for FDB operations on 5325/5365")
Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251128080625.27181-7-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:46:39 -08:00
Jonas Gorski
3b08863469 net: dsa: b53: fix BCM5325/65 ARL entry multicast port masks
We currently use the mask 0xf for writing and reading b53_entry::port,
but this is only correct for unicast ARL entries. Multicast ARL entries
use a bitmask, and 0xf is not enough space for ports > 3, which includes
the CPU port.

So extend the mask accordingly to also fit port 4 (bit 4) and MII (bit
5). According to the datasheet the multicast port mask is [60:48],
making it 12 bit wide, but bits 60-55 are reserved anyway, and collide
with the priority field at [60:59], so I am not sure if this is valid.
Therefore leave it at the actual used range, [53:48].

The ARL search result register differs a bit, and there the mask is only
[52:48], so only spanning the user ports. The MII port bit is
contained in the Search Result Extension register. So create a separate
search result parse function that properly handles this.

Fixes: c45655386e ("net: dsa: b53: add support for FDB operations on 5325/5365")
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Link: https://patch.msgid.link/20251128080625.27181-6-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:46:39 -08:00
Jonas Gorski
85132103f7 net: dsa: b53: fix CPU port unicast ARL entries for BCM5325/65
On BCM5325 and BCM5365, unicast ARL entries use 8 as the value for the
CPU port, so we need to translate it to/from 5 as used for the CPU port
at most other places.

Fixes: c45655386e ("net: dsa: b53: add support for FDB operations on 5325/5365")
Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251128080625.27181-5-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:46:39 -08:00
Jonas Gorski
8e46aacea4 net: dsa: b53: use same ARL search result offset for BCM5325/65
BCM5365's search result is at the same offset as BCM5325's search
result, and they (mostly) share the same format, so switch BCM5365 to
BCM5325's arl ops.

Fixes: c45655386e ("net: dsa: b53: add support for FDB operations on 5325/5365")
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Tested-by: Álvaro Fernández Rojas <noltari@gmail.com>
Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Link: https://patch.msgid.link/20251128080625.27181-4-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:46:39 -08:00
Jonas Gorski
9316012dd0 net: dsa: b53: fix extracting VID from entry for BCM5325/65
BCM5325/65's Entry register uses the highest three bits for
VALID/STATIC/AGE, so shifting by 53 alone will also pull these bits
into b53_arl_entry::vid.

So make sure to also mask the vid value, to avoid invalid VIDs.
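
A hedged sketch of the kind of fix described (shift and mask values are
illustrative, not the exact b53 register layout):

  /* keep only the VID bits; the top three bits are VALID/STATIC/AGE */
  ent->vid = (fwd_entry >> 53) & 0xff;    /* 0xff: up to 256 VLANs on 5325/65 */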

Fixes: c45655386e ("net: dsa: b53: add support for FDB operations on 5325/5365")
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Tested-by: Álvaro Fernández Rojas <noltari@gmail.com>
Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Link: https://patch.msgid.link/20251128080625.27181-3-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:46:39 -08:00
Jonas Gorski
6f268e275c net: dsa: b53: fix VLAN_ID_IDX write size for BCM5325/65
Since BCM5325 and BCM5365 only support up to 256 VLANs, the VLAN_ID_IDX
register is only 8 bit wide, not 16 bit, so use an appropriate accessor.

Fixes: c45655386e ("net: dsa: b53: add support for FDB operations on 5325/5365")
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Tested-by: Álvaro Fernández Rojas <noltari@gmail.com>
Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Link: https://patch.msgid.link/20251128080625.27181-2-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:46:38 -08:00
Jakub Kicinski
4e3583cb61 Merge branch 'amd-xgbe-schedule-napi-on-rbu-event'
Raju Rangoju says:

====================
amd-xgbe: schedule NAPI on RBU event

During Rx overload the Rx buffers may not be refilled. Trying to
schedule NAPI when an Rx Buffer Unavailable event is signaled may help
recover from such a situation, in case we missed an IRQ.
====================

Link: https://patch.msgid.link/20251129175016.3034185-1-Raju.Rangoju@amd.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:30:09 -08:00
Raju Rangoju
ab96af7004 amd-xgbe: schedule NAPI on Rx Buffer Unavailable (RBU)
Under heavy load, Rx Buffer Unavailable (RBU) can occur if Rx processing
is slower than the network. When an RBU is signaled, try to schedule
NAPI to help recover from such a situation (including cases where an IRQ
may have been missed).
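
Roughly, the idea looks like this (a sketch, not the literal patch; the
status macros follow the driver's existing DMA_CH_SR naming, "napi" is
the relevant per-channel or device NAPI context):

  if (XGMAC_GET_BITS(dma_ch_isr, DMA_CH_SR, RBU)) {
          /* Rx buffers ran dry; kick NAPI so the poll loop can refill
           * descriptors even if the regular Rx interrupt was missed
           */
          napi_schedule(napi);
  }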

Signed-off-by: Raju Rangoju <Raju.Rangoju@amd.com>
Link: https://patch.msgid.link/20251129175016.3034185-3-Raju.Rangoju@amd.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:30:06 -08:00
Raju Rangoju
c3b744fd20 amd-xgbe: refactor the dma IRQ handling code path
Refactor the DMA interrupt bottom-half handling to improve readability
and maintainability, without changing the intended behavior.

Signed-off-by: Raju Rangoju <Raju.Rangoju@amd.com>
Link: https://patch.msgid.link/20251129175016.3034185-2-Raju.Rangoju@amd.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:30:06 -08:00
Nikola Z. Ivanov
aee0f01b4f team: Add matching error label for failed action
Follow the "action" - "err_action" pairing of labels
found across the source code of team net device.

Currently in team_port_add the err_set_slave_promisc
label is reused for exiting on error when setting
allmulti level of the new slave.
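
Schematically, the convention being followed (labels and calls are
abbreviated; see team_port_add() for the real sequence and conditions):

          err = dev_set_promiscuity(port_dev, 1);
          if (err)
                  goto err_set_slave_promisc;

          err = dev_set_allmulti(port_dev, 1);
          if (err)
                  goto err_set_slave_allmulti;    /* new, matching label */

          /* ... */

  err_set_slave_allmulti:
          dev_set_promiscuity(port_dev, -1);      /* undo the promisc step */
  err_set_slave_promisc:
          /* ... earlier unwind steps ... */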

Signed-off-by: Nikola Z. Ivanov <zlatistiv@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251128072544.223645-1-zlatistiv@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 14:20:17 -08:00
Long Li
9bf66036d6 net: mana: Handle hardware recovery events when probing the device
When MANA is being probed, it's possible that the hardware is in
recovery mode and the device may get GDMA_EQE_HWC_RESET_REQUEST over HWC
in the middle of the probe. Detect this condition and go through the
recovery service procedure.

Signed-off-by: Long Li <longli@microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1764193552-9712-1-git-send-email-longli@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 13:53:53 -08:00
Jeremy Kerr
6ab578739a net: mctp: test: move TX packetqueue from dst to dev
To capture TX packets during a test, we are currently intercepting
dst->output with an implementation that adds the transmitted packet to
an skb queue attached to the test-specific mock dst. The netdev itself
is not involved in the test TX path.

Instead, we can just use our test device to stash TXed packets for later
inspection by the test. This means we can include the actual
mctp_dst_output() implementation in the test (by setting dst.output in
the test case), and don't need to be creating fake dst objects, or their
corresponding skb queues.

We need to ensure that the netdev is up to allow delivery to
ndo_start_xmit, but the tests assume active devices at present anyway.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Link: https://patch.msgid.link/20251126-dev-mctp-test-tx-queue-v2-1-4e5bbd1d6c57@codeconstruct.com.au
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 13:52:13 -08:00
Vladimir Oltean
56435627d9 net: pcs: lynx: accept in-band autoneg for 2500base-x
Testing in two circumstances:

1. back to back optical SFP+ connection between two LS1028A-QDS ports
   with the SCH-26908 riser card
2. T1042 with on-board AQR115 PHY using "OCSGMII", as per
   https://lore.kernel.org/lkml/aIuEvaSCIQdJWcZx@FUE-ALEWI-WINX/

strongly suggests that enabling in-band auto-negotiation is actually
possible when the lane baud rate is 3.125 Gbps.

It was previously thought that this would not be the case, because it
was only tested on 2500base-x links with on-board Aquantia PHYs, where
it was noticed that MII_LPA is always reported as zero, and it was
thought that this is because of the PCS.

Test case #1 above shows it is not, and the configured MII_ADVERTISE on
system A ends up in the MII_LPA on system B, when in 2500base-x mode
(IF_MODE=0).

Test case #2, which uses "SGMII" auto-negotiation (IF_MODE=3) for the
3.125 Gbps lane, is actually a misconfiguration, but it is what led to
the discovery.

There is actually an old bug in the Lynx PCS driver - it expects all
register values to contain their default out-of-reset values, as if the
PCS were initialized by the Reset Configuration Word (RCW) settings.
There are 2 cases in which this is problematic:
- if the bootloader (or previous kexec-enabled Linux) wrote a different
  IF_MODE value
- if dynamically changing the SerDes protocol from 1000base-x to
  2500base-x, e.g. by replacing the optical SFP module.

Specifically in test case #2, an accidental alignment between the
bootloader configuring the PCS to expect SGMII in-band code words, and
the AQR115 PHY actually transmitting SGMII in-band code words when
operating in the "OCSGMII" system interface protocol, led to the PCS
transmitting replicated symbols at 3.125 Gbps baud rate. This could only
have happened if the PCS saw and reacted to the SGMII code words in the
first place.

Since test #2 is invalid from a protocol perspective (there seems to be
no standard way of negotiating the data rate of 2500 Mbps with SGMII,
and the lower data rates should remain 10/100/1000), in-band auto-negotiation
for 2500base-x effectively means Clause 37 (i.e. IF_MODE=0).

Make 2500base-x be treated like 1000base-x in this regard, by removing
all prior limitations and calling lynx_pcs_config_giga().

This adds a new feature: LINK_INBAND_ENABLE and at the same time fixes
the Lynx PCS's long standing problem that the registers (specifically
IF_MODE, but others could be misconfigured as well) are not written by
the driver to the known valid values for 2500base-x.

Co-developed-by: Alexander Wilhelm <alexander.wilhelm@westermo.com>
Signed-off-by: Alexander Wilhelm <alexander.wilhelm@westermo.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251125103507.749654-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 13:50:15 -08:00
Gongwei Li
525459da4b Bluetooth: btusb: Add new VID/PID 13d3/3533 for RTL8821CE
Add VID 13d3 & PID 3533 for Realtek RTL8821CE USB Bluetooth chip.

The information in /sys/kernel/debug/usb/devices about the Bluetooth
device is listed below.

T:  Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#=  2 Spd=12   MxCh= 0
D:  Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=13d3 ProdID=3533 Rev= 1.10
S:  Manufacturer=Realtek
S:  Product=Bluetooth Radio
S:  SerialNumber=00e04c000001
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Gongwei Li <ligongwei@kylinos.cn>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:21:16 -05:00
Luiz Augusto von Dentz
a106e50be7 Bluetooth: HCI: Add support for LL Extended Feature Set
This adds support for emulating the LL Extended Feature Set introduced
in Bluetooth 6.0, which adds the following:

Commands:

 - HCI_LE_Read_All_Local_Supported_Features(0x2087)(Feature:47,1)
 - HCI_LE_Read_All_Remote_Features(0x2088)(Feature:47,2)

Events:

 - HCI_LE_Read_All_Remote_Features_Complete(0x2b)(Mask bit:42)

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:21:16 -05:00
Ayaan Mirza Baig
6f7cf13ef6 drivers/bluetooth: btbcm: Use kmalloc_array() to prevent overflow
Replace the open-coded multiplication in kmalloc() with a call
to kmalloc_array() to prevent potential integer overflows.

This is a mechanical change, replacing BCM_FW_NAME_LEN with
the type-safe sizeof(*fw_name) as the element size

Signed-off-by: Ayaan Mirza Baig <ayaanmirzabaig85@gmail.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:21:16 -05:00
Chethan T N
bc6f557b33 Bluetooth: btintel_pcie: Introduce HCI Driver protocol
This patch adds the infrastructure that allows user space programs to
talk to the Intel PCIe driver directly for fetching basic driver details.

The changes introduced are based on
commit 04425292a6 ("Bluetooth: Introduce HCI Driver protocol").

Signed-off-by: Chethan T N <chethan.tumkur.narayan@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:04:51 -05:00
Shuai Zhang
a8b38d1985 Bluetooth: btusb: add new custom firmwares
The new platform uses the QCA2066 chip along with a new board ID, which
requires a dedicated firmware file to ensure proper initialization.
Without this entry, the driver cannot locate and load the correct
firmware, resulting in Bluetooth bring-up failure.

This patch adds a new entry to the firmware table for QCA2066 so that
the driver can correctly identify the board ID and load the appropriate
firmware from 'qca/QCA2066/' in the linux-firmware repository.

Signed-off-by: Shuai Zhang <quic_shuaz@quicinc.com>
Acked-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:08 -05:00
Max Chou
1216462f4e Bluetooth: btusb: Add new VID/PID 0x13d3/0x3619 for RTL8852BE-VT
Add support for ID (0x13d3, 0x3619) to the usb_device_id table for
Realtek RTL8852BE-VT.

The device info from /sys/kernel/debug/usb/devices as below.

    T:  Bus=04 Lev=02 Prnt=02 Port=05 Cnt=01 Dev#= 86 Spd=12   MxCh= 0
    D:  Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
    P:  Vendor=13d3 ProdID=3619 Rev= 0.00
    S:  Manufacturer=Realtek
    S:  Product=Bluetooth Radio
    S:  SerialNumber=00e04c000001
    C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA
    I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
    E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
    E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
    I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
    I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
    I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
    I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
    I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
    I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Max Chou <max.chou@realtek.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:08 -05:00
Max Chou
f460768bba Bluetooth: btusb: Add new VID/PID 0x13d3/0x3618 for RTL8852BE-VT
Add support for ID (0x13d3, 0x3618) to the usb_device_id table for
Realtek RTL8852BE-VT.

The device info from /sys/kernel/debug/usb/devices as below.

    T:  Bus=04 Lev=02 Prnt=02 Port=05 Cnt=01 Dev#= 86 Spd=12   MxCh= 0
    D:  Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
    P:  Vendor=13d3 ProdID=3618 Rev= 0.00
    S:  Manufacturer=Realtek
    S:  Product=Bluetooth Radio
    S:  SerialNumber=00e04c000001
    C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA
    I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
    E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
    E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
    I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
    I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
    I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
    I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
    I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
    I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
    E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
    E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Max Chou <max.chou@realtek.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:08 -05:00
Max Chou
32caa197b9 Bluetooth: btusb: Add new VID/PID 0x0489/0xE12F for RTL8852BE-VT
Add support for ID (0x0489, 0xE12F) to the usb_device_id table for
Realtek RTL8852BE-VT.

The device info from /sys/kernel/debug/usb/devices as below.

T:  Bus=04 Lev=02 Prnt=02 Port=05 Cnt=01 Dev#= 86 Spd=12   MxCh= 0
D:  Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=0489 ProdID=e12f Rev= 0.00
S:  Manufacturer=Realtek
S:  Product=Bluetooth Radio
S:  SerialNumber=00e04c000001
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Max Chou <max.chou@realtek.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:08 -05:00
Yang Li
56f765ce73 Bluetooth: iso: fix socket matching ambiguity between BIS and CIS
When both BIS and CIS links exist, their sockets are in
the BT_LISTEN state.
dump sock:
  sk 000000001977ef51 state 6
  src 10:a5:62:31:05:cf dst 00:00:00:00:00:00
  sk 0000000031d28700 state 7
  src 10:a5:62:31:05:cf dst 00:00:00:00:00:00
  sk 00000000613af00e state 4   # listen sock of bis
  src 10:a5:62:31:05:cf dst 54:00:00:d4:99:30
  sk 000000001710468c state 9
  src 10:a5:62:31:05:cf dst 54:00:00:d4:99:30
  sk 000000005d97dfde state 4   # listen sock of cis
  src 10:a5:62:31:05:cf dst 00:00:00:00:00:00

To locate the CIS socket correctly, check both the BT_LISTEN
state and whether dst addr is BDADDR_ANY.

Link: https://github.com/bluez/bluez/issues/1224
Signed-off-by: Yang Li <yang.li@amlogic.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:07 -05:00
Krzysztof Kozlowski
027473ef6a Bluetooth: MAINTAINERS: Add Bartosz Golaszewski as Qualcomm hci_qca maintainer
There are no dedicated maintainers of Qualcomm hci_qca Bluetooth
drivers, but there should be, because these are actively used on many
old and new platforms.  Bartosz Golaszewski agreed to take care of this
code.

Link: https://lore.kernel.org/r/CAMRc=MdqAATOcDPhd=u0vOb8nLxSRd7N8rLGLO8F5Ywq3+=JCw@mail.gmail.com/
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Acked-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:07 -05:00
Max Chou
4a23ce935f Bluetooth: btrtl: Add the support for RTL8761CUV
Add support for RTL8761CUV BT controller on the USB interface.
Do not apply IC_MATCH_FL_HCIVER when hci_ver is 0 in the ic_id_table.

The device info from /sys/kernel/debug/usb/devices as below.

T:  Bus=02 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 13 Spd=12   MxCh= 0
D:  Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=0bda ProdID=c761 Rev= 2.00
S:  Manufacturer=Realtek
S:  Product=Bluetooth Controller
S:  SerialNumber=10000
C:* #Ifs= 2 Cfg#= 1 Atr=a0 MxPwr=100mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  64 Ivl=1ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
I:  If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  63 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  63 Ivl=1ms

Signed-off-by: Max Chou <max.chou@realtek.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:07 -05:00
Sakari Ailus
b8414ba5a0 Bluetooth: Remove redundant pm_runtime_mark_last_busy() calls
pm_runtime_put_autosuspend(), pm_runtime_put_sync_autosuspend(),
pm_runtime_autosuspend() and pm_request_autosuspend() now include a call
to pm_runtime_mark_last_busy(). Remove the now-redundant explicit call to
pm_runtime_mark_last_busy().
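
For example, in a driver's idle path the pattern collapses from two
calls to one:

  /* before */
  pm_runtime_mark_last_busy(dev);
  pm_runtime_put_autosuspend(dev);

  /* after: the put itself marks the device as last busy */
  pm_runtime_put_autosuspend(dev);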

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:07 -05:00
Ariel D'Alessandro
73d2d709cc dt-bindings: net: Convert Marvell 8897/8997 bindings to DT schema
Convert the existing text-based DT bindings for Marvell 8897/8997
(sd8897/sd8997) Bluetooth device controllers to a DT schema.

While here, bindings for "usb1286,204e" (USB interface) are dropped from
the DT schema definition as these are currently documented in file [0].

[0] Documentation/devicetree/bindings/net/btusb.txt

Signed-off-by: Ariel D'Alessandro <ariel.dalessandro@collabora.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:07 -05:00
Pascal Giard
0b00bee940 Bluetooth: btusb: Reclassify Qualcomm WCN6855 debug packets
Some Qualcomm Bluetooth controllers, e.g., QCNFA765 with WCN6855
chip, send debug packets as ACL frames with header 0x2EDC.
The kernel misinterprets these as malformed ACL packets, causing
repeated errors:

  Bluetooth: hci0: ACL packet for unknown connection handle 3804

This can occur hundreds of times per minute, greatly cluttering logs.
On my computer, I am observing approximately 7 messages per second
when streaming audio to a speaker.

For Qualcomm controllers communicating over UART, hci_qca.c already
filters out these debug packets. This patch covers controllers connected
over USB rather than UART.

This patch uses the classify_pkt_type callback to reclassify the
packets with handle 0x2EDC as HCI_DIAG_PKT before they reach the
HCI layer. This change is only applied to Qualcomm devices marked
as BTUSB_QCA_WCN6855.
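
A hedged sketch of the reclassification (the callback name is taken from
the description above; the exact hook signature and return type are in
the patch):

  static int btusb_qca_classify_pkt_type(struct hci_dev *hdev,
                                         struct sk_buff *skb)
  {
          /* ACL frames carrying the 0x2EDC debug handle are Qualcomm
           * diagnostic traffic, not real ACL data
           */
          if (hci_skb_pkt_type(skb) == HCI_ACLDATA_PKT &&
              skb->len >= HCI_ACL_HDR_SIZE &&
              le16_to_cpu(hci_acl_hdr(skb)->handle) == 0x2edc)
                  return HCI_DIAG_PKT;

          return hci_skb_pkt_type(skb);
  }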

Tested on: Thinkpad T14 gen2 (AMD) with QCNFA765 (0489:E0D0)
Signed-off-by: Pascal Giard <pascal.giard@etsmtl.ca>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:07 -05:00
Chingbin Li
8dbbb5423c Bluetooth: btusb: Add new VID/PID 2b89/6275 for RTL8761BUV
Add VID 2b89 & PID 6275 for Realtek RTL8761BUV USB Bluetooth chip.

The information in /sys/kernel/debug/usb/devices about the Bluetooth
device is listed below.

T:  Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#=  6 Spd=12   MxCh= 0
D:  Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=2b89 ProdID=6275 Rev= 2.00
S:  Manufacturer=Realtek
S:  Product=Bluetooth Radio
S:  SerialNumber=00E04C239987
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Chingbin Li <liqb365@163.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:07 -05:00
Ravindra
88c6216a52 Bluetooth: btintel_pcie: Suspend/Resume: Controller doorbell interrupt handling
Due to a hardware bug during suspend/resume, the controller may miss a
doorbell interrupt. To address this, a retry mechanism has been added to
inform the controller before reporting a failure.

Test case:
- run suspend and resume cycles.

Signed-off-by: Ravindra <ravindra@intel.com>
Signed-off-by: Kiran K <kiran.k@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:07 -05:00
Ravindra
1fb0d830da Bluetooth: btintel_pcie: Support for S4 (Hibernate)
During S4 (hibernate), the Bluetooth device loses power. Upon resume,
the driver performs the following actions:

1. Unregisters hdev
2. Calls function level reset
3. Registers hdev

Test case:
- run command sudo rtcwake -m disk -s 60

Signed-off-by: Ravindra <ravindra@intel.com>
Signed-off-by: Kiran K <kiran.k@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:07 -05:00
Chris Lu
5a6700a31c Bluetooth: btusb: MT7922: Add VID/PID 0489/e170
Add VID 0489 & PID e170 for MediaTek MT7922 USB Bluetooth chip.

The information in /sys/kernel/debug/usb/devices about the Bluetooth
device is listed below.

T:  Bus=06 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#=  2 Spd=480  MxCh= 0
D:  Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=0489 ProdID=e170 Rev= 1.00
S:  Manufacturer=MediaTek Inc.
S:  Product=Wireless_Device
S:  SerialNumber=000000000
C:* #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA
A:  FirstIf#= 0 IfCount= 3 Cls=e0(wlcon) Sub=01 Prot=01
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=125us
E:  Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
I:  If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  63 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  63 Ivl=1ms
I:* If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS=  64 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS=  64 Ivl=125us
I:  If#= 2 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS= 512 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS= 512 Ivl=125us

Signed-off-by: Chris Lu <chris.lu@mediatek.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:07 -05:00
Chris Lu
c126f98c01 Bluetooth: btusb: MT7920: Add VID/PID 0489/e135
Add VID 0489 & PID e135 for MediaTek MT7920 USB Bluetooth chip.

The information in /sys/kernel/debug/usb/devices about the Bluetooth
device is listed below.

T:  Bus=06 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#=  2 Spd=480  MxCh= 0
D:  Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=0489 ProdID=e135 Rev= 1.00
S:  Manufacturer=MediaTek Inc.
S:  Product=Wireless_Device
S:  SerialNumber=000000000
C:* #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA
A:  FirstIf#= 0 IfCount= 3 Cls=e0(wlcon) Sub=01 Prot=01
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=125us
E:  Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
I:  If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  63 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  63 Ivl=1ms
I:* If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS=  64 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS=  64 Ivl=125us
I:  If#= 2 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS=  64 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS=  64 Ivl=125us

Signed-off-by: Chris Lu <chris.lu@mediatek.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:06 -05:00
Luiz Augusto von Dentz
577cf4c0a1 Bluetooth: ISO: Fix not updating BIS sender source address
The source address for a BIS sender/Broadcast Source shall be updated
with the advertisement address since, when privacy is enabled, it may
use an RPA rather than an identity address.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:06 -05:00
Luiz Augusto von Dentz
a3b76bf4c4 Bluetooth: MGMT: Allow use of Set Device Flags without Add Device
In certain cases, setting device flags like HCI_CONN_FLAG_PAST should not
require doing Add Device first, since it may not need to add an
auto-connect policy. Instead, automatically create a hci_conn_params with
HCI_AUTO_CONN_DISABLED if one cannot be found.
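
The lookup-or-create step boils down to something like the sketch below,
using the existing hci_conn_params helpers; the wrapper name is
hypothetical and locking is elided:

  #include <net/bluetooth/hci_core.h>

  /* Called with hdev locked; returns existing params or a fresh entry
   * that carries no auto-connect policy. */
  static struct hci_conn_params *params_get_or_create(struct hci_dev *hdev,
                                                      bdaddr_t *addr,
                                                      u8 addr_type)
  {
          struct hci_conn_params *params;

          params = hci_conn_params_lookup(hdev, addr, addr_type);
          if (params)
                  return params;

          params = hci_conn_params_add(hdev, addr, addr_type);
          if (params)
                  params->auto_connect = HCI_AUTO_CONN_DISABLED;

          return params;
  }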

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:06 -05:00
Luiz Augusto von Dentz
f817db10dc Bluetooth: ISO: Attempt to resolve broadcast address
Broadcasters may be using RPAs, which can change over time and no longer
match the address used as destination in the socket. So attempt to resolve
the addresses and then match against the socket address, in case that uses
an identity address, or match the IRKs if both the broadcaster and the
socket are using RPAs.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:06 -05:00
Luiz Augusto von Dentz
14b06c3a88 Bluetooth: HCI: Always use the identity address when initializing a connection
This makes sure hci_conn is initialized with the identity address if
a matching IRK exists which avoids the trouble of having to do it at
multiple places which seems to be missing (e.g. CIS, BIS and PA).

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:06 -05:00
Luiz Augusto von Dentz
d3413703d5 Bluetooth: ISO: Add support to bind to trigger PAST
This makes it possible to bind to a different destination address after
being connected (BT_CONNECTED, BT_CONNECT2), which then triggers the PAST
Sender procedure to transfer the PA Sync to the destination address.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 16:00:04 -05:00
Luiz Augusto von Dentz
c530569adc Bluetooth: hci_core: Introduce HCI_CONN_FLAG_PAST
This introduces a new device flag so userspace can indicate if it
wants to enable PAST Receiver for a specific device.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 15:58:54 -05:00
Luiz Augusto von Dentz
33b2835f0b Bluetooth: HCI: Add initial support for PAST
This adds PAST related commands (HCI_OP_LE_PAST,
HCI_OP_LE_PAST_SET_INFO and HCI_OP_LE_PAST_PARAMS) and events
(HCI_EV_LE_PAST_RECEIVED), along with handling of the PAST sender and
receiver feature bits, including new MGMT settings
(HCI_EV_LE_PAST_RECEIVED and MGMT_SETTING_PAST_RECEIVER) which
userspace can use to determine whether PAST is supported by the
controller.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 15:58:54 -05:00
Javier Nieto
97fdb2ea06 Bluetooth: hci_h5: implement CRC data integrity
The UART-based H5 protocol supports CRC data integrity checks for
reliable packets. The host sets bit 5 in the configuration field of the
CONFIG link control message to indicate that CRC is supported. The
controller sets the same bit in the CONFIG RESPONSE message to indicate
that CRC may be used from then on.

Tested on a MangoPi MQ-Pro with a Realtek RTL8723DS Bluetooth controller
using the tip of the bluetooth-next tree.

Signed-off-by: Javier Nieto <jgnieto@cs.stanford.edu>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 15:58:54 -05:00
Javier Nieto
01622e9a53 Bluetooth: hci_h5: avoid sending two SYNC messages
Previously, h5_open() called h5_link_control() to send a SYNC message.
But h5_link_control() only enqueues the packet and requires the caller
to call hci_uart_tx_wakeup(). Thus, after H5_SYNC_TIMEOUT ran out
(100ms), h5_timed_event() would be called and, realizing that the state
was still H5_UNINITIALIZED, it would re-enqueue the SYNC and call
hci_uart_tx_wakeup(). Consequently, two SYNC packets would be sent and
initialization would unnecessarily wait for 100ms.

The naive solution of calling hci_uart_tx_wakeup() in h5_open() does not
work because it will only schedule tx work if the HCI_PROTO_READY bit is
set and hci_serdev only sets it after h5_open() returns. This patch
removes the extraneous SYNC being enqueued and makes h5_timed_event()
wake up on the next jiffy.

Signed-off-by: Javier Nieto <jgnieto@cs.stanford.edu>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 15:58:54 -05:00
Zhangchao Zhang
484f117689 Bluetooth: mediatek: add gpio pin to reset bt
Support resetting the platform Bluetooth module via a hardware pin.
When a Bluetooth exception occurs, attempt to reset the Bluetooth
module using the hardware reset pin, as this method is generally more
stable and reliable than a software reset. If the hardware reset pin
is not specified in the device tree, fall back to the existing
software reset mechanism to ensure backward compatibility.
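
A minimal sketch of the fallback logic using standard gpiolib calls; the
"reset" GPIO name, the pulse timing and the sw_reset hook are assumptions,
not the actual binding or driver code:

  #include <linux/delay.h>
  #include <linux/device.h>
  #include <linux/err.h>
  #include <linux/gpio/consumer.h>

  static int btmtk_reset_sketch(struct device *dev,
                                int (*sw_reset)(struct device *dev))
  {
          struct gpio_desc *reset_gpio;

          /* Optional pin: older device trees simply do not describe it */
          reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
          if (IS_ERR(reset_gpio))
                  return PTR_ERR(reset_gpio);

          if (reset_gpio) {
                  /* Hardware reset: pulse the pin */
                  gpiod_set_value_cansleep(reset_gpio, 1);
                  msleep(100);
                  gpiod_set_value_cansleep(reset_gpio, 0);
                  return 0;
          }

          /* No reset pin described: keep the software reset behaviour */
          return sw_reset(dev);
  }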

Co-developed: Sean Wang <Sean.Wang@mediatek.com>
Co-developed: Hao Qin <hao.qin@mediatek.com>
Co-developed: Chris Lu <chris.lu@mediatek.com>
Signed-off-by: Zhangchao Zhang <ot_zhangchao.zhang@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
2025-12-01 15:58:54 -05:00
Andre Carvalho
e3b8cbf40c selftests: netconsole: remove log noise due to socat exit
This removes some noise that can be distracting while looking at
selftests by redirecting socat stderr to /dev/null.

Before this commit, netcons_basic would output:

Running with target mode: basic (ipv6)
2025/11/29 12:08:03 socat[259] W exiting on signal 15
2025/11/29 12:08:03 socat[271] W exiting on signal 15
basic : ipv6 : Test passed
Running with target mode: basic (ipv4)
2025/11/29 12:08:05 socat[329] W exiting on signal 15
2025/11/29 12:08:05 socat[322] W exiting on signal 15
basic : ipv4 : Test passed
Running with target mode: extended (ipv6)
2025/11/29 12:08:08 socat[386] W exiting on signal 15
2025/11/29 12:08:08 socat[386] W exiting on signal 15
2025/11/29 12:08:08 socat[380] W exiting on signal 15
extended : ipv6 : Test passed
Running with target mode: extended (ipv4)
2025/11/29 12:08:10 socat[440] W exiting on signal 15
2025/11/29 12:08:10 socat[435] W exiting on signal 15
2025/11/29 12:08:10 socat[435] W exiting on signal 15
extended : ipv4 : Test passed

After these changes, output looks like:

Running with target mode: basic (ipv6)
basic : ipv6 : Test passed
Running with target mode: basic (ipv4)
basic : ipv4 : Test passed
Running with target mode: extended (ipv6)
extended : ipv6 : Test passed
Running with target mode: extended (ipv4)
extended : ipv4 : Test passed

Signed-off-by: Andre Carvalho <asantostc@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251129-netcons-socat-noise-v1-1-605a0cea8fca@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 12:06:55 -08:00
David Yang
ea2d3befcf net: dsa: yt921x: Set ageing_time_min/ageing_time_max
The ageing time is in 5 s steps, ranging from 1 to 0xffff steps, so
add the appropriate attributes.
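
In DSA terms this amounts to something like the sketch below in the switch
setup path; treat the helper as an illustration derived from the 5 s step,
not the exact driver code:

  #include <net/dsa.h>

  static void yt921x_set_ageing_limits(struct dsa_switch *ds)
  {
          ds->ageing_time_min = 5000;           /* 1 step = 5000 ms */
          ds->ageing_time_max = 5000 * 0xffff;  /* 0xffff steps */
  }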

Signed-off-by: David Yang <mmyangfl@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251129042137.3034032-1-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 12:06:15 -08:00
Alok Tiwari
09339d0d83 l2tp: correct debugfs label for tunnel tx stats
l2tp_dfs_seq_tunnel_show prints two groups of tunnel statistics. The
first group reports transmit counters, but the code labels it as rx.
Set the label to "tx" so the debugfs output reflects the actual meaning.

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251128085300.3377210-1-alok.a.tiwari@oracle.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 12:03:09 -08:00
Jakub Kicinski
aadff9f766 selftests: net: add a hint about MACAddressPolicy=persistent
A new NIPA installation has been reporting a few flaky tests;
arp_ndisc_evict_nocarrier is the flakiest of them all.
I suspect that the flakiness is due to udev swapping the MAC
addresses on the interfaces. Extend the message in
arp_ndisc_evict_nocarrier to hint at this potential issue.
Having the neigh get fail right after a ping is rather unusual,
unless udev changed the MAC address, causing a flush in the meantime.

Link: https://patch.msgid.link/20251127194556.2409574-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 12:02:13 -08:00
Jakub Kicinski
4b1639cac0 selftests: net: py: handle interrupt during cleanup
Following up on the old discussion [1], let the BaseExceptions out of
defer()'ed cleanup and handle them in the main loop. This allows us to
exit the tests if the user hits Ctrl-C during defer().

Link: https://lore.kernel.org/20251119063228.3adfd743@kernel.org # [1]
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20251128004846.2602687-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 12:01:29 -08:00
Yeounsu Moon
40d5ce4af2 net: dlink: fix several spelling mistakes in comments
This patch fixes multiple spelling mistakes in dl2k driver comments:

- "deivices" -> "devices"
- "Ttransmit" -> "Transmit"
- "catastronphic" -> "catastrophic"
- "Extened" -> "Extended"

Also fix incorrect unit description: `rx_timeout` uses 640ns increments,
not 64ns.
- "64ns" -> "640ns"

These are comment-only changes and do not affect runtime behavior.

Signed-off-by: Yeounsu Moon <yyyynoom@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251130220652.5425-2-yyyynoom@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 11:58:55 -08:00
Jakub Kicinski
cbca440dc3 Merge branch 'net-freescale-migrate-to-get_rx_ring_count-ethtool-callback'
Breno Leitao says:

====================
net: freescale: migrate to .get_rx_ring_count() ethtool callback

This series migrates Freescale network drivers to use the new .get_rx_ring_count()
ethtool callback introduced in commit 84eaf4359c ("net: ethtool: add
get_rx_ring_count callback to optimize RX ring queries").

The new callback simplifies the .get_rxnfc() implementation by removing
ETHTOOL_GRXRINGS handling and moving it to a dedicated callback. This provides
a cleaner separation of concerns and aligns these drivers with the modern
ethtool API.

The series updates the following Freescale drivers:
  - enetc
  - dpaa2
  - gianfar
====================
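
For a driver outside this series, the converted shape looks roughly like
the sketch below; the u32 return type of .get_rx_ring_count and the private
struct are assumptions for illustration:

  #include <linux/ethtool.h>
  #include <linux/netdevice.h>

  struct example_priv {
          u32 num_rx_rings;
  };

  /* Replaces the ETHTOOL_GRXRINGS case that used to live in .get_rxnfc */
  static u32 example_get_rx_ring_count(struct net_device *ndev)
  {
          struct example_priv *priv = netdev_priv(ndev);

          return priv->num_rx_rings;
  }

  static const struct ethtool_ops example_ethtool_ops = {
          .get_rx_ring_count = example_get_rx_ring_count,
          /* .get_rxnfc keeps serving only the classifier-rule commands */
  };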

Link: https://patch.msgid.link/20251128-gxring_freescale-v1-0-22a978abf29e@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 11:54:50 -08:00
Breno Leitao
ca8df5b877 net: enetc: convert to use .get_rx_ring_count
Convert the enetc driver to use the new .get_rx_ring_count
ethtool operation instead of implementing .get_rxnfc for handling
ETHTOOL_GRXRINGS command. This simplifies the code in two ways:

1. For enetc_get_rxnfc(): Remove the ETHTOOL_GRXRINGS case from the
   switch statement while keeping other cases for classifier rules.

2. For enetc4_get_rxnfc(): Remove it completely and use
   enetc_get_rxnfc() instead.

From now on, enetc_get_rx_ring_count() is the callback that returns the
number of RX rings for the enetc driver.

Also, remove the documentation around enetc4_get_rxnfc(), which did not
match what the function actually did.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251128-gxring_freescale-v1-3-22a978abf29e@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 11:54:45 -08:00
Breno Leitao
b2d6339269 net: dpaa2: convert to use .get_rx_ring_count
Convert the dpaa2 driver to use the new .get_rx_ring_count
ethtool operation instead of implementing .get_rxnfc for handling
ETHTOOL_GRXRINGS command. This simplifies the code by removing the
ETHTOOL_GRXRINGS case from the switch statement and replacing it with
a direct return of the queue count.

The driver still maintains .get_rxnfc for other commands including
ETHTOOL_GRXCLSRLCNT, ETHTOOL_GRXCLSRULE, and ETHTOOL_GRXCLSRLALL.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251128-gxring_freescale-v1-2-22a978abf29e@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 11:54:45 -08:00
Breno Leitao
d3fbfb8b2c net: gianfar: convert to use .get_rx_ring_count
Convert the gianfar driver to use the new .get_rx_ring_count
ethtool operation instead of implementing .get_rxnfc for handling
ETHTOOL_GRXRINGS command. This simplifies the code by removing the
ETHTOOL_GRXRINGS case from the switch statement and replacing it with
a direct return of the queue count.

The driver still maintains .get_rxnfc for other commands including
ETHTOOL_GRXCLSRLCNT, ETHTOOL_GRXCLSRULE, and ETHTOOL_GRXCLSRLALL.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251128-gxring_freescale-v1-1-22a978abf29e@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 11:54:45 -08:00
Colin Ian King
7adf0efb41 ynl: samples: Fix spelling mistake "failedq" -> "failed"
There is a spelling mistake in an error message. Fix it.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Link: https://patch.msgid.link/20251128173802.318520-1-colin.i.king@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-12-01 11:54:09 -08:00
Randy Dunlap
d211a28035 block/rnbd: correct all kernel-doc complaints
Fix all kernel-doc warnings in rnbd-proto.h:
- use correct enum name in kdoc comment
- mark several struct members as "/* private: */" so that no kdoc is
  required for them
- don't use "/**" for a non-kernel-doc comment
- use the correct struct member name for "dev_name"
- use " *" for a blank kernel-doc line

Fixes these warnings:
Warning: drivers/block/rnbd/rnbd-proto.h:41 expecting prototype for
 enum rnbd_msg_types. Prototype was for enum rnbd_msg_type instead
Warning: drivers/block/rnbd/rnbd-proto.h:50 struct member '__padding'
 not described in 'rnbd_msg_hdr'
Warning: drivers/block/rnbd/rnbd-proto.h:53 This comment starts with
 '/**', but isn't a kernel-doc comment.
 * We allow to map RO many times and RW only once. We allow to map yet another
Warning: drivers/block/rnbd/rnbd-proto.h:81 struct member 'reserved'
 not described in 'rnbd_msg_sess_info'
Warning: drivers/block/rnbd/rnbd-proto.h:92 struct member 'reserved'
 not described in 'rnbd_msg_sess_info_rsp'
Warning: drivers/block/rnbd/rnbd-proto.h:107 struct member 'resv1'
 not described in 'rnbd_msg_open'
Warning: drivers/block/rnbd/rnbd-proto.h:107 struct member 'dev_name'
 not described in 'rnbd_msg_open'
Warning: drivers/block/rnbd/rnbd-proto.h:107 struct member 'reserved'
 not described in 'rnbd_msg_open'
Warning: drivers/block/rnbd/rnbd-proto.h:158 struct member 'reserved'
 not described in 'rnbd_msg_open_rsp'
Warning: drivers/block/rnbd/rnbd-proto.h:189 bad line:

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Jack Wang <jinpu.wang@ionos.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-01 07:19:50 -07:00
Fengnan Chang
4d0e1f2139 blk-mq: use queue_hctx in blk_mq_map_queue_type
Some callers of blk_mq_map_queue_type(), such as
blk_mq_cpu_mapped_to_hctx(), do not grab 'q_usage_counter', so we need
to protect 'queue_hw_ctx' through RCU.

Also checked all other functions; there are no more missed cases.

Fixes: 89e1fb7cef ("blk-mq: fix potential uaf for 'queue_hw_ctx'")
Reported-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-12-01 07:18:31 -07:00
Gao Xiang
0bdbf89a8b erofs: switch on-disk header erofs_fs.h to MIT license
Switch to the permissive MIT license to make the EROFS on-disk format
more interoperable across various use cases.

It was previously recommended by the Composefs folks, for example:
https://github.com/composefs/composefs/pull/216#discussion_r1356409501

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Acked-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Acked-by: Yue Hu <zbestahu@gmail.com>
Acked-by: Chao Yu <chao@kernel.org>
Acked-by: Jianan Huang <jnhuang95@gmail.com>
Acked-by: Chunhai Guo <guochunhai@vivo.com>
Acked-by: Hongbo Li <lihongbo22@huawei.com>
2025-12-01 15:25:43 +08:00
David Howells
e1469f5608 cifs: Use netfs_alloc/free_folioq_buffer()
Use netfs_alloc/free_folioq_buffer() rather than doing its own version.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Steve French <sfrench@samba.org>
cc: Paulo Alcantara <pc@manguebit.org>
cc: Shyam Prasad N <sprasad@microsoft.com>
cc: Tom Talpey <tom@talpey.com> (RDMA, smbdirect)
cc: linux-cifs@vger.kernel.org
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:46 -06:00
Bharath SM
f78b83dce2 smb: client: show smb lease key in open_dirs output
Show cached directory smb lease key in /proc/fs/cifs/open_dirs
for debugging purposes.

Signed-off-by: Bharath SM <bharathsm@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:46 -06:00
Bharath SM
9cfcd8601e smb: client: show smb lease key in open_files output
Add the SMB lease key in /proc/fs/cifs/open_files for
debugging purposes.

Signed-off-by: Bharath SM <bharathsm@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
Qianchang Zhao
1fab1fa091 ksmbd: ipc: fix use-after-free in ipc_msg_send_request
ipc_msg_send_request() waits for a generic netlink reply using an
ipc_msg_table_entry on the stack. The generic netlink handler
(handle_generic_event()/handle_response()) fills entry->response under
ipc_msg_table_lock, but ipc_msg_send_request() used to validate and free
entry->response without holding the same lock.

Under high concurrency this allows a race where handle_response() is
copying data into entry->response while ipc_msg_send_request() has just
freed it, leading to a slab-use-after-free reported by KASAN in
handle_generic_event():

  BUG: KASAN: slab-use-after-free in handle_generic_event+0x3c4/0x5f0 [ksmbd]
  Write of size 12 at addr ffff888198ee6e20 by task pool/109349
  ...
  Freed by task:
    kvfree
    ipc_msg_send_request [ksmbd]
    ksmbd_rpc_open -> ksmbd_session_rpc_open [ksmbd]

Fix by:
- Taking ipc_msg_table_lock in ipc_msg_send_request() while validating
  entry->response, freeing it when invalid, and removing the entry from
  ipc_msg_table.
- Returning the final entry->response pointer to the caller only after
  the hash entry is removed under the lock.
- Returning NULL in the error path, preserving the original API
  semantics.

This makes all accesses to entry->response consistent with
handle_response(), which already updates and fills the response buffer
under ipc_msg_table_lock, and closes the race that allowed the UAF.
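
A self-contained illustration of the ownership rule the fix enforces
(generic names, not the ksmbd symbols): the waiter detaches the entry and
decides the fate of ->response while holding the same lock the responder
uses to fill it.

  #include <linux/hashtable.h>
  #include <linux/mutex.h>
  #include <linux/slab.h>

  struct msg_entry {
          struct hlist_node node;
          void *response;
          size_t expected_sz;
  };

  static DEFINE_MUTEX(msg_table_lock);    /* lock type is illustrative */

  /* Waiter side, after the wait completes or times out */
  static void *msg_take_response(struct msg_entry *entry, size_t got_sz)
  {
          void *resp;

          mutex_lock(&msg_table_lock);
          hash_del(&entry->node);         /* responder can no longer find it */
          resp = entry->response;
          if (resp && got_sz != entry->expected_sz) {
                  kvfree(resp);           /* invalid reply: freed under the lock */
                  resp = NULL;            /* caller sees NULL, as before */
          }
          mutex_unlock(&msg_table_lock);

          return resp;
  }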

Cc: stable@vger.kernel.org
Reported-by: Qianchang Zhao <pioooooooooip@gmail.com>
Reported-by: Zhitong Liu <liuzhitong1993@gmail.com>
Signed-off-by: Qianchang Zhao <pioooooooooip@gmail.com>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
Stefan Metzmacher
dc10cf1368 smb: client: relax WARN_ON_ONCE(SMBDIRECT_SOCKET_*) checks in recv_done() and smbd_conn_upcall()
sc->first_error might already be set, making sc->status unexpected, so
avoid the WARN[_ON]_ONCE() when sc->first_error is already set and take
a usable error path instead.

While there, set sc->first_error as soon as possible.

This is based on a problem seen in similar places on the server, where
a meaningful WARN_ONCE() that prints details about the connection was
already very useful for finding the problem.

This is much more useful:

[  309.560973] expected[NEGOTIATE_NEEDED] != RDMA_CONNECT_RUNNING
first_error=0 local=192.168.0.200:445 remote=192.168.0.100:60445
[  309.561034] WARNING: CPU: 2 PID: 78 at transport_rdma.c:643
recv_done+0x2fa/0x3d0 [ksmbd]

than what we had before (only):

[  894.140316] WARNING: CPU: 1 PID: 116 at
fs/smb/server/transport_rdma.c:642 recv_done+0x308/0x360 [ksmbd]

Fixes: 58dfba8a2d ("smb: client/smbdirect: replace SMBDIRECT_SOCKET_CONNECTING with more detailed states")
Cc: Steve French <smfrench@gmail.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Long Li <longli@microsoft.com>
Cc: Namjae Jeon <linkinjeon@kernel.org>
Cc: Paulo Alcantara <pc@manguebit.org>
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
Stefan Metzmacher
425c32750b smb: server: relax WARN_ON_ONCE(SMBDIRECT_SOCKET_*) checks in recv_done() and smb_direct_cm_handler()
Namjae reported the following:

I have a simple file copy test with windows 11 client, and get the
following error message.

[  894.140312] ------------[ cut here ]------------
[  894.140316] WARNING: CPU: 1 PID: 116 at
fs/smb/server/transport_rdma.c:642 recv_done+0x308/0x360 [ksmbd]
[  894.140335] Modules linked in: ksmbd cmac nls_utf8 nls_ucs2_utils
libarc4 nls_iso8859_1 snd_hda_codec_intelhdmi snd_hda_codec_hdmi
snd_hda_codec_alc882 snd_hda_codec_realtek_lib snd_hda_codec_generic
rpcrdma intel_rapl_msr rdma_ucm intel_rapl_common snd_hda_intel
ib_iser snd_hda_codec intel_uncore_frequency
intel_uncore_frequency_common snd_hda_core intel_tcc_cooling
x86_pkg_temp_thermal intel_powerclamp snd_intel_dspcfg libiscsi
snd_intel_sdw_acpi coretemp scsi_transport_iscsi snd_hwdep kvm_intel
i915 snd_pcm ib_umad rdma_cm snd_seq_midi ib_ipoib kvm
snd_seq_midi_event iw_cm snd_rawmidi ghash_clmulni_intel ib_cm
aesni_intel snd_seq mei_hdcp drm_buddy rapl snd_seq_device eeepc_wmi
asus_wmi snd_timer intel_cstate ttm snd drm_client_lib
drm_display_helper sparse_keymap soundcore platform_profile mxm_wmi
wmi_bmof joydev mei_me cec acpi_pad mei rc_core drm_kms_helper
input_leds i2c_algo_bit mac_hid sch_fq_codel msr parport_pc ppdev lp
nfsd parport auth_rpcgss binfmt_misc nfs_acl lockd grace drm sunrpc
ramoops efi_pstore
[  894.140414]  reed_solomon pstore_blk pstore_zone autofs4 btrfs
blake2b_generic xor raid6_pq mlx5_ib ib_uverbs ib_core hid_generic uas
usbhid hid r8169 i2c_i801 usb_storage i2c_mux i2c_smbus mlx5_core
realtek ahci mlxfw psample libahci video wmi [last unloaded: ksmbd]
[  894.140442] CPU: 1 UID: 0 PID: 116 Comm: kworker/1:1H Tainted: G
    W           6.18.0-rc5+ #1 PREEMPT(voluntary)
[  894.140447] Tainted: [W]=WARN
[  894.140448] Hardware name: System manufacturer System Product
Name/H110M-K, BIOS 3601 12/12/2017
[  894.140450] Workqueue: ib-comp-wq ib_cq_poll_work [ib_core]
[  894.140476] RIP: 0010:recv_done+0x308/0x360 [ksmbd]
[  894.140487] Code: 2e f2 ff ff 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc
cc cc cc 41 8b 55 10 49 8b 75 08 b9 02 00 00 00 e8 ed f4 f2 c3 e9 59
fd ff ff <0f> 0b e9 02 ff ff ff 49 8b 74 24 28 49 8d 94 24 c8 00 00 00
bf 00
[  894.140490] RSP: 0018:ffffa47ec03f3d78 EFLAGS: 00010293
[  894.140492] RAX: 0000000000000001 RBX: ffff8eb84c818000 RCX: 000000010002ba00
[  894.140494] RDX: 0000000037600001 RSI: 0000000000000083 RDI: ffff8eb92ec9ee40
[  894.140496] RBP: ffffa47ec03f3da0 R08: 0000000000000000 R09: 0000000000000010
[  894.140498] R10: ffff8eb801705680 R11: fefefefefefefeff R12: ffff8eb7454b8810
[  894.140499] R13: ffff8eb746deb988 R14: ffff8eb746deb980 R15: ffff8eb84c818000
[  894.140501] FS:  0000000000000000(0000) GS:ffff8eb9a7355000(0000)
knlGS:0000000000000000
[  894.140503] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  894.140505] CR2: 00002d9401d60018 CR3: 0000000010a40006 CR4: 00000000003726f0
[  894.140507] Call Trace:
[  894.140509]  <TASK>
[  894.140512]  __ib_process_cq+0x8e/0x190 [ib_core]
[  894.140530]  ib_cq_poll_work+0x2f/0x90 [ib_core]
[  894.140545]  process_scheduled_works+0xd4/0x430
[  894.140554]  worker_thread+0x12a/0x270
[  894.140558]  kthread+0x10d/0x250
[  894.140564]  ? __pfx_worker_thread+0x10/0x10
[  894.140567]  ? __pfx_kthread+0x10/0x10
[  894.140571]  ret_from_fork+0x11a/0x160
[  894.140574]  ? __pfx_kthread+0x10/0x10
[  894.140577]  ret_from_fork_asm+0x1a/0x30
[  894.140584]  </TASK>
[  894.140585] ---[ end trace 0000000000000000 ]---
[  894.154363] ------------[ cut here ]------------
[  894.154367] WARNING: CPU: 3 PID: 5543 at
fs/smb/server/transport_rdma.c:1728 smb_direct_cm_handler+0x121/0x130
[ksmbd]
[  894.154384] Modules linked in: ksmbd cmac nls_utf8 nls_ucs2_utils
libarc4 nls_iso8859_1 snd_hda_codec_intelhdmi snd_hda_codec_hdmi
snd_hda_codec_alc882 snd_hda_codec_realtek_lib snd_hda_codec_generic
rpcrdma intel_rapl_msr rdma_ucm intel_rapl_common snd_hda_intel
ib_iser snd_hda_codec intel_uncore_frequency
intel_uncore_frequency_common snd_hda_core intel_tcc_cooling
x86_pkg_temp_thermal intel_powerclamp snd_intel_dspcfg libiscsi
snd_intel_sdw_acpi coretemp scsi_transport_iscsi snd_hwdep kvm_intel
i915 snd_pcm ib_umad rdma_cm snd_seq_midi ib_ipoib kvm
snd_seq_midi_event iw_cm snd_rawmidi ghash_clmulni_intel ib_cm
aesni_intel snd_seq mei_hdcp drm_buddy rapl snd_seq_device eeepc_wmi
asus_wmi snd_timer intel_cstate ttm snd drm_client_lib
drm_display_helper sparse_keymap soundcore platform_profile mxm_wmi
wmi_bmof joydev mei_me cec acpi_pad mei rc_core drm_kms_helper
input_leds i2c_algo_bit mac_hid sch_fq_codel msr parport_pc ppdev lp
nfsd parport auth_rpcgss binfmt_misc nfs_acl lockd grace drm sunrpc
ramoops efi_pstore
[  894.154456]  reed_solomon pstore_blk pstore_zone autofs4 btrfs
blake2b_generic xor raid6_pq mlx5_ib ib_uverbs ib_core hid_generic uas
usbhid hid r8169 i2c_i801 usb_storage i2c_mux i2c_smbus mlx5_core
realtek ahci mlxfw psample libahci video wmi [last unloaded: ksmbd]
[  894.154483] CPU: 3 UID: 0 PID: 5543 Comm: kworker/3:6 Tainted: G
    W           6.18.0-rc5+ #1 PREEMPT(voluntary)
[  894.154487] Tainted: [W]=WARN
[  894.154488] Hardware name: System manufacturer System Product
Name/H110M-K, BIOS 3601 12/12/2017
[  894.154490] Workqueue: ib_cm cm_work_handler [ib_cm]
[  894.154499] RIP: 0010:smb_direct_cm_handler+0x121/0x130 [ksmbd]
[  894.154507] Code: e7 e8 13 b1 ef ff 44 89 e1 4c 89 ee 48 c7 c7 80
d7 59 c1 48 89 c2 e8 2e 4d ef c3 31 c0 5b 41 5c 41 5d 41 5e 5d c3 cc
cc cc cc <0f> 0b eb a5 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90
90 90
[  894.154510] RSP: 0018:ffffa47ec1b27c00 EFLAGS: 00010206
[  894.154512] RAX: ffffffffc1304e00 RBX: ffff8eb89ae50880 RCX: 0000000000000000
[  894.154514] RDX: ffff8eb730960000 RSI: ffffa47ec1b27c60 RDI: ffff8eb7454b9400
[  894.154515] RBP: ffffa47ec1b27c20 R08: 0000000000000002 R09: ffff8eb730b8c18b
[  894.154517] R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000009
[  894.154518] R13: ffff8eb7454b9400 R14: ffff8eb7454b8810 R15: ffff8eb815c43000
[  894.154520] FS:  0000000000000000(0000) GS:ffff8eb9a7455000(0000)
knlGS:0000000000000000
[  894.154522] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  894.154523] CR2: 00007fe1310e99d0 CR3: 0000000010a40005 CR4: 00000000003726f0
[  894.154525] Call Trace:
[  894.154527]  <TASK>
[  894.154530]  cma_cm_event_handler+0x27/0xd0 [rdma_cm]
[  894.154541]  cma_ib_handler+0x99/0x2e0 [rdma_cm]
[  894.154551]  cm_process_work+0x28/0xf0 [ib_cm]
[  894.154557]  cm_queue_work_unlock+0x41/0xf0 [ib_cm]
[  894.154563]  cm_work_handler+0x2eb/0x25b0 [ib_cm]
[  894.154568]  ? pwq_activate_first_inactive+0x52/0x70
[  894.154572]  ? pwq_dec_nr_in_flight+0x244/0x330
[  894.154575]  process_scheduled_works+0xd4/0x430
[  894.154579]  worker_thread+0x12a/0x270
[  894.154581]  kthread+0x10d/0x250
[  894.154585]  ? __pfx_worker_thread+0x10/0x10
[  894.154587]  ? __pfx_kthread+0x10/0x10
[  894.154590]  ret_from_fork+0x11a/0x160
[  894.154593]  ? __pfx_kthread+0x10/0x10
[  894.154596]  ret_from_fork_asm+0x1a/0x30
[  894.154602]  </TASK>
[  894.154603] ---[ end trace 0000000000000000 ]---
[  894.154931] ksmbd: smb_direct: disconnected
[  894.157278] ksmbd: smb_direct: disconnected

I guess sc->first_error is already set, making sc->status unexpected,
so this should avoid the WARN[_ON]_ONCE() when sc->first_error is
already set and have a usable error path.

While there, set sc->first_error as soon as possible.

v1 of this patch revealed the real problem with this message:

[  309.560973] expected[NEGOTIATE_NEEDED] != RDMA_CONNECT_RUNNING
first_error=0 local=192.168.0.200:445 remote=192.168.0.100:60445
[  309.561034] WARNING: CPU: 2 PID: 78 at transport_rdma.c:643
recv_done+0x2fa/0x3d0 [ksmbd]

Some drivers (at least mlx5_ib) might post a recv completion before
RDMA_CM_EVENT_ESTABLISHED, so we need to adjust our expectation in that
case.

Fixes: e2d5e516c6 ("smb: server: only turn into SMBDIRECT_SOCKET_CONNECTED when negotiation is done")
Cc: Steve French <smfrench@gmail.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Long Li <longli@microsoft.com>
Cc: Namjae Jeon <linkinjeon@kernel.org>
Cc: Paulo Alcantara <pc@manguebit.org>
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
Stefan Metzmacher
1adb2dab97 smb: smbdirect: introduce SMBDIRECT_CHECK_STATUS_{WARN,DISCONNECT}()
These will be used in various places to assert the current status,
mostly during the connect and negotiation phase. They will replace the
WARN_ON_ONCE(sc->status != ...) calls, which are of little use for
identifying the problem that happened.

As a start client and server will need to define their own
__SMBDIRECT_SOCKET_DISCONNECT(__sc) macro in order to use
SMBDIRECT_CHECK_STATUS_DISCONNECT().

Cc: Steve French <smfrench@gmail.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Long Li <longli@microsoft.com>
Cc: Namjae Jeon <linkinjeon@kernel.org>
Cc: Paulo Alcantara <pc@manguebit.org>
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
Stefan Metzmacher
1f3fd108c5 smb: smbdirect: introduce SMBDIRECT_DEBUG_ERR_PTR() helper
This can be used like this:

  int err = somefunc();
  pr_warn("err=%1pe\n", SMBDIRECT_DEBUG_ERR_PTR(err));

This will be used in the following fixes in order
to be prepared to identify real world problems
more easily.

Cc: Steve French <smfrench@gmail.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Long Li <longli@microsoft.com>
Cc: Namjae Jeon <linkinjeon@kernel.org>
Cc: Paulo Alcantara <pc@manguebit.org>
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
Qianchang Zhao
991f8a79db ksmbd: vfs: fix race on m_flags in vfs_cache
ksmbd maintains delete-on-close and pending-delete state in
ksmbd_inode->m_flags. In vfs_cache.c this field is accessed under
inconsistent locking: some paths read and modify m_flags under
ci->m_lock while others do so without taking the lock at all.

Examples:

 - ksmbd_query_inode_status() and __ksmbd_inode_close() use
   ci->m_lock when checking or updating m_flags.
 - ksmbd_inode_pending_delete(), ksmbd_set_inode_pending_delete(),
   ksmbd_clear_inode_pending_delete() and ksmbd_fd_set_delete_on_close()
   used to read and modify m_flags without ci->m_lock.

This creates a potential data race on m_flags when multiple threads
open, close and delete the same file concurrently. In the worst case
delete-on-close and pending-delete bits can be lost or observed in an
inconsistent state, leading to confusing delete semantics (files that
stay on disk after delete-on-close, or files that disappear while still
in use).

Fix it by:

 - Making ksmbd_query_inode_status() look at m_flags under ci->m_lock
   after dropping inode_hash_lock.
 - Adding ci->m_lock protection to all helpers that read or modify
   m_flags (ksmbd_inode_pending_delete(), ksmbd_set_inode_pending_delete(),
   ksmbd_clear_inode_pending_delete(), ksmbd_fd_set_delete_on_close()).
 - Keeping the existing ci->m_lock protection in __ksmbd_inode_close(),
   and moving the actual unlink/xattr removal outside the lock.

This unifies the locking around m_flags and removes the data race while
preserving the existing delete-on-close behaviour.

Reported-by: Qianchang Zhao <pioooooooooip@gmail.com>
Reported-by: Zhitong Liu <liuzhitong1993@gmail.com>
Signed-off-by: Qianchang Zhao <pioooooooooip@gmail.com>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
Thorsten Blum
dc81b8f4f0 ksmbd: Replace strcpy + strcat to improve convert_to_nt_pathname
strcpy() is deprecated [1] and using strcat() is discouraged. Replace
them by assigning the prefix directly and by using memcpy() to copy the
pathname. Using memcpy() is safe because we already know the length of
the source string and that it is guaranteed to be NUL-terminated.

Allocate only as many bytes as needed and replace kzalloc() with
kmalloc() since memcpy() overwrites the entire buffer anyway.

Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strcpy [1]
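
The construction then reduces to something like the sketch below; the
prefix, helper name and buffer layout are illustrative, not the exact
ksmbd code:

  #include <linux/slab.h>
  #include <linux/string.h>

  static char *build_nt_pathname(const char *path)
  {
          size_t len = strlen(path);
          char *out;

          /* '\' prefix + path + NUL: exactly as many bytes as needed */
          out = kmalloc(len + 2, GFP_KERNEL);
          if (!out)
                  return NULL;

          out[0] = '\\';
          memcpy(out + 1, path, len + 1); /* copies the terminating NUL too */
          return out;
  }
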
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
ChenXiaoSong
c4a2a49f7d smb: move FILE_SYSTEM_ATTRIBUTE_INFO to common/fscc.h
Modify the following places:

  - struct filesystem_attribute_info -> FILE_SYSTEM_ATTRIBUTE_INFO
  - Remove MIN_FS_ATTR_INFO_SIZE definition
  - Introduce MAX_FS_NAME_LEN
  - max_len of FileFsAttributeInformation -> sizeof(FILE_SYSTEM_ATTRIBUTE_INFO) + MAX_FS_NAME_LEN
  - min_len of FileFsAttributeInformation -> sizeof(FILE_SYSTEM_ATTRIBUTE_INFO)
  - SMB2_QFS_attr(): memcpy(..., min_len)

Then move FILE_SYSTEM_ATTRIBUTE_INFO to common header file.

I have tested the relevant code related to FILE_SYSTEM_ATTRIBUTE_INFO (Link[1]).

Link[1]: https://chenxiaosong.com/en/FILE_SYSTEM_ATTRIBUTE_INFO.html
Suggested-by: Namjae Jeon <linkinjeon@kernel.org>
Tested-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
Aaditya Kansal
e4442b2e95 ksmbd: implement error handling for STATUS_INFO_LENGTH_MISMATCH in smb server
Add STATUS_INFO_LENGTH_MISMATCH mapping to EMSGSIZE.
Currently, STATUS_INFO_LENGTH_MISMATCH has no mapping to any error code,
making it difficult to distinguish between invalid parameters and length
mismatch.

Map STATUS_INFO_LENGTH_MISMATCH to EMSGSIZE while keeping the EINVAL for
invalid parameters. Although the buf_len check only checks for buf_size
being less than required, there was no error code for lower buf_size.
Hence, EMSGSIZE is used.
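
The mapping in essence, sketched with raw NTSTATUS values; the helper
name is made up for illustration:

  #include <linux/errno.h>
  #include <linux/types.h>

  static int nt_status_to_errno_sketch(u32 status)
  {
          switch (status) {
          case 0xC0000004:        /* STATUS_INFO_LENGTH_MISMATCH */
                  return -EMSGSIZE;       /* buffer shorter than required */
          case 0xC000000D:        /* STATUS_INVALID_PARAMETER */
                  return -EINVAL;
          default:
                  return -EIO;
          }
  }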

Signed-off-by: Aaditya Kansal <aadityakansal390@gmail.com>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
Namjae Jeon
b39a1833cc ksmbd: fix use-after-free in ksmbd_tree_connect_put under concurrency
Under high concurrency, a tree-connection object (tcon) can be freed on
a disconnect path while another path still holds a reference and later
executes *_put()/writes on it.

Reported-by: Qianchang Zhao <pioooooooooip@gmail.com>
Reported-by: Zhitong Liu <liuzhitong1993@gmail.com>
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
Qingfang Deng
3316a8fc84 ksmbd: server: avoid busy polling in accept loop
The ksmbd listener thread was using busy waiting on a listening socket by
calling kernel_accept() with SOCK_NONBLOCK and retrying every 100ms on
-EAGAIN. Since this thread is dedicated to accepting new connections,
there is no need for non-blocking mode.

Switch to a blocking accept() call instead, allowing the thread to sleep
until a new connection arrives. This avoids unnecessary wakeups and CPU
usage. During teardown, call shutdown() on the listening socket so that
accept() returns -EINVAL and the thread exits cleanly.

The socket release mutex is redundant because kthread_stop() blocks until
the listener thread returns, guaranteeing safe teardown ordering.

Also remove sk_rcvtimeo and sk_sndtimeo assignments, which only caused
accept() to return -EAGAIN prematurely.
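
A compressed sketch of the resulting pattern; names are simplified and
this is not the exact ksmbd code:

  #include <linux/kthread.h>
  #include <linux/net.h>

  static int listener_fn(void *arg)
  {
          struct socket *lsock = arg;
          struct socket *client;

          while (!kthread_should_stop()) {
                  /* Blocking accept: the thread sleeps until a peer connects */
                  if (kernel_accept(lsock, &client, 0) < 0)
                          break;  /* -EINVAL once the socket has been shut down */
                  /* hand 'client' off to a connection handler here */
          }
          return 0;
  }

  static void listener_stop(struct socket *lsock, struct task_struct *thread)
  {
          /* Wake the blocked accept, then wait for the thread to exit */
          kernel_sock_shutdown(lsock, SHUT_RDWR);
          kthread_stop(thread);
  }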

Signed-off-by: Qingfang Deng <dqfext@gmail.com>
Reviewed-by: Stefan Metzmacher <metze@samba.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
ChenXiaoSong
5003ad718a smb: move create_durable_reconn to common/smb2pdu.h
The fields in struct create_durable_reconn_req and struct create_durable
are exactly the same, so remove create_durable_reconn_req from server,
and use typedef to define both create_durable_req_t and
create_durable_reconn_t for a single struct.

Rename the following places:

  - struct create_durable -> create_durable_req_t
  - struct create_durable_reconn_req -> create_durable_reconn_t

The documentation references are:

  - SMB2_CREATE_DURABLE_HANDLE_REQUEST   in MS-SMB2 2.2.13.2.3
  - SMB2_CREATE_DURABLE_HANDLE_RECONNECT in MS-SMB2 2.2.13.2.4
  - SMB2_FILEID in MS-SMB2 2.2.14.1

Descriptions of the struct fields:

  - __u8  Reserved[16]: DurableRequest field of SMB2_CREATE_DURABLE_HANDLE_REQUEST.
                        A 16-byte field that MUST be reserved.
  - __u64 PersistentFileId: Persistent field of 2.2.14.1 SMB2_FILEID
  - __u64 VolatileFileId: Volatile field of 2.2.14.1 SMB2_FILEID
  - struct Fid: Data field of SMB2_CREATE_DURABLE_HANDLE_RECONNECT.
                An SMB2_FILEID structure, as specified in section 2.2.14.1.

Suggested-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
ChenXiaoSong
e7e60e8bfc smb: fix some warnings reported by scripts/checkpatch.pl
Fix the following warnings:

  WARNING: __always_unused or __maybe_unused is preferred over \
                                    __attribute__((__unused__))
  WARNING: Prefer __packed over __attribute__((packed))

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
ChenXiaoSong
95e8c1bfa5 smb: do some cleanups
Modify the following places:

  - Add documentation references
  - ATTR_REPARSE -> ATTR_REPARSE_POINT: consistent with MS-SMB 2.2.1.2.1
  - Remove unused File Attribute flags from the server; if the server uses
    them in the future, we can move the client-side definitions to common
  - Remove unused SMB1_CLIENT_GUID_SIZE from server

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
ChenXiaoSong
464b913993 smb: move FILE_SYSTEM_SIZE_INFO to common/fscc.h
Rename the following places:

  - struct filesystem_info -> FILE_SYSTEM_SIZE_INFO
  - FILE_SYSTEM_INFO -> FILE_SYSTEM_SIZE_INFO
  - FreeAllocationUnits -> AvailableAllocationUnits: consistent with MS-FSCC 2.5.8

Then move duplicate definitions to common header file.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
ChenXiaoSong
d7edd3892d smb: move some duplicate struct definitions to common/fscc.h
Modify the following places:

  - smb2_file_ntwrk_info -> smb2_file_network_open_info
  - struct filesystem_device_info -> FILE_SYSTEM_DEVICE_INFO
  - struct file_directory_info -> FILE_DIRECTORY_INFO
  - struct file_full_directory_info -> FILE_FULL_DIRECTORY_INFO
  - struct file_both_directory_info -> FILE_BOTH_DIRECTORY_INFO
  - struct file_id_full_dir_info -> FILE_ID_FULL_DIR_INFO
  - struct filesystem_posix_info -> FILE_SYSTEM_POSIX_INFO

The fields of these structures are exactly the same on both client and
server, so move duplicate definitions to common header file.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
ChenXiaoSong
84d8d4cf88 smb: move list of FileSystemAttributes to common/fscc.h
These definitions are exactly the same on both client and server,
so move them to new common header file.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
ChenXiaoSong
d8ac987918 smb: move SMB_NEGOTIATE_REQ to common/smb2pdu.h
Modify the following places:

  - negotiate_req -> smb_negotiate_req
  - NEGOTIATE_REQ -> SMB_NEGOTIATE_REQ
  - negotiate_rsp -> smb_negotiate_rsp
  - NEGOTIATE_RSP -> SMB_NEGOTIATE_RSP

Then move SMB_NEGOTIATE_REQ to common header file.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
ZhangGuoDong
1172d85984 smb: move some duplicate definitions to common/smb2pdu.h
In order to maintain the code more easily, move duplicate definitions to
common header file.

There are only 4 different definitions, the client uses cpu_to_le16()
but the server does not:

  - RSS_CAPABLE
  - RDMA_CAPABLE
  - INTERNETWORK
  - INTERNETWORKV6

The other definitions are exactly the same on both client and server.

The struct smb_hdr is defined in MS-CIFS and MS-SMB, but it's also used by
SMB2 and SMB3, so move it to the common smb2pdu.h.

Co-developed-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ZhangGuoDong <zhangguodong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:45 -06:00
ChenXiaoSong
96721fd292 smb: move create_durable_rsp_v2 to common/smb2pdu.h
Modify the following places:

  - some fields in "struct create_durable_v2_rsp" ->
                       struct durable_context_v2_rsp
  - durable_reconnect_context_v2_rsp -> durable_context_v2_rsp
  - create_durable_v2_rsp -> create_durable_rsp_v2

Then move them to common header file.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
ChenXiaoSong
81a45de432 smb: move create_durable_handle_reconnect_v2 to common/smb2pdu.h
Modify the following places:

  - some fields in "struct create_durable_reconn_v2_req" ->
                        struct durable_reconnect_context_v2
  - create_durable_reconn_v2_req -> create_durable_handle_reconnect_v2

Then move duplicate definitions to common header file.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
ChenXiaoSong
833a75fc9e smb: move create_durable_req_v2 to common/smb2pdu.h
Modify the following places:

  - some fields in "struct create_durable_req_v2" ->
                           struct durable_context_v2
  - durable_context_v2 -> durable_context_v2_req
  - create_durable_v2 -> create_durable_req_v2

Then move duplicate definitions to common header file.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
ChenXiaoSong
884a1d4e9c smb: move MAX_CIFS_SMALL_BUFFER_SIZE to common/smbglob.h
In order to maintain the code more easily, move duplicate definitions to
common header file.

By the way, add the copyright and author information for Namjae to
common/smbglob.h.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
ChenXiaoSong
4a7f960780 smb/client: fix CAP_BULK_TRANSFER value
See MS-CIFS 2.2.4.52.2.

Keep it consistent with the value in the documentation.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
Qianchang Zhao
5d510ac316 ksmbd: skip lock-range check on equal size to avoid size==0 underflow
When size equals the current i_size (including 0), the code used to call
check_lock_range(filp, i_size, size - 1, WRITE), which computes `size - 1`
and can underflow for size==0. Skip the equal case.

Cc: stable@vger.kernel.org
Reported-by: Qianchang Zhao <pioooooooooip@gmail.com>
Reported-by: Zhitong Liu <liuzhitong1993@gmail.com>
Signed-off-by: Qianchang Zhao <pioooooooooip@gmail.com>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
ZhangGuoDong
9c98f5eec8 smb: move resume_key_ioctl_rsp to common/smb2pdu.h
Rename 2 places:

  - resume_key_req -> resume_key_ioctl_rsp
  - server: ResumeKey -> ResumeKeyU64

Merge the struct members of the server and the client, then move duplicate
definitions to common header file.

Co-developed-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ZhangGuoDong <zhangguodong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
ZhangGuoDong
cc26f593dc smb: move copychunk definitions to common/smb2pdu.h
Rename 3 places:

  - copychunk_ioctl -> copychunk_ioctl_req
  - copychunk -> srv_copychunk
  - server: ResumeKey -> SourceKeyU64

Merge the struct members of the server and the client, then move duplicate
definitions to common header file.

Co-developed-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ZhangGuoDong <zhangguodong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
ZhangGuoDong
7844d50ca2 smb: move smb_sockaddr_in and smb_sockaddr_in6 to common/smb2pdu.h
Rename 4 places:

  - iface_info_ipv4 -> smb_sockaddr_in
  - iface_info_ipv6 -> smb_sockaddr_in6
  - IPv4address -> IPv4Address
  - IPv6address -> IPv6Address

Then move duplicate definitions to common header file.

Co-developed-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ZhangGuoDong <zhangguodong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
ZhangGuoDong
cd311445d9 smb: move SMB1_PROTO_NUMBER to common/smbglob.h
Replace the constant of client with SMB1_PROTO_NUMBER, then move the
macro definition from server/smb_common.h to common/smbglob.h.

Co-developed-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ZhangGuoDong <zhangguodong@kylinos.cn>
Suggested-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
ZhangGuoDong
36c31540cf smb: move get_rfc1002_len() to common/smbglob.h
Rename get_rfc1002_length() to get_rfc1002_len(), then move duplicate
definitions to common header file.

Co-developed-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ZhangGuoDong <zhangguodong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
ZhangGuoDong
34cf191bb6 smb: move smb_version_values to common/smbglob.h
Merge the struct members of the server and the client:

  - req_capabilities:		from client
  - header_preamble_size:	from client
  - cap_unicode:		from client
  - capabilities:		from server, rename to req_capabilities
  - max_read_size:		from server
  - max_write_size:		from server
  - max_trans_size:		from server
  - max_credits:		from server
  - create_durable_size:	from server
  - create_durable_v2_size:	from server
  - create_mxac_size:		from server
  - create_disk_id_size:	from server
  - create_posix_size:		from server

Then move duplicate definitions to common header file.

Co-developed-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ZhangGuoDong <zhangguodong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:44 -06:00
ZhangGuoDong
94b955167e smb: rename common/cifsglob.h to common/smbglob.h
"cifs" is a legacy name, rename it to "smbglob.h" according to
Namjae's suggestions.

Co-developed-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Signed-off-by: ZhangGuoDong <zhangguodong@kylinos.cn>
Suggested-by: Namjae Jeon <linkinjeon@kernel.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:43 -06:00
ChenXiaoSong
7d9f51d36b smb/server: update some misguided comment of smb2_0_server_cmds proc
These functions return an error code rather than always returning 0.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:43 -06:00
ChenXiaoSong
a3c4445fdb smb/server: fix return value of smb2_oplock_break()
smb2_oplock_break() should return error code when an error occurs,
__process_request() will print the error messages.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:43 -06:00
ChenXiaoSong
269df046c1 smb/server: fix return value of smb2_ioctl()
__process_request() will not print error messages if smb2_ioctl()
always returns 0.

Fix this by returning the correct value at the end of the function.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:43 -06:00
ChenXiaoSong
dafe22bc67 smb/server: fix return value of smb2_query_dir()
__process_request() will not print error messages if smb2_query_dir()
always returns 0.

Fix this by returning the correct value at the end of function.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:43 -06:00
ChenXiaoSong
d1a30b9ddc smb/server: fix return value of smb2_notify()
smb2_notify() should return an error code when an error occurs;
__process_request() will then print the error messages.

I may implement the SMB2 CHANGE_NOTIFY response (see MS-SMB2 2.2.36)
in the future.

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:43 -06:00
ChenXiaoSong
c5b462e353 smb/server: fix return value of smb2_read()
STATUS_END_OF_FILE maps to the Linux error -ENODATA. Perhaps in the future
we can move client/smb2maperror.c into common/ and then call
map_smb2_to_linux_error() to get the Linux error.
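
For illustration, a hedged sketch of the mapping described above (not the
actual ksmbd code; the response/status field names are assumptions):

  /* hedged sketch: translate the SMB2 status into a negative errno so
   * that __process_request() logs the failure instead of staying silent */
  rsp->hdr.Status = STATUS_END_OF_FILE;   /* field names assumed */
  return -ENODATA;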

Signed-off-by: ChenXiaoSong <chenxiaosong@kylinos.cn>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:43 -06:00
Eric Biggers
3a597e6e97 ksmbd: Use HMAC-MD5 library for NTLMv2
For the HMAC-MD5 computations in NTLMv2, use the HMAC-MD5 library
instead of a "hmac(md5)" crypto_shash.  This is simpler and faster.
With the library there's no need to allocate memory, no need to handle
errors, and the HMAC-MD5 code is accessed directly without inefficient
indirect calls and other unnecessary API overhead.

To preserve the existing behavior of NTLMv2 support being disabled when
the kernel is booted with "fips=1", make ksmbd_auth_ntlmv2() check
fips_enabled itself.  Previously it relied on the error from
crypto_alloc_shash("hmac(md5)") being bubbled up.  I don't know for sure
that this is actually needed, but this preserves the existing behavior.
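A minimal sketch of the fips_enabled check described above (the error value
is illustrative, not necessarily what the patch uses):

  #include <linux/fips.h>

  /* hedged sketch: refuse NTLMv2 authentication in FIPS mode, preserving
   * the behavior previously enforced by crypto_alloc_shash("hmac(md5)")
   * failing when fips=1 */
  if (fips_enabled)
          return -EOPNOTSUPP;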

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:43 -06:00
Eric Biggers
924067ef18 ksmbd: Use HMAC-SHA256 library for message signing and key generation
Convert ksmbd_sign_smb2_pdu() and generate_key() to use the HMAC-SHA256
library instead of a "hmac(sha256)" crypto_shash.  This is simpler and
faster.  With the library there's no need to allocate memory, no need to
handle errors, and the HMAC-SHA256 code is accessed directly without
inefficient indirect calls and other unnecessary API overhead.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:43 -06:00
Eric Biggers
e009cb1e30 ksmbd: Use SHA-512 library for SMB3.1.1 preauth hash
Convert ksmbd_gen_preauth_integrity_hash() to use the SHA-512 library
instead of a "sha512" crypto_shash.  This is simpler and faster.  With
the library there's no need to allocate memory, no need to handle
errors, and the SHA-512 code is accessed directly without inefficient
indirect calls and other unnecessary API overhead.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
2025-11-30 21:11:42 -06:00
Asbjørn Sloth Tønnesen
b8bcc17f58 wireguard: netlink: lower .maxattr for WG_CMD_GET_DEVICE
Previously .maxattr was shared for both WG_CMD_GET_DEVICE and
WG_CMD_SET_DEVICE. Now that it is split, we can lower it for
WG_CMD_GET_DEVICE to follow the documentation, which defines
.maxattr as WGDEVICE_A_IFNAME for WG_CMD_GET_DEVICE.

$ grep -hC5 'one but not both of:' include/uapi/linux/wireguard.h
 * WG_CMD_GET_DEVICE
 * -----------------
 *
 * May only be called via NLM_F_REQUEST | NLM_F_DUMP. The command
 * should contain one but not both of:
 *
 *    WGDEVICE_A_IFINDEX: NLA_U32
 *    WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1
 *
 * The kernel will then return several messages [...]

While other attributes weren't rejected previously, the consensus
is that nobody sends those attributes, so nothing should break.

Link: https://lore.kernel.org/r/aRyLoy2iqbkUipZW@zx2c4.com/
Suggested-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2025-12-01 03:25:09 +01:00
Asbjørn Sloth Tønnesen
73af07d7f2 wireguard: netlink: convert to split ops
This patch converts WireGuard from using the legacy struct genl_ops
to struct genl_split_ops, by applying the same transformation as
genl_cmd_full_to_split() would otherwise do at runtime.

WGDEVICE_A_MAX is swapped for WGDEVICE_A_PEERS: while they are
currently equivalent, .maxattr should be the maximum attribute
that a given command supports, and should not change along with WGDEVICE_A_MAX.

This is an incremental step towards adopting netlink policy code
generated by ynl-gen, ensuring that the code and the spec are aligned.

This is a trivial patch with no behavioural changes intended.
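
A hedged sketch of what a split-ops entry for the dump command might look
like (handler and policy names are taken from the existing WireGuard
netlink code; the exact flags are an assumption):

  static const struct genl_split_ops wg_genl_ops[] = {
          {
                  .cmd     = WG_CMD_GET_DEVICE,
                  .start   = wg_get_device_start,
                  .dumpit  = wg_get_device_dump,
                  .done    = wg_get_device_done,
                  .policy  = device_policy,
                  .maxattr = WGDEVICE_A_PEERS, /* not WGDEVICE_A_MAX */
                  .flags   = GENL_UNS_ADMIN_PERM | GENL_CMD_CAP_DUMP,
          },
          /* ... WG_CMD_SET_DEVICE with a .doit handler ... */
  };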

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2025-12-01 03:25:09 +01:00
Asbjørn Sloth Tønnesen
9755f9de8f wireguard: netlink: use WG_KEY_LEN in policies
When converting the netlink policies to YNL, the constants used
in the policy have to be visible to userspace.

As NOISE_*_KEY_LEN isn't visible to userspace, change the policy
to use WG_KEY_LEN, as also documented in the UAPI header:

$ grep WG_KEY_LEN include/uapi/linux/wireguard.h
 *    WGDEVICE_A_PRIVATE_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
 *    WGDEVICE_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
 *            WGPEER_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
 *            WGPEER_A_PRESHARED_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
 [...]

Add a couple of BUILD_BUG_ON() to ensure that they stay in sync.

No behavioural changes intended.
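
For illustration, a hedged sketch of the pattern described above (the Noise
constant names are assumptions based on the WireGuard sources):

  /* keep the UAPI constant and the internal Noise constants in sync */
  BUILD_BUG_ON(NOISE_PUBLIC_KEY_LEN != WG_KEY_LEN);
  BUILD_BUG_ON(NOISE_SYMMETRIC_KEY_LEN != WG_KEY_LEN);

  /* policy entry now expressed in terms of the UAPI constant */
  [WGDEVICE_A_PRIVATE_KEY] = NLA_POLICY_EXACT_LEN(WG_KEY_LEN),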

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2025-12-01 03:25:09 +01:00
Asbjørn Sloth Tønnesen
aea199fa15 wireguard: netlink: validate nested arrays in policy
Use NLA_POLICY_NESTED_ARRAY() to perform nested array validation
in the policy validation step.

The nested policy was already enforced through nla_parse_nested();
however, extack wasn't passed previously, so no detailed error messages
were produced.

With the nested attributes being validated directly in the policy, the
policy argument can be set to NULL in the calls to nla_parse_nested().
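
A hedged sketch of the two pieces described above (attribute and policy
names follow the WireGuard UAPI and netlink code; surrounding code is
simplified):

  /* validate each element of the nested array directly in the policy */
  [WGDEVICE_A_PEERS] = NLA_POLICY_NESTED_ARRAY(peer_policy),

  /* the per-call parse no longer needs its own policy, and now gets extack */
  ret = nla_parse_nested(attrs, WGPEER_A_MAX, peer_attr, NULL, info->extack);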

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2025-12-01 03:25:09 +01:00
Asbjørn Sloth Tønnesen
e0e1b6db2e wireguard: netlink: enable strict genetlink validation
WireGuard is a modern enough genetlink family that it doesn't need
resv_start_op. It already had policies in place when it was first
merged, and it has never used the reserved field or other things
toggled by resv_start_op.

wireguard-tools has always used zero-initialized memory and has never
touched the reserved field, nor have any other clients I have
checked. Closed-source clients are much more likely to use the
embeddable library from wireguard-tools than a DIY implementation
using uninitialized memory.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2025-12-01 03:25:08 +01:00
Gao Xiang
80d0c27a0a erofs: get rid of raw bi_end_io() usage
These BIOs are actually harmless in practice, as they are all pseudo
BIOs and do not use advanced features like chaining.  Using the BIO
interface is a more friendly and unified approach for both bdev-
and file-backed I/Os (compared to awkward bvec interfaces).

Let's use bio_endio() instead.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
2025-11-30 23:55:13 +08:00
Gao Xiang
30e13e41a0 erofs: enable error reporting for z_erofs_fixup_insize()
Enable propagation of detailed errors to callers.

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
2025-11-30 23:49:32 +08:00
Jens Axboe
c1536df942 Merge tag 'md-6.19-20251130' of gitolite.kernel.org:pub/scm/linux/kernel/git/mdraid/linux into for-6.19/block
Pull MD changes from Yu:

"- fix null-ptr-dereference regression for dm-raid0 (Yu Kuai)
 - fix IO hang for raid5 when array is broken with IO inflight (Yu Kuai)
 - remove legacy 1s delay to speed up system shutdown (Tarun Sahu)"

* tag 'md-6.19-20251130' of gitolite.kernel.org:pub/scm/linux/kernel/git/mdraid/linux:
  md: remove legacy 1s delay in md_notify_reboot
  md/raid5: fix IO hang when array is broken with IO inflight
  md: warn about updating super block failure
  md/raid0: fix NULL pointer dereference in create_strip_zones() for dm-raid
2025-11-30 06:31:01 -07:00
Jakub Kicinski
0177f0f078 Merge tag 'linux-can-next-for-6.19-20251129' of git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next
Marc Kleine-Budde says:

====================
pull-request: can-next 2025-11-29

The patch is by Oliver Hartkopp and fixes the compilation of the
CAN_RAW protocol if the CAN driver infrastructure is not enabled.

This problem was introduced in the current development cycle of
net-next.

* tag 'linux-can-next-for-6.19-20251129' of git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next:
  can: Kconfig: select CAN driver infrastructure by default
====================

Link: https://patch.msgid.link/20251129125036.467177-1-mkl@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-29 17:45:26 -08:00
Tarun Sahu
fdd0c6a649 md: remove legacy 1s delay in md_notify_reboot
During system shutdown, the md driver registered notifier function
(md_notify_reboot) currently imposes a hardcoded one-second delay.

This delay was introduced approximately 23 years ago and was likely
necessary for the hardware generation of that time. This patch is
proposed to make sure there are no known devices that still need this delay.

Link: https://lore.kernel.org/linux-raid/20251121191422.2758555-1-tarunsahu@google.com
Signed-off-by: Tarun Sahu <tarunsahu@google.com>
Reviewed-by: Yu Kuai<yukuai@fnnas.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-30 09:42:28 +08:00
Yu Kuai
a913d1f6a7 md/raid5: fix IO hang when array is broken with IO inflight
Following test can cause IO hang:

mdadm -CvR /dev/md0 -l10 -n4 /dev/sd[abcd] --assume-clean --chunk=64K --bitmap=none
sleep 5
echo 1 > /sys/block/sda/device/delete
echo 1 > /sys/block/sdb/device/delete
echo 1 > /sys/block/sdc/device/delete
echo 1 > /sys/block/sdd/device/delete

dd if=/dev/md0 of=/dev/null bs=8k count=1 iflag=direct

Root cause:

1) All disks are removed; however, all rdevs in the array are still in
sync, so IO will be issued normally.

2) IO fails on sda and setting badblocks fails; sda becomes faulty
and MD_SB_CHANGING_PENDING is set.

3) Error recovery tries to recover this IO from other disks; IO is
issued to sdb, sdc, and sdd.

4) IO fails on sdb and setting badblocks fails again; now the array is
broken and will become read-only.

5) IO fails on sdc and sdd; however, the stripe can't be handled anymore
because MD_SB_CHANGING_PENDING is set:

handle_stripe
 handle_stripe
 if (test_bit MD_SB_CHANGING_PENDING)
  set_bit STRIPE_HANDLE
  goto finish
  // skip handling failed stripe

release_stripe
 if (test_bit STRIPE_HANDLE)
  list_add_tail conf->hand_list

6) later raid5d can't handle failed stripe as well:

raid5d
 md_check_recovery
  md_update_sb
   if (!md_is_rdwr())
    // can't clear pending bit
    return
 if (test_bit MD_SB_CHANGING_PENDING)
  break;
  // can't handle failed stripe

Since MD_SB_CHANGING_PENDING can never be cleared for a read-only array,
fix this problem by skipping this check for read-only arrays.
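
A hedged sketch of the idea (flag and helper names follow the commit text
and the call trace above; the real condition in handle_stripe() may differ):

  /* only defer the stripe for a superblock update when the array is
   * read-write; for a read-only array the pending flag never clears */
  if (md_is_rdwr(conf->mddev) &&
      test_bit(MD_SB_CHANGING_PENDING, &conf->mddev->sb_flags)) {
          set_bit(STRIPE_HANDLE, &sh->state);
          goto finish;
  }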

Link: https://lore.kernel.org/linux-raid/20251117085557.770572-3-yukuai@fnnas.com
Fixes: d87f064f58 ("md: never update metadata when array is read-only.")
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
Reviewed-by: Li Nan <linan122@huawei.com>
2025-11-30 09:38:45 +08:00
Yu Kuai
8c9e376b9d md: warn about updating super block failure
Many personalities handle IO errors from their daemon thread (like raid1d,
raid10d, raid5d), and the sb must be clean before handling these
failed IOs. However, updating the sb can fail, for example when the array
is broken by IO failure, or when the user configures the array_state sysfs api.

This patch adds a warning if updating the sb fails, in case this is
related to an IO hang.

Link: https://lore.kernel.org/linux-raid/20251117085557.770572-2-yukuai@fnnas.com
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
Reviewed-by: Li Nan <linan122@huawei.com>
2025-11-30 09:38:22 +08:00
Yu Kuai
46f21952c4 md/raid0: fix NULL pointer dereference in create_strip_zones() for dm-raid
Commit 2107457e31 ("md/raid0: Move queue limit setup before r0conf
initialization") dereferences mddev->gendisk unconditionally, which is
NULL for dm-raid.

Fix this problem by reverting to the old code for dm-raid.

Link: https://lore.kernel.org/linux-raid/20251116021816.107648-1-yukuai@fnnas.com
Fixes: 2107457e31 ("md/raid0: Move queue limit setup before r0conf initialization")
Reported-and-tested-by: Changhui Zhong <czhong@redhat.com>
Closes: https://lore.kernel.org/all/CAGVVp+VqVnvGeneUoTbYvBv2cw6GwQRrR3B-iQ-_9rVfyumoKA@mail.gmail.com/
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Reviewed-by: Li Nan <linan122@huawei.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
2025-11-30 09:36:50 +08:00
Ritesh Oedayrajsingh Varma
ff34657aa7 bpf: optimize bpf_map_update_elem() for map-in-map types
Updating a BPF_MAP_TYPE_HASH_OF_MAPS or BPF_MAP_TYPE_ARRAY_OF_MAPS via
bpf_map_update_elem() is very expensive.

In one of our workloads, we're inserting ~1400 maps of type
BPF_MAP_TYPE_ARRAY into a BPF_MAP_TYPE_ARRAY_OF_MAPS. This takes ~21
seconds on a single thread, with an average of ~15ms per call:

Function Name:    map_update_elem
Number of calls:  1369
Total time:       21s 182ms 966µs
Maximum:          47ms 937µs
Average:          15ms 473µs
Minimum:          7µs

Profiling shows that nearly all of this time is going to synchronize_rcu(),
via maybe_wait_bpf_programs() in map_update_elem().

The call to synchronize_rcu() is done to ensure that after
bpf_map_update_elem() returns, no BPF programs are still looking at the old
value of the map, per commit 1ae80cf319 ("bpf: wait for running BPF
programs when updating map-in-map").

As discussed on the bpf mailing list, replace synchronize_rcu() with
synchronize_rcu_expedited(). This is 175x faster: it now takes an average
of 88 microseconds per call, for a total of 127 milliseconds in the same
benchmark:

Function Name:    map_update_elem
Number of calls:  1439
Total time:       127ms 626µs
Maximum:          445µs
Average:          88µs
Minimum:          10µs
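
A minimal sketch of the change in the syscall path, under the assumption
that the wait is still keyed off the map-in-map types (helper body
simplified):

  static void maybe_wait_bpf_programs(struct bpf_map *map)
  {
          /* ensure no BPF program still sees the old inner map after
           * bpf_map_update_elem() returns; the expedited variant avoids
           * multi-millisecond grace periods per update */
          if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
              map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
                  synchronize_rcu_expedited();   /* was synchronize_rcu() */
  }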

Link: https://lore.kernel.org/bpf/CAH6OuBR=w2kybK6u7aH_35B=Bo1PCukeMZefR=7V4Z2tJNK--Q@mail.gmail.com/

Signed-off-by: Ritesh Oedayrajsingh Varma <ritesh@superluminal.eu>
Link: https://lore.kernel.org/r/20251128000422.20462-1-ritesh@superluminal.eu
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:48:41 -08:00
Menglong Dong
c1af4465b9 bpf: make kprobe_multi_link_prog_run always_inline
Make kprobe_multi_link_prog_run() always inline to obtain better
performance. Before this patch, the bench performance is:

./bench trig-kprobe-multi
Setting up benchmark 'trig-kprobe-multi'...
Benchmark 'trig-kprobe-multi' started.
Iter   0 ( 95.485us): hits   62.462M/s ( 62.462M/prod), [...]
Iter   1 (-80.054us): hits   62.486M/s ( 62.486M/prod), [...]
Iter   2 ( 13.572us): hits   62.287M/s ( 62.287M/prod), [...]
Iter   3 ( 76.961us): hits   62.293M/s ( 62.293M/prod), [...]
Iter   4 (-77.698us): hits   62.394M/s ( 62.394M/prod), [...]
Iter   5 (-13.399us): hits   62.319M/s ( 62.319M/prod), [...]
Iter   6 ( 77.573us): hits   62.250M/s ( 62.250M/prod), [...]
Summary: hits   62.338 ± 0.083M/s ( 62.338M/prod)

And after this patch, the performance is:

Iter   0 (454.148us): hits   66.900M/s ( 66.900M/prod), [...]
Iter   1 (-435.540us): hits   68.925M/s ( 68.925M/prod), [...]
Iter   2 (  8.223us): hits   68.795M/s ( 68.795M/prod), [...]
Iter   3 (-12.347us): hits   68.880M/s ( 68.880M/prod), [...]
Iter   4 (  2.291us): hits   68.767M/s ( 68.767M/prod), [...]
Iter   5 ( -1.446us): hits   68.756M/s ( 68.756M/prod), [...]
Iter   6 ( 13.882us): hits   68.657M/s ( 68.657M/prod), [...]
Summary: hits   68.792 ± 0.087M/s ( 68.792M/prod)

As we can see, the performance of kprobe-multi increases from 62M/s to
68M/s.
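
A hedged sketch of the annotation (the parameter list is a placeholder;
only the __always_inline attribute is the point here):

  /* force inlining of the hot per-hit path to avoid call overhead */
  static __always_inline int
  kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
                             unsigned long entry_ip, void *ctx)
  {
          /* ... unchanged body ... */
  }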

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20251126085246.309942-1-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:47:10 -08:00
Alexei Starovoitov
85bdeeb48f Merge branch 'selftests-bpf-convert-test_tc_edt-sh-into-test_progs'
Alexis Lothoré says:

====================
selftests/bpf: convert test_tc_edt.sh into test_progs

Hello,
this is a (late) v2 to my first attempt to convert the test_tc_edt
script to test_progs. This new version is way simpler, thanks to
Martin's suggestion about properly using the existing network_helpers
rather than reinventing the wheel. It also fixes a small bug in the
measured effective rate.

The converted test roughly follows the original script logic, with two
veths in two namespaces, a TCP connection between a client and a server,
and the client pushing a specific amount of data. Time is recorded
before and after the transmission to compute the effective rate.

There are two knobs driving the robustness of the test in CI:
- the amount of pushed data (the higher, the more precise is the
  effective rate)
- the tolerated error margin

The original test was configured with a 20s duration and a 1% error
margin. The new test is configured with 1MB of data being pushed and a
2% error margin, to:
- make the duration tolerable in CI
- while keeping enough margin for rate measure fluctuations depending on
  the CI machines load

This has been run multiple times locally to ensure that those values are
sane, and once in CI before sending the series, but I suggest to let it
live a few days in CI to see how it really behaves.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>

Changes in v2:
- drop custom client/server management
- update bpf program now that server pushes data
- fix effective rate computation
- Link to v1: https://lore.kernel.org/r/20251031-tc_edt-v1-0-5d34a5823144@bootlin.com

---
Alexis Lothoré (eBPF Foundation) (4):
      selftests/bpf: rename test_tc_edt.bpf.c section to expose program type
      selftests/bpf: integrate test_tc_edt into test_progs
      selftests/bpf: remove test_tc_edt.sh
      selftests/bpf: do not hardcode target rate in test_tc_edt BPF program

 tools/testing/selftests/bpf/Makefile               |   2 -
 .../testing/selftests/bpf/prog_tests/test_tc_edt.c | 145 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/test_tc_edt.c    |  11 +-
 tools/testing/selftests/bpf/test_tc_edt.sh         | 100 --------------
 4 files changed, 151 insertions(+), 107 deletions(-)
---
base-commit: 233a075a1b27070af76d64541cf001340ecff917
change-id: 20251030-tc_edt-3ea8e8d3d14e

Best regards,
====================

Link: https://patch.msgid.link/20251128-tc_edt-v2-0-26db48373e73@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:43:47 -08:00
Alexis Lothoré (eBPF Foundation)
1d17bcce6a selftests/bpf: do not hardcode target rate in test_tc_edt BPF program
test_tc_edt currently defines the target rate in both the userspace and
BPF parts. This value could be defined once in the userspace part if we
make it able to configure the BPF program before starting the test.

Add a target_rate variable in the BPF part, and make the userspace part
set it to the desired rate before attaching the shaping program.
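
A hedged sketch of the idea (variable and skeleton field names are
assumptions; the actual patch may differ):

  /* BPF side: target rate, filled in by userspace before attach */
  __u64 target_rate;

  /* userspace side, before attaching the shaping program */
  skel->bss->target_rate = TARGET_RATE;   /* hypothetical constant */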

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20251128-tc_edt-v2-4-26db48373e73@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:37:41 -08:00
Alexis Lothoré (eBPF Foundation)
50ce5ea5f7 selftests/bpf: remove test_tc_edt.sh
Now that test_tc_edt has been integrated in test_progs, remove the
legacy shell script.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20251128-tc_edt-v2-3-26db48373e73@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:37:41 -08:00
Alexis Lothoré (eBPF Foundation)
b0f82e7ab6 selftests/bpf: integrate test_tc_edt into test_progs
test_tc_edt.sh uses a pair of veth and a BPF program attached to the TX
veth to shape the traffic to 5MBps. It then checks that the amount of
received bytes (at interface level), compared to the TX duration, indeed
matches 5Mbps.

Convert this test script to the test_progs framework:
- keep the double veth setup, isolated in two veths
- run a small tcp server, and connect client to server
- push a pre-configured amount of bytes, and measure how much time has
  been needed to push those
- ensure that this rate is in a 2% error margin around the target rate

This two percent value, while being tight, is hopefully large enough to
not make the test too flaky in CI, while also turning it into a small
example of BPF-based shaping.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20251128-tc_edt-v2-2-26db48373e73@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:37:41 -08:00
Alexis Lothoré (eBPF Foundation)
4b4833acc6 selftests/bpf: rename test_tc_edt.bpf.c section to expose program type
The test_tc_edt BPF program uses a custom section name, which works fine
when manually loading it with tc, but prevents it from being loaded with
libbpf.

Update the program section name to "tc" to be able to manipulate it with
a libbpf-based C test.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20251128-tc_edt-v2-1-26db48373e73@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:37:41 -08:00
Alexei Starovoitov
34235a3544 Merge branch 'limited-queueing-in-nmi-for-rqspinlock'
Kumar Kartikeya Dwivedi says:

====================
Limited queueing in NMI for rqspinlock

Ritesh reported that he was frequently seeing timeouts in cases which
should have been covered by the AA heuristics. This led to the discovery
of multiple gaps in the current code that could lead to timeouts when
AA heuristics could work to prevent them. More details and investigation
are available in the original threads. [0][1]

This set restores the ability for NMI waiters to queue in the slow path,
and reduces the cases where they would attempt to trylock. However, such
queueing must not happen when interrupting waiters which the NMI itself
depends upon for forward progress; in those cases the trylock fallback
remains, but with a single attempt to avoid aimless attempts to acquire
the lock.

It also closes a possible window in the lock fast path and the unlock
path where NMIs landing between cmpxchg and entry creation, or entry
deletion and unlock would miss the detection of an AA scenario and end
up timing out.

This virtually eliminates all the cases where existing heuristics can
prevent timeouts and quickly recover from a deadlock. More details are
available in the commit logs for each patch.

  [0]: https://lore.kernel.org/bpf/CAH6OuBTjG+N=+GGwcpOUbeDN563oz4iVcU3rbse68egp9wj9_A@mail.gmail.com
  [1]: https://lore.kernel.org/bpf/20251125203253.3287019-1-memxor@gmail.com
====================

Link: https://patch.msgid.link/20251128232802.1031906-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:35:36 -08:00
Kumar Kartikeya Dwivedi
3448375e71 selftests/bpf: Add success stats to rqspinlock stress test
Add stats to observe the success and failure rate of lock acquisition
attempts in various contexts.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251128232802.1031906-7-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:35:36 -08:00
Kumar Kartikeya Dwivedi
087849cca3 rqspinlock: Precede non-head waiter queueing with AA check
While previous commits sufficiently address the deadlocks, there are
still scenarios where queueing of waiters in NMIs can exacerbate the
possibility of timeouts.

Consider the case below:

CPU 0
<NMI>
res_spin_lock(A) -> becomes non-head waiter
</NMI>
lock owner in CS or pending waiter spinning

CPU 1
res_spin_lock(A) -> head waiter spinning on owner/pending bits

In such a scenario, the non-head waiter in NMI on CPU 0 will not poll
for deadlocks or timeout since it will simply queue behind previous
waiter (head on CPU 1), and also not enter the trylock fallback since
no rqspinlock queue waiter is active on CPU 0. In such a scenario, the
transaction initiated by the head waiter on CPU 1 will time out,
signalling the NMI and ending the cyclic dependency, but it will cost
250 ms.

Instead, the NMI on CPU 0 could simply check for the presence of an AA
deadlock and only proceed with queueing on success. Add such a check
right before any form of queueing is initiated.

The reason the AA deadlock check is not used in conjunction with
in_nmi() is that a similar case could occur due to a reentrant path
in the owner's critical section, and unconditionally checking for AA
before entering the queueing path avoids expensive timeouts. Non-NMI
reentrancy only happens at controlled points in the slow path (with
specific tracepoints which do not impede the forward progress of a
waiter loop), or in the owner CS, while NMIs can land anywhere.

While this check is only needed for non-head waiter queueing, checking
whether we are head or not is racy without xchg_tail, and after that
point, we are already queued, hence for simplicity we must invoke the
check unconditionally.

Note that a more contrived case could still be constructed by using two
locks, and interrupting the progress of the respective owners by
non-head waiters of the other lock, in an ABBA fashion, which would
still not be covered by the current set of checks and conditions. It
would still lead to a timeout though, and not a deadlock. An ABBA check
cannot happen optimistically before the queueing, since it can be racy,
and needs to be happen continuously during the waiting period, which
would then require an unlinking step for queued NMI/reentrant waiters.
This is beyond the scope of this patch.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251128232802.1031906-6-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:35:36 -08:00
Kumar Kartikeya Dwivedi
30dc2f7025 rqspinlock: Disable spinning for trylock fallback
The original trylock fallback was inherited from qspinlock, and then
reused for the reentrant NMIs while the slow path is active. However,
under contention, it is very unlikely for the trylock to succeed in
taking the lock. In addition, a trylock also has no fairness guarantees,
and thus is prone to starvation issues under extreme scenarios.

The original qspinlock had no choice in terms of returning an error to the
caller; if the node count was breached, it had to fall back to trylock
to attempt to take the lock. In case of rqspinlock, we do have the
option of returning to the user. Thus, simply attempt the trylock once,
and instead of spinning, return an error in case the lock cannot be
taken.

This ends up significantly reducing the time spent in the trylock
fallback, since we no longer wait for the timeout duration trying to
aimlessly acquire the lock when there's a high probability that, under
contention, it won't be available to us anyway.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251128232802.1031906-5-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:35:36 -08:00
Kumar Kartikeya Dwivedi
81d5a6a438 rqspinlock: Use trylock fallback when per-CPU rqnode is busy
In addition to deferring to the trylock fallback in NMIs, only do so
when an rqspinlock waiter is queued on the current CPU. This is detected
by noticing a non-zero node index. This allows NMI waiters to join the
waiter queue if it isn't interrupting an existing rqspinlock waiter, and
increase the chances of fairly obtaining the lock, performing deadlock
detection as the head, and not being starved while attempting the
trylock.

The trylock path in particular is unlikely to succeed under contention,
as it relies on the lock word becoming 0, which indicates no contention.
This means that the most likely result for NMIs attempting a trylock is
a timeout under contention if they don't hit an AA or ABBA case.

The core problem being addressed through the fixed commit was removing
the dependency edge between an NMI queue waiter and the queue waiter it
is interrupting. Whenever a circular dependency forms, and with no way
to break it (as non-head waiters don't poll for deadlocks or timeouts),
we would enter a deadlock. A trylock breaks such an edge either by
probing for deadlocks or by finally terminating the waiting loop using a
timeout.

By excluding queueing on CPUs where the node index is non-zero for NMIs,
this sort of dependency is broken. The CPU enters the trylock path for
those cases, and falls back to deadlock checks and timeouts. However, in
the other case, where it doesn't interrupt the CPU in the slow path while
it is queued on the lock, it can join the queue as a normal waiter and
avoid trylock-associated starvation and subsequent timeouts.

There are a few remaining cases here that matter: the NMI can still
preempt the owner in its critical section, and if it queues as a
non-head waiter, it can end up impeding the progress of the owner. While
this won't deadlock, since the head waiter will eventually signal the
NMI waiter to stop (due to a timeout), it can still lead to long
timeouts. These gaps will be addressed in subsequent commits.

Note that while the node count detection approach is less conservative
than simply deferring NMIs to trylock, it is going to return errors
where attempts to lock B in NMI happen while waiters for lock A are in a
lower context on the same CPU. However, this only occurs when the lower
context is queued in the slow path, and the NMI attempt can proceed
without failure in all other cases. To continue to prevent AA deadlocks
(or ABBA in a similar NMI interrupting lower context pattern), we'd need
a more fleshed out algorithm to unlink NMI waiters after they queue and
detect such cases. However, all that complexity isn't appealing yet to
reduce the failure rate in the small window inside the slow path.

It is important to note that reentrancy in the slow path can also happen
through trace_contention_{begin,end}, but in those cases, unlike an NMI,
the forward progress of the head waiter (or the predecessor in general)
is not being blocked.

Fixes: 0d80e7f951 ("rqspinlock: Choose trylock fallback for NMI waiters")
Reported-by: Ritesh Oedayrajsingh Varma <ritesh@superluminal.eu>
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251128232802.1031906-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:35:35 -08:00
Kumar Kartikeya Dwivedi
5860f5ce47 rqspinlock: Perform AA checks immediately
Currently, while we enter the check_timeout call immediately due to the
way the ts.spin is initialized, we still invoke the AA and ABBA checks
in the second invocation, and only initialize the timestamp in the first
one. Since each iteration is done with at least a 1 ms delay, this can
delay detection of AA deadlocks by up to a millisecond.

Rework check_timeout() to avoid this. First, call check_deadlock_AA()
while initializing the timestamps for the wait period. This also means
that we only do it once per waiting period, instead of every invocation.
Finally, drop check_deadlock() and call check_deadlock_ABBA() directly.

To save on unnecessary ktime_get_mono_fast_ns() in case of AA deadlock,
sample the time only if it returns 0.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251128232802.1031906-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:35:35 -08:00
Kumar Kartikeya Dwivedi
beb7021a60 rqspinlock: Enclose lock/unlock within lock entry acquisitions
Ritesh reported that timeouts occurred frequently for rqspinlock despite
reentrancy on the same lock on the same CPU in [0]. This patch closes
one of the races leading to this behavior, and reduces the frequency of
timeouts.

We currently have a tiny window between the fast-path cmpxchg and the
grabbing of the lock entry where an NMI could land, attempt the same
lock that was just acquired, and end up timing out. This is not ideal.
Instead, move the lock entry acquisition from the fast path to before
the cmpxchg, and remove the grabbing of the lock entry in the slow path,
assuming it was already taken by the fast path. The TAS fallback is
invoked directly without being preceded by the typical fast path,
therefore we must continue to grab the deadlock detection entry in that
case.

Case on lock leading to missed AA:

cmpxchg lock A
<NMI>
... rqspinlock acquisition of A
... timeout
</NMI>
grab_held_lock_entry(A)

There is a similar case when unlocking the lock. If the NMI lands
between the WRITE_ONCE and smp_store_release, it is possible that we end
up in a situation where the NMI fails to diagnose the AA condition,
leading to a timeout.

Case on unlock leading to missed AA:

WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
<NMI>
... rqspinlock acquisition of A
... timeout
</NMI>
smp_store_release(A->locked, 0)

The patch changes the order on unlock to smp_store_release() followed
by WRITE_ONCE() of NULL. This avoids the missed AA detection described
above, but may lead to a false positive if the NMI lands between these
two statements, which is acceptable (and preferred over a timeout).
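
A rough sketch of the reordered unlock, using the names that appear in the
commit text (the surrounding bookkeeping is omitted):

  /* release the lock first ... */
  smp_store_release(&lock->locked, 0);
  /* ... then drop the deadlock-detection entry; an NMI landing in
   * between now sees a stale entry (possible false-positive AA)
   * rather than missing the entry entirely (250 ms timeout) */
  WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);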

The original intention of the reverse order on unlock was to prevent the
following possible misdiagnosis of an ABBA scenario:

grab entry A
lock A
grab entry B
lock B
unlock B
   smp_store_release(B->locked, 0)
							grab entry B
							lock B
							grab entry A
							lock A
							! <detect ABBA>
   WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)

If the store release were after the WRITE_ONCE, the other CPU would
not observe B in the table of the CPU unlocking lock B.  However,
since the threads are obviously participating in an ABBA deadlock, it
is no longer appealing to use the order above, since it may lead to a
250 ms timeout due to missed AA detection.

  [0]: https://lore.kernel.org/bpf/CAH6OuBTjG+N=+GGwcpOUbeDN563oz4iVcU3rbse68egp9wj9_A@mail.gmail.com

Fixes: 0d80e7f951 ("rqspinlock: Choose trylock fallback for NMI waiters")
Reported-by: Ritesh Oedayrajsingh Varma <ritesh@superluminal.eu>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251128232802.1031906-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-29 09:35:35 -08:00
Oliver Hartkopp
cb2dc6d286 can: Kconfig: select CAN driver infrastructure by default
The CAN bus support enabled with CONFIG_CAN provides socket-based
access to CAN interfaces. With the introduction of the latest CAN protocol,
CAN XL, additional configuration and status information needs to be exposed
to the network layer beyond what standard Linux network drivers formerly
provided.

This requires the CAN driver infrastructure to be selected by default.
As the CAN network layer can only operate on CAN interfaces anyway, all
distributions and common default configs enable at least one CAN driver.

So selecting CONFIG_CAN_DEV when CONFIG_CAN is selected by the user has
no effect on established configurations but solves potential build issues
when CONFIG_CAN[_XXX]=y is set together with CONFIG_CAN_DEV=m.

Fixes: 1a620a7238 ("can: raw: instantly reject unsupported CAN frames")
Reported-by: Vincent Mailhol <mailhol@kernel.org>
Closes: https://lore.kernel.org/all/CAMZ6RqL_nGszwoLPXn1Li8op-ox4k3Hs6p=Hw6+w0W=DTtobPw@mail.gmail.com/
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202511280531.YnWW2Rxc-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202511280842.djCQ0N0O-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202511282325.uVQFRTkA-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202511291520.guIE1QHj-lkp@intel.com/
Suggested-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251129090500.17484-1-socketcan@hartkopp.net
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-29 13:37:12 +01:00
Thorsten Blum
ff736a2861 net: ipconfig: Replace strncpy with strscpy in ic_proto_name
strncpy() is deprecated [1] for NUL-terminated destination buffers
because it does not guarantee NUL termination. Replace it with strscpy()
to ensure the destination buffer is always NUL-terminated and to avoid
any additional NUL padding.

Although the identifier buffer has 252 usable bytes, strncpy() copied
only up to 251 bytes to the zero-initialized buffer, relying on the last
byte to act as an implicit NUL terminator. Switching to strscpy() avoids
this implicit behavior and does not use magic numbers.

The source string is also NUL-terminated and satisfies the
__must_be_cstr() requirement of strscpy().
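
A minimal sketch of the pattern (buffer name and size are illustrative,
not the actual ipconfig code):

  char proto[252];

  /* before: up to 251 bytes copied, NUL termination only implicit */
  strncpy(proto, name, sizeof(proto) - 1);

  /* after: always NUL-terminated, truncation reported via -E2BIG */
  if (strscpy(proto, name, sizeof(proto)) < 0)
          pr_warn("protocol name truncated\n");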

Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1]
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Link: https://patch.msgid.link/20251126220804.102160-2-thorsten.blum@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:19:16 -08:00
Breno Leitao
e5235eb6cf net: netpoll: initialize work queue before error checks
Prevent a kernel warning when netconsole setup fails on devices with
IFF_DISABLE_NETPOLL flag. The warning (at kernel/workqueue.c:4242 in
__flush_work) occurs because the cleanup path tries to cancel an
uninitialized work queue.

When __netpoll_setup() encounters a device with IFF_DISABLE_NETPOLL,
it fails early and calls skb_pool_flush() for cleanup. This function
calls cancel_work_sync(&np->refill_wq), but refill_wq hasn't been
initialized yet, triggering the warning.

Move INIT_WORK() to the beginning of __netpoll_setup(), ensuring the
work queue is properly initialized before any potential failure points.
This allows the cleanup path to safely cancel the work queue regardless
of where the setup fails.
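
A hedged sketch of the reordering (the work handler name is hypothetical;
the work member name follows the commit text, and the error value is
illustrative):

  int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
  {
          /* initialize before any failure path, so that the cleanup's
           * cancel_work_sync(&np->refill_wq) is always safe */
          INIT_WORK(&np->refill_wq, refill_skbs_work_handler);

          if (ndev->priv_flags & IFF_DISABLE_NETPOLL)
                  return -EOPNOTSUPP;
          /* ... rest of setup ... */
  }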

Fixes: 248f6571fd ("netpoll: Optimize skb refilling on critical path")
Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251127-netpoll_fix_init_work-v1-1-65c07806d736@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:16:57 -08:00
Hangbin Liu
2c28ee720a selftests: bonding: add delay before each xvlan_over_bond connectivity check
Jakub reported increased flakiness in bond_macvlan_ipvlan.sh on a regular
kernel, while the tests consistently pass on a debug kernel. This suggests
a timing-sensitive issue.

To mitigate this, introduce a short sleep before each xvlan_over_bond
connectivity check. The delay helps ensure the neighbor and route caches
have fully converged before verifying connectivity.

The sleep interval is kept minimal since check_connection() is invoked
nearly 100 times during the test.

Fixes: 246af950b9 ("selftests: bonding: add macvlan over bond testing")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/netdev/20251114082014.750edfad@kernel.org
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Link: https://patch.msgid.link/20251127143310.47740-1-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:16:22 -08:00
Jakub Kicinski
840a64710e Merge tag 'nf-next-25-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next
Pablo Neira Ayuso says:

====================
Netfilter updates for net-next

The following batch contains Netfilter updates for net-next:

0) Add sanity check for maximum encapsulations in bridge vlan,
   reported by the new AI robot.

1) Move the flowtable path discovery code to its own file, the
   nft_flow_offload.c mixes the nf_tables evaluation with the path
   discovery logic, just split this in two for clarity.

2) Consolidate flowtable xmit path by using dev_queue_xmit() and the
   real device behind the layer 2 vlan/pppoe device. This allows to
   inline encapsulation. After this update, hw_ifidx can be removed
   since both ifidx and hw_ifidx now point to the same device.

3) Support for IPIP encapsulation in the flowtable, extend selftest
   to cover for this new layer 3 offload, from Lorenzo Bianconi.

4) Push down the skb into the conncount API to fix duplicates in the
   conncount list for packets with non-confirmed conntrack entries,
   this is due to an optimization introduced in d265929930
   ("netfilter: nf_conncount: reduce unnecessary GC").
   From Fernando Fernandez Mancera.

5) In conncount, disable BH when performing garbage collection
   to consolidate existing behaviour in the conncount API, also
   from Fernando.

6) A matching packet with a confirmed conntrack invokes GC if
   conncount reaches the limit in an attempt to release slots.
   This allows the existing extensions to be used for real conntrack
   counting, not just limiting new connections, from Fernando.

7) Support for updating ct count objects in nf_tables, from Fernando.

8) Extend nft_flowtables.sh selftest to send IPv6 TCP traffic,
   from Lorenzo Bianconi.

9) Fixes for UAPI kernel-doc documentation, from Randy Dunlap.

* tag 'nf-next-25-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
  netfilter: nf_tables: improve UAPI kernel-doc comments
  netfilter: ip6t_srh: fix UAPI kernel-doc comments format
  selftests: netfilter: nft_flowtable.sh: Add the capability to send IPv6 TCP traffic
  netfilter: nft_connlimit: add support to object update operation
  netfilter: nft_connlimit: update the count if add was skipped
  netfilter: nf_conncount: make nf_conncount_gc_list() to disable BH
  netfilter: nf_conncount: rework API to use sk_buff directly
  selftests: netfilter: nft_flowtable.sh: Add IPIP flowtable selftest
  netfilter: flowtable: Add IPIP tx sw acceleration
  netfilter: flowtable: Add IPIP rx sw acceleration
  netfilter: flowtable: use tuple address to calculate next hop
  netfilter: flowtable: remove hw_ifidx
  netfilter: flowtable: inline pppoe encapsulation in xmit path
  netfilter: flowtable: inline vlan encapsulation in xmit path
  netfilter: flowtable: consolidate xmit path
  netfilter: flowtable: move path discovery infrastructure to its own file
  netfilter: flowtable: check for maximum number of encapsulations in bridge vlan
====================

Link: https://patch.msgid.link/20251128002345.29378-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:08:39 -08:00
Jakub Kicinski
8aa1053f9d Merge branch 'introduce-the-dsa_xmit_port_mask-tagging-protocol-helper'
Vladimir Oltean says:

====================
Introduce the dsa_xmit_port_mask() tagging protocol helper

What
----

Some DSA tags have just the port number in the TX header format; others
have a bit field where, in theory, multiple bits can be set, even though
DSA only sets one.

The latter kind is now making use of a dsa_xmit_port_mask() helper,
which will decide when to set more than 1 bit in that mask.

Why
---

David Yang has pointed out in a recently posted patch that HSR packet
duplication on transmission can be offloaded even on HSR-unaware
switches. This should be made generally available to all DSA switches.

How to test
-----------

These patches just lay the groundwork, and there should be no functional
change - so for this set, regression testing is all that's necessary.

For testing the HSR packet duplication idea, I've put together a branch:
https://github.com/vladimiroltean/linux/commits/dsa-simple-hsr-offload/
where most drivers are patched to call dsa_port_simple_hsr_join() and
dsa_port_simple_hsr_leave().

Assuming there are volunteers to also test the latter, one can enable
CONFIG_HSR and create a HSR device using:
$ ip link add name hsr0 type hsr slave1 swp0 slave2 swp1 supervision 45 version 1

This needs to be connected using 2 cables to another system where the
same command was run. Then, one should be able to ping the other board
through the hsr0 interface.

Without the Github branch, a ping over HSR should increase the DSA
conduit interface's TX counters by 2 packets. With the Github branch,
the TX counters should increase by only 1 packet.

Why so many patches
-------------------

To avoid the situation where a patch that has to be backported conflicts
with the work done here, pulls this in as a dependency, and thereby pulls
in 13 other unrelated drivers. These patches don't have any dependencies on
each other and can be cherry-picked at will (except they all depend on
patch 1/15).
====================

Link: https://patch.msgid.link/20251127120902.292555-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:44 -08:00
Vladimir Oltean
64b0d2edb6 net: dsa: tag_yt921x: use the dsa_xmit_port_mask() helper
The "yt921x" tagging protocol populates a bit mask for the TX ports,
so we can use dsa_xmit_port_mask() to centralize the decision of how to
set that field.

Cc: David Yang <mmyangfl@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-16-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:41 -08:00
Vladimir Oltean
24099389a6 net: dsa: tag_xrs700x: use the dsa_xmit_port_mask() helper
The "xrs700x" is the original DSA tagging protocol with HSR TX
replication support, we now essentially move that logic to the
dsa_xmit_port_mask() helper. The end result is something akin to
hellcreek_xmit() (but reminds me I should also take care of
skb_checksum_help() for tail taggers in the core).

The implementation differences to dsa_xmit_port_mask() are immaterial.

Cc: George McCollister <george.mccollister@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-15-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:41 -08:00
Vladimir Oltean
3c1975bbdf net: dsa: tag_trailer: use the dsa_xmit_port_mask() helper
The "trailer" tagging protocol populates a bit mask for the TX ports, so
we can use dsa_xmit_port_mask() to centralize the decision of how to set
that field.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-14-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:41 -08:00
Vladimir Oltean
b33aa90e68 net: dsa: tag_rzn1_a5psw: use the dsa_xmit_port_mask() helper
The "a5psw" tagging protocol populates a bit mask for the TX ports,
so we can use dsa_xmit_port_mask() to centralize the decision of how to
set that field.

Cc: "Clément Léger" <clement.leger@bootlin.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-13-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:41 -08:00
Vladimir Oltean
5afe4ccc33 net: dsa: tag_rtl8_4: use the dsa_xmit_port_mask() helper
The "rtl8_4" and "rtl8_4t" tagging protocols populate a bit mask for the
TX ports, so we can use dsa_xmit_port_mask() to centralize the decision
of how to set that field.

Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: "Alvin Šipraga" <alsi@bang-olufsen.dk>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-12-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:40 -08:00
Vladimir Oltean
4abf39c8ae net: dsa: tag_rtl4_a: use the dsa_xmit_port_mask() helper
The "rtl4a" tagging protocol populates a bit mask for the TX ports,
so we can use dsa_xmit_port_mask() to centralize the decision of how to
set that field.

Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: "Alvin Šipraga" <alsi@bang-olufsen.dk>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-11-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:40 -08:00
Vladimir Oltean
48afabaf4a net: dsa: tag_qca: use the dsa_xmit_port_mask() helper
The "qca" tagging protocol populates a bit mask for the TX ports, so we
can use dsa_xmit_port_mask() to centralize the decision of how to set
that field.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-10-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:40 -08:00
Vladimir Oltean
5733fe2a7a net: dsa: tag_ocelot: use the dsa_xmit_port_mask() helper
The "ocelot" and "seville" tagging protocols populate a bit mask for the
TX ports, so we can use dsa_xmit_port_mask() to centralize the decision
of how to set that field.

This protocol used BIT_ULL() rather than simple BIT() to silence Smatch,
as explained in commit 1f778d500d ("net: mscc: ocelot: avoid type
promotion when calling ocelot_ifh_set_dest"). I would expect that this
tool no longer complains now that the BIT(dp->index) is hidden inside
the dsa_xmit_port_mask() function, the return value of which is promoted
to u64.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-9-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:40 -08:00
Vladimir Oltean
a4a00d9e36 net: dsa: tag_mxl_gsw1xx: use the dsa_xmit_port_mask() helper
The "gsw1xx" tagging protocol populates a bit mask for the TX ports, so
we can use dsa_xmit_port_mask() to centralize the decision of how to set
that field.

Cc: Hauke Mehrtens <hauke@hauke-m.de>
Cc: Daniel Golle <daniel@makrotopia.org>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-8-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:40 -08:00
Vladimir Oltean
84a60bbec5 net: dsa: tag_mtk: use the dsa_xmit_port_mask() helper
The "mtk" tagging protocol populates a bit mask for the TX ports, so we
can use dsa_xmit_port_mask() to centralize the decision of how to set
that field.

Cc: "Chester A. Unal" <chester.a.unal@arinc9.com>
Cc: Daniel Golle <daniel@makrotopia.org>
Cc: DENG Qingfang <dqfext@gmail.com>
Cc: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-7-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:40 -08:00
Vladimir Oltean
ea659a9292 net: dsa: tag_ksz: use the dsa_xmit_port_mask() helper
The "ksz8795", "ksz9893", "ksz9477" and "lan937x" tagging protocols
populate a bit mask for the TX ports.

Unlike the others, "ksz9477" also accelerates HSR packet duplication.

Make the HSR duplication logic available generically to all 4 taggers by
using the dsa_xmit_port_mask() function to set the TX port mask.

Cc: Woojung Huh <woojung.huh@microchip.com>
Cc: UNGLinuxDriver@microchip.com
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-6-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:39 -08:00
Vladimir Oltean
f59e44cc0d net: dsa: tag_hellcreek: use the dsa_xmit_port_mask() helper
The "hellcreek" tagging protocol populates a bit mask for the TX ports,
so we can use dsa_xmit_port_mask() to centralize the decision of how to
set that field.

Cc: Kurt Kanzenbach <kurt@linutronix.de>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-5-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:39 -08:00
Vladimir Oltean
e094428fb4 net: dsa: tag_gswip: use the dsa_xmit_port_mask() helper
The "gswip" tagging protocol populates a bit mask for the TX ports, so
we can use dsa_xmit_port_mask() to centralize the decision of how to set
that field.

Cc: Hauke Mehrtens <hauke@hauke-m.de>
Cc: Daniel Golle <daniel@makrotopia.org>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-4-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:39 -08:00
Vladimir Oltean
621d06a40e net: dsa: tag_brcm: use the dsa_xmit_port_mask() helper
The "brcm" and "brcm-prepend" tagging protocols populate a bit mask for
the TX ports, so we can use dsa_xmit_port_mask() to centralize the
decision of how to set that field. The port mask is written u8 by u8,
first the high octet and then the low octet.

Cc: Florian Fainelli <florian.fainelli@broadcom.com>
Cc: Jonas Gorski <jonas.gorski@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-3-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:39 -08:00
Vladimir Oltean
6f2e1c75bc net: dsa: introduce the dsa_xmit_port_mask() tagging protocol helper
Many tagging protocols deal with the transmit port mask being a bit
mask, and set it to BIT(dp->index). Not a big deal.

Also, some tagging protocols are written for switches which support HSR
offload (including packet duplication offload), there we see a walk
using dsa_hsr_foreach_port() to find the other port in the same switch
that's member of the HSR, and set that bit in the port mask too.

That isn't sufficiently interesting either, until you come to realize
that there isn't anything special in the second case that switches in
just the first one can't do too.

It just becomes a matter of "is it wise to do it? are enough people
using HSR/PRP with generic off-the-shelf switches to justify adding an
extra test in the data path?" - the answer to which is probably "it
depends". Not having HSR offload at all isn't _much_ worse, not so much
as to make this impractical, esp. with a rich OS like Linux. But the HSR
users are rather specialized in industrial networking.

Anyway, the change acts on the premise that we're going to have support
for this, it should be uniformly implemented for everyone, and that if
we find some sort of balance, we can keep everyone relatively happy.

So I've disabled that logic if CONFIG_HSR isn't enabled, and I've tilted
the branch predictor to say it's unlikely we're transmitting through a
port with this capability currently active. On branch miss, we're still
going to save the transmission of one packet, so there's some remaining
benefit there too. I don't _think_ we need to jump to static keys yet.

The helper returns a 32-bit zero-based unsigned number that callers
have to transpose using FIELD_PREP(). It is not the first time we assume
DSA switches won't be larger than 32 ports - dsa_user_ports() has that
assumption baked into it too.

One last development note on why the "skb" argument is passed when it
isn't used. Looking at the compiled code on arm64, which is identical
both with and without it, the answer is "why not?" - who knows what
other features dependent on the skb may be handled in the future.
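
A plausible sketch of such a helper, based on the description above (the
exact body in the patch may differ):

  static inline u32 dsa_xmit_port_mask(struct dsa_port *dp,
                                       struct sk_buff *skb)
  {
          u32 mask = BIT(dp->index);

          /* HSR TX duplication offload: also transmit through the other
           * switch port that is a member of the same HSR device */
          if (IS_ENABLED(CONFIG_HSR) && unlikely(dp->hsr_dev)) {
                  struct dsa_port *other;

                  dsa_hsr_foreach_port(other, dp->ds, dp->hsr_dev)
                          mask |= BIT(other->index);
          }

          return mask;
  }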

Link: https://lore.kernel.org/netdev/20251126093240.2853294-4-mmyangfl@gmail.com/
Cc: "Alvin Šipraga" <alsi@bang-olufsen.dk>
Cc: "Chester A. Unal" <chester.a.unal@arinc9.com>
Cc: "Clément Léger" <clement.leger@bootlin.com>
Cc: Daniel Golle <daniel@makrotopia.org>
Cc: David Yang <mmyangfl@gmail.com>
Cc: DENG Qingfang <dqfext@gmail.com>
Cc: Florian Fainelli <florian.fainelli@broadcom.com>
Cc: George McCollister <george.mccollister@gmail.com>
Cc: Hauke Mehrtens <hauke@hauke-m.de>
Cc: Jonas Gorski <jonas.gorski@gmail.com>
Cc: Kurt Kanzenbach <kurt@linutronix.de>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Sean Wang <sean.wang@mediatek.com>
Cc: UNGLinuxDriver@microchip.com
Cc: Woojung Huh <woojung.huh@microchip.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251127120902.292555-2-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 20:03:39 -08:00
Jakub Kicinski
e48766e66b Merge branch 'net-broadcom-migrate-to-get_rx_ring_count-ethtool-callback'
Breno Leitao says:

====================
net: broadcom: migrate to .get_rx_ring_count() ethtool callback

This series migrates Broadcom ethernet drivers to use the new
.get_rx_ring_count() ethtool callback introduced in commit 84eaf4359c
("net: ethtool: add get_rx_ring_count callback to optimize RX ring
queries").

This change simplifies the .get_rxnfc() implementation by
extracting the ETHTOOL_GRXRINGS case handling into a dedicated callback,
making the code cleaner and aligning these drivers with the updated
ethtool API.

The series covers two Broadcom drivers: bnxt and bcmgenet. Each patch
removes the ETHTOOL_GRXRINGS case from the driver's .get_rxnfc() switch
statement and implements the new .get_rx_ring_count() callback that
returns the number of RX rings.
====================

Link: https://patch.msgid.link/20251127-grxrings_broadcom-v1-0-b0b182864950@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 19:56:27 -08:00
Breno Leitao
335d78c616 net: bcmgenet: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns bcmgenet with the
new ethtool API for querying RX ring parameters.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251127-grxrings_broadcom-v1-2-b0b182864950@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 19:56:22 -08:00
Breno Leitao
bba18f3ba7 net: bnxt: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns bnxt with the new
ethtool API for querying RX ring parameters.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251127-grxrings_broadcom-v1-1-b0b182864950@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 19:56:22 -08:00
Jakub Kicinski
2ce992a1a4 Merge branch 'tools-ynl-add-schema-checking'
Donald Hunter says:

====================
tools: ynl: add schema checking

Add schema checking and yaml linting for the YNL specs.

Patch 1 adds a schema_check make target using a pyynl --validate option
Patch 2 adds a lint make target using yamllint
Patches 3,4 fix issues reported by make -C tools/net/ynl lint schema_check
====================

Link: https://patch.msgid.link/20251127123502.89142-1-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 19:53:22 -08:00
Donald Hunter
1adc241f39 ynl: fix schema check errors
Fix two schema check errors that have lurked since the attribute name
validation was made more strict:

not ok 2 conntrack.yaml schema validation
'labels mask' does not match '^[0-9a-z-]+$'

not ok 13 nftables.yaml schema validation
'set id' does not match '^[0-9a-z-]+$'

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20251127123502.89142-5-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 19:53:20 -08:00
Donald Hunter
acce9d7200 ynl: fix a yamllint warning in ethtool spec
Fix warning reported by yamllint:

../../../Documentation/netlink/specs/ethtool.yaml
  1272:21   warning  truthy value should be one of [false, true]  (truthy)

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20251127123502.89142-4-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 19:53:19 -08:00
Donald Hunter
129dc6075a tools: ynl: add a lint makefile target
Add a lint target to run yamllint on the YNL specs.

make -C tools/net/ynl lint
make: Entering directory '/home/donaldh/net-next/tools/net/ynl'
yamllint ../../../Documentation/netlink/specs/*.yaml
../../../Documentation/netlink/specs/ethtool.yaml
  1272:21   warning  truthy value should be one of [false, true]  (truthy)

make: Leaving directory '/home/donaldh/net-next/tools/net/ynl'

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20251127123502.89142-3-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 19:53:19 -08:00
Donald Hunter
362d051c90 tools: ynl: add schema checking
Add a --validate flag to pyynl for explicit schema check with error
reporting and add a schema_check make target to check all YNL specs.

make -C tools/net/ynl schema_check
make: Entering directory '/home/donaldh/net-next/tools/net/ynl'
ok 1 binder.yaml schema validation
not ok 2 conntrack.yaml schema validation
'labels mask' does not match '^[0-9a-z-]+$'

Failed validating 'pattern' in schema['properties']['attribute-sets']['items']['properties']['attributes']['items']['properties']['name']:
    {'type': 'string', 'pattern': '^[0-9a-z-]+$'}

On instance['attribute-sets'][14]['attributes'][22]['name']:
    'labels mask'

ok 3 devlink.yaml schema validation
[...]

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20251127123502.89142-2-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 19:53:19 -08:00
Robert Marko
a6c121a243 net: phy: aquantia: check for NVMEM deferral
Currently, if the NVMEM provider is probed later than the Aquantia PHY,
loading the firmware will fail with -EINVAL.

To fix this, simply check for -EPROBE_DEFER when the NVMEM access is
attempted and return it.

Fixes: e93984ebc1 ("net: phy: aquantia: add firmware load support")
Signed-off-by: Robert Marko <robimarko@gmail.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/20251127114514.460924-1-robimarko@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 19:40:42 -08:00
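A minimal sketch of the pattern, assuming a hypothetical NVMEM cell name;
the aquantia firmware-load path differs in detail:

  #include <linux/nvmem-consumer.h>
  #include <linux/err.h>
  #include <linux/device.h>

  static int example_load_fw_from_nvmem(struct device *dev)
  {
          struct nvmem_cell *cell;

          cell = nvmem_cell_get(dev, "firmware");  /* hypothetical cell name */
          if (IS_ERR(cell)) {
                  /* Propagate probe deferral instead of folding it into a
                   * generic failure such as -EINVAL. */
                  if (PTR_ERR(cell) == -EPROBE_DEFER)
                          return -EPROBE_DEFER;
                  return PTR_ERR(cell);
          }

          /* ... read the cell and download the firmware ... */
          nvmem_cell_put(cell);
          return 0;
  }
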
Jan Kara
91ef18b567 ext4: mark inodes without acls in __ext4_iget()
Mark inodes without acls with cache_no_acl() in __ext4_iget() so that
path lookup can run in RCU mode from the start. This is interesting in
particular for the case where the file owner does the lookup because in
that case we otherwise end up constantly hitting the slow path: we drop
out from the fast path (because the ACL state is unknown) but never end
up calling check_acl() to cache the ACL state.

The problem was originally analyzed by Linus and the fix tested by Mateusz,
I'm just putting it into mergeable form :).

Link: https://lore.kernel.org/all/CAHk-=whSzc75TLLPWskV0xuaHR4tpWBr=LduqhcCFr4kCmme_w@mail.gmail.com
Reported-by: Mateusz Guzik <mjguzik@gmail.com>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Baokun Li <libaokun1@huawei.com>
Message-ID: <20251125101340.24276-2-jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:28 -05:00
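As a sketch of the mechanism only (the condition below is illustrative,
not the one the ext4 patch uses), marking an inode as having no ACLs lets
RCU-walk permission checks stay on the fast path:

  #include <linux/fs.h>
  #include <linux/posix_acl.h>

  static void example_mark_acl_state(struct inode *inode)
  {
          /* Illustrative condition: if ACLs cannot apply to this inode,
           * record that so check_acl() never needs to be consulted
           * during lookup. */
          if (!IS_POSIXACL(inode))
                  cache_no_acl(inode);
  }
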
Baokun Li
cab8cbcb92 ext4: enable block size larger than page size
Since the block device (see commit 3c20917120 ("block/bdev: enable large
folio support for large logical block sizes")) and the page cache (see
commit ab95d23bab ("filemap: allocate mapping_min_order folios in the page
cache")) can enforce a minimum order when allocating folios, and ext4 has
supported large folios since commit 7ac67301e8 ("ext4: enable large folio
for regular file"), now add support for block_size > PAGE_SIZE in ext4.

set_blocksize() -> bdev_validate_blocksize() already validates the block
size, so ext4_load_super() does not need to perform additional checks.
Here we only need to add the FS_LBS bit to fs_flags.

In addition, block sizes larger than the page size are currently supported
only when CONFIG_TRANSPARENT_HUGEPAGE is enabled. To make this explicit,
a blocksize_gt_pagesize entry has been added under /sys/fs/ext4/feature/,
indicating whether bs > ps is supported. This allows mke2fs to check the
interface and determine whether a warning should be issued when formatting
a filesystem with block size larger than the page size.

Suggested-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-25-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:28 -05:00
Baokun Li
709f0f1f1b ext4: add checks for large folio incompatibilities when BS > PS
Supporting a block size greater than the page size (BS > PS) requires
support for large folios. However, several features (e.g., encrypt)
do not yet support large folios.

To prevent conflicts, this patch adds checks at mount time to prohibit
these features from being used when BS > PS. Since these features cannot
be changed on remount, there is no need to check on remount.

This patch adds s_max_folio_order, initialized during mount according to
filesystem features and mount options. If s_max_folio_order is 0, large
folios are disabled.

With this in place, ext4_set_inode_mapping_order() can be simplified by
checking s_max_folio_order, avoiding redundant checks.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-24-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:28 -05:00
Baokun Li
1a3e9e8aa4 ext4: support verifying data from large folios with fs-verity
Eric Biggers already added support for verifying data from large folios
several years ago in commit 5d0f0e57ed ("fsverity: support verifying
data from large folios").

With ext4 now supporting large block sizes, the fs-verity tests
`kvm-xfstests -c ext4/64k -g verity -x encrypt` pass without issues.

Therefore, remove the restriction and allow large folios to be enabled
together with fs-verity.

Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-23-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:28 -05:00
Baokun Li
58fd191f99 ext4: make data=journal support large block size
Currently, ext4_set_inode_mapping_order() does not set max folio order
for files with the data journalling flag. For files that already have
large folios enabled, ext4_inode_journal_mode() ignores the data
journalling flag once max folio order is set.

This is not because data journalling cannot work with large folios, but
because credit estimates will go through the roof if there are too many
blocks per folio.

Since the real constraint is blocks-per-folio, to support data=journal
under LBS, we now set max folio order to be equal to min folio order for
files with the journalling flag. When LBS is disabled, the max folio order
remains unset as before.

Therefore, before ext4_change_inode_journal_flag() switches the journalling
mode, we call truncate_pagecache() to drop all page cache for that inode,
and filemap_write_and_wait() is called unconditionally.

After that, once the journalling mode has been switched, we can safely
reset the inode mapping order, and the mapping_large_folio_support() check
in ext4_inode_journal_mode() can be removed.

Suggested-by: Jan Kara <jack@suse.cz>
Suggested-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-22-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:28 -05:00
Zhihao Cheng
c00a6292d0 ext4: support large block size in __ext4_block_zero_page_range()
Use the EXT4_PG_TO_LBLK() macro to convert folio indexes to blocks to avoid
negative left shifts after supporting blocksize greater than PAGE_SIZE.

Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-21-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
Baokun Li
8e50e23b76 ext4: support large block size in mpage_prepare_extent_to_map()
Use the EXT4_PG_TO_LBLK/EXT4_LBLK_TO_PG macros to complete the conversion
between folio indexes and blocks to avoid negative left/right shifts after
supporting blocksize greater than PAGE_SIZE.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-20-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
Baokun Li
b967ab7487 ext4: support large block size in mpage_map_and_submit_buffers()
Use the EXT4_PG_TO_LBLK/EXT4_LBLK_TO_PG macros to complete the conversion
between folio indexes and blocks to avoid negative left/right shifts after
supporting blocksize greater than PAGE_SIZE.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-19-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
Baokun Li
bff6235d62 ext4: support large block size in ext4_block_write_begin()
Use the EXT4_PG_TO_LBLK() macro to convert folio indexes to blocks to avoid
negative left shifts after supporting blocksize greater than PAGE_SIZE.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-18-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
Baokun Li
a6d73242b8 ext4: support large block size in ext4_mpage_readpages()
Use the EXT4_PG_TO_LBLK() macro to convert folio indexes to blocks to avoid
negative left shifts after supporting blocksize greater than PAGE_SIZE.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-17-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
Zhihao Cheng
65c39954bb ext4: rename 'page' references to 'folio' in multi-block allocator
The ext4 multi-block allocator now fully supports folio objects. Update
all variable names, function names, and comments to replace legacy 'page'
terminology with 'folio', improving clarity and consistency.

No functional changes.

Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-16-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
Baokun Li
31daa8261c ext4: prepare buddy cache inode for BS > PS with large folios
We use EXT4_BAD_INO for the buddy cache inode number. This inode is not
accessed via __ext4_new_inode() or __ext4_iget(), meaning
ext4_set_inode_mapping_order() is not called to set its folio order range.

However, future block size greater than page size support requires this
inode to support large folios, and the buddy cache code already handles
BS > PS. Therefore, ext4_set_inode_mapping_order() is now explicitly
called for this specific inode to set its folio order range.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-15-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
Baokun Li
0ad55fa104 ext4: support large block size in ext4_mb_init_cache()
Currently, ext4_mb_init_cache() uses blocks_per_page to calculate the
folio index and offset. However, when blocksize is larger than PAGE_SIZE,
blocks_per_page becomes zero, leading to a potential division-by-zero bug.

Since we now have the folio, we know its exact size. This allows us to
convert {blocks, groups}_per_page to {blocks, groups}_per_folio, thus
supporting block sizes greater than page size.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-14-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
Baokun Li
3938fc29f8 ext4: support large block size in ext4_mb_get_buddy_page_lock()
Currently, ext4_mb_get_buddy_page_lock() uses blocks_per_page to calculate
folio index and offset. However, when blocksize is larger than PAGE_SIZE,
blocks_per_page becomes zero, leading to a potential division-by-zero bug.

To support BS > PS, use bytes to compute folio index and offset within
folio to get rid of blocks_per_page.

Also, since ext4_mb_get_buddy_page_lock() already fully supports folio,
rename it to ext4_mb_get_buddy_folio_lock().

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-13-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
Baokun Li
6117f1806a ext4: support large block size in ext4_mb_load_buddy_gfp()
Currently, ext4_mb_load_buddy_gfp() uses blocks_per_page to calculate the
folio index and offset. However, when blocksize is larger than PAGE_SIZE,
blocks_per_page becomes zero, leading to a potential division-by-zero bug.

To support BS > PS, use bytes to compute folio index and offset within
folio to get rid of blocks_per_page.

Also, if buddy and bitmap land in the same folio, we get that folio’s ref
instead of looking it up again before updating the buddy.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-12-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
Baokun Li
2a8de76b2b ext4: add EXT4_LBLK_TO_PG and EXT4_PG_TO_LBLK for block/page conversion
As BS > PS support is coming, all block number to page index (and
vice-versa) conversions must now go via bytes. Added EXT4_LBLK_TO_PG()
and EXT4_PG_TO_LBLK() macros to simplify these conversions and handle
both BS <= PS and BS > PS scenarios cleanly.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-11-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
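A hedged sketch of what byte-routed conversion macros of this kind can
look like; the in-tree EXT4_LBLK_TO_PG()/EXT4_PG_TO_LBLK() definitions may
differ in detail:

  #include <linux/fs.h>
  #include <linux/pagemap.h>

  /* Illustrative only: going via bytes works for both BS <= PS and
   * BS > PS, avoiding negative shift counts in either direction. */
  #define EXAMPLE_LBLK_TO_PG(inode, lblk) \
          ((pgoff_t)(((loff_t)(lblk) << (inode)->i_blkbits) >> PAGE_SHIFT))

  #define EXAMPLE_PG_TO_LBLK(inode, pg) \
          ((((loff_t)(pg)) << PAGE_SHIFT) >> (inode)->i_blkbits)
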
Baokun Li
125d1f6a5a ext4: add EXT4_LBLK_TO_B macro for logical block to bytes conversion
No functional changes.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-10-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:27 -05:00
Baokun Li
609c5e0081 ext4: support large block size in ext4_readdir()
In ext4_readdir(), page_cache_sync_readahead() is used to readahead mapped
physical blocks. With LBS support, this can lead to a negative right shift.

To fix this, the page index is now calculated by first converting the
physical block number (pblk) to a file position (pos) before converting
it to a page index. Also, the correct number of pages to readahead is now
passed.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-9-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:26 -05:00
Baokun Li
6a28b5c990 ext4: support large block size in ext4_calculate_overhead()
ext4_calculate_overhead() used a single page for its bitmap buffer, which
worked fine when PAGE_SIZE >= block size. However, with block size greater
than page size (BS > PS) support, the bitmap can exceed a single page.

To address this, we now use kvmalloc() to allocate memory of the filesystem
block size, to properly support BS > PS.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-8-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:26 -05:00
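A small sketch of the allocation change being described (error handling
trimmed, function name and GFP flags assumed):

  #include <linux/fs.h>
  #include <linux/slab.h>

  static int example_calculate_overhead(struct super_block *sb)
  {
          char *buf;

          /* One block's worth of bitmap, which may now exceed PAGE_SIZE. */
          buf = kvmalloc(sb->s_blocksize, GFP_NOFS | __GFP_ZERO);
          if (!buf)
                  return -ENOMEM;

          /* ... walk the block groups and accumulate overhead using buf ... */

          kvfree(buf);
          return 0;
  }
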
Baokun Li
8611e608a8 ext4: introduce s_min_folio_order for future BS > PS support
This commit introduces the s_min_folio_order field to the ext4_sb_info
structure. This field will store the minimum folio order required by the
current filesystem, laying groundwork for future support of block sizes
greater than PAGE_SIZE.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-7-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:26 -05:00
Baokun Li
58297412ed ext4: enable DIOREAD_NOLOCK by default for BS > PS as well
The dioread_nolock related code paths already support large folios, so
dioread_nolock is enabled by default regardless of whether the blocksize
is less than, equal to, or greater than PAGE_SIZE.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-6-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:26 -05:00
Baokun Li
d37a7ddd3a ext4: make ext4_punch_hole() support large block size
When preparing for bs > ps support, clean up unnecessary PAGE_SIZE
references in ext4_punch_hole().

Previously, when a hole extended beyond i_size, we aligned the hole end
upwards to PAGE_SIZE to handle partial folio invalidation. Now that
truncate_inode_pages_range() already handles partial folio invalidation
correctly, this alignment is no longer required.

However, to save pointless tail block zeroing, we still keep rounding up
to the block size here.

In addition, as Honza pointed out, when the hole end equals i_size, it
should also be rounded up to the block size. This patch fixes that as well.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-5-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:26 -05:00
Baokun Li
afa6d5a16b ext4: remove PAGE_SIZE checks for rec_len conversion
Previously, ext4_rec_len_(to|from)_disk only performed complex rec_len
conversions when PAGE_SIZE >= 65536 to reduce complexity.

However, we are soon to support file system block sizes greater than
page size, which makes these conditional checks unnecessary. Thus, these
checks are now removed.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-4-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:26 -05:00
Baokun Li
b73f45a324 ext4: remove page offset calculation in ext4_block_truncate_page()
For bs <= ps scenarios, calculating the offset within the block is
sufficient. For bs > ps, an initial page offset calculation can lead to
incorrect behavior. Thus this redundant calculation has been removed.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-3-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:26 -05:00
Zhihao Cheng
5835b1339e ext4: remove page offset calculation in ext4_block_zero_page_range()
For bs <= ps scenarios, calculating the offset within the block is
sufficient. For bs > ps, an initial page offset calculation can lead to
incorrect behavior. Thus this redundant calculation has been removed.

Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Message-ID: <20251121090654.631996-2-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-28 22:35:26 -05:00
Jakub Kicinski
2c80116b50 Merge tag 'wireless-next-2025-11-27' of https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next
Johannes Berg says:

====================
Apart from the usual small things just driver updates:
 - mt76:
   - WED support for >32-bit DMA
   - airoha NPU support
   - regdomain improvements
   - continued WiFi7/MLO work
 - rtw89
   - support USB devices RTL8852AU and RTL8852CU
   - initial work for RTL8922DE
   - improved injection support
 - rtl8xxxu: 40 MHz connection fixes/support
 - brcmfmac: Acer A1 840 tablet quirk

* tag 'wireless-next-2025-11-27' of https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next: (152 commits)
  wifi: mac80211: allow sharing identical chanctx for S1G interfaces
  wifi: nl80211: vendor-cmd: intel: fix a blank kernel-doc line warning
  wifi: cfg80211: include s1g_primary_2mhz when comparing chandefs
  wifi: cfg80211: include s1g_primary_2mhz when sending chandef
  wifi: ieee80211: correct FILS status codes
  mt76: mt7615: Fix memory leak in mt7615_mcu_wtbl_sta_add()
  wifi: mt76: mt792x: fix wifi init fail by setting MCU_RUNNING after CLC load
  wifi: mt76: Strip whitespace from build ddate
  wifi: mt76: mt7996: Add missing locking in mt7996_mac_sta_rc_work()
  wifi: mt76: mt7996: skip ieee80211_iter_keys() on scanning link remove
  wifi: mt76: mt7996: skip deflink accounting for offchannel links
  wifi: mt76: Move mt76_abort_scan out of mt76_reset_device()
  wifi: mt76: mt7996: move mt7996_update_beacons under mt76 mutex
  wifi: mt76: mt7996: grab mt76 mutex in mt7996_mac_sta_event()
  wifi: mt76: mt7925: ensure the 6GHz A-MPDU density cap from the hardware.
  wifi: mt76: mt7996: fix EMI rings for RRO
  wifi: mt76: mt7996: fix using wrong phy to start in mt7996_mac_restart()
  wifi: mt76: mt7996: fix MLO set key and group key issues
  wifi: mt76: mt7996: fix MLD group index assignment
  wifi: mt76: mt7996: use correct link_id when filling TXD and TXP
  ...
====================

Link: https://patch.msgid.link/20251127103806.17776-3-johannes@sipsolutions.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 19:34:21 -08:00
Heiko Carstens
c940be4c7c net: Remove KMSG_COMPONENT macro
The KMSG_COMPONENT macro is a leftover of the s390 specific "kernel message
catalog" from 2008 [1] which never made it upstream.

The macro was added to s390 code to allow for an out-of-tree patch which
used this to generate unique message ids. Also this out-of-tree patch
doesn't exist anymore.

The pattern of how the KMSG_COMPONENT macro is used can also be found at
some non s390 specific code, for whatever reasons. Besides adding an
indirection it is unused.

Remove the macro in order to get rid of a pointless indirection. Replace
all users with the string it defines. In all cases this leads to a simple
replacement like this:

 - #define KMSG_COMPONENT "af_iucv"
 - #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 + #define pr_fmt(fmt) "af_iucv: " fmt

[1] https://lwn.net/Articles/292650/

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Acked-by: Alexandra Winter <wintera@linux.ibm.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Sidraya Jayagond <sidraya@linux.ibm.com>
Link: https://patch.msgid.link/20251126140705.1944278-1-hca@linux.ibm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-28 19:20:27 -08:00
Hoyeon Lee
bd5bdd200c bpf: Remove runqslower tool
runqslower was added in commit 9c01546d26 "tools/bpf: Add runqslower
tool to tools/bpf" as a BCC port to showcase early BPF CO-RE + libbpf
workflows. runqslower continues to live in BCC (libbpf-tools), so there
is no need to keep building and maintaining it.

Drop tools/bpf/runqslower and remove all build hooks in tools/bpf and
selftests accordingly.

Signed-off-by: Hoyeon Lee <hoyeon.lee@suse.com>
Link: https://lore.kernel.org/r/20251126093821.373291-1-hoyeon.lee@suse.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-28 15:20:16 -08:00
Amery Hung
a3a60cc120 selftests/bpf: Remove usage of lsm/file_alloc_security in selftest
file_alloc_security hook is disabled. Use other LSM hooks in selftests
instead.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Link: https://lore.kernel.org/r/20251126202927.2584874-2-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-28 15:18:28 -08:00
Amery Hung
b4bf1d23dc bpf: Disable file_alloc_security hook
A use-after-free bug may be triggered by calling bpf_inode_storage_get()
in a BPF LSM program hooked to file_alloc_security. Disable the hook to
prevent this from happening.

The cause of the bug is shown in the trace below. In alloc_file(), a
file struct is first allocated through kmem_cache_alloc(). Then, the
file_alloc_security hook is invoked. Since the zero initialization or
assignment of f->f_inode happens after this LSM hook, a BPF program may
get a dangling inode pointer by walking the file struct.

  alloc_file()
  -> alloc_empty_file()
     -> f = kmem_cache_alloc()
     -> init_file()
        -> security_file_alloc() // f->f_inode not init-ed yet!
     -> f->f_inode = NULL;
  -> file_init_path()
     -> f->f_inode = path->dentry->d_inode

Reported-by: Kaiyan Mei <M202472210@hust.edu.cn>
Reported-by: Yinhao Hu <dddddd@hust.edu.cn>
Reported-by: Dongliang Mu <dzm91@hust.edu.cn>
Closes: https://lore.kernel.org/bpf/1d2d1968.47cd3.19ab9528e94.Coremail.kaiyanm@hust.edu.cn/
Signed-off-by: Amery Hung <ameryhung@gmail.com>
Link: https://lore.kernel.org/r/20251126202927.2584874-1-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-28 15:18:28 -08:00
Alexei Starovoitov
19f4091bf2 Merge branch 'a-pair-of-follow-ups-for-indirect-jumps'
Anton Protopopov says:

====================
A pair of follow ups for indirect jumps

Two fixes suggested by Alexei in [1]. Resending as a series,
as the second patch depends on the first.

  [1] https://lore.kernel.org/bpf/CAADnVQK3piReoo1ja=9hgz7aJ60Y_Jjur_JMOaYV8-Mn_VyE4A@mail.gmail.com/#R
====================

Link: https://patch.msgid.link/20251128063224.1305482-1-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-28 15:15:43 -08:00
Anton Protopopov
e3ea26add6 bpf: check for insn arrays in check_ptr_alignment
Do not abuse the strict_alignment_once flag, and check if the map is
an instruction array inside the check_ptr_alignment() function.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20251128063224.1305482-3-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-28 15:15:43 -08:00
Anton Protopopov
7feff23cdf bpf: force BPF_F_RDONLY_PROG on insn array creation
The original implementation added a hack to check_mem_access()
to prevent programs from writing into insn arrays. To get rid
of this hack, enforce BPF_F_RDONLY_PROG on map creation.

Also fix the corresponding selftest, as the error message changes
with this patch.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20251128063224.1305482-2-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-28 15:15:43 -08:00
Randy Dunlap
418de94e75 sbitmap: fix all kernel-doc warnings
Modify kernel-doc comments in sbitmap.h to prevent warnings:

Warning: include/linux/sbitmap.h:84 struct member 'alloc_hint' not
 described in 'sbitmap'
Warning: include/linux/sbitmap.h:151 struct member 'ws_active' not
 described in 'sbitmap_queue'
Warning: include/linux/sbitmap.h:552 No description found for
 return value of 'sbq_wait_ptr'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-28 09:21:18 -07:00
Ming Lei
28d7a371f0 ublk: add helper of __ublk_fetch()
Add helper __ublk_fetch() for refactoring ublk_fetch().

Meantime move ublk_config_io_buf() out of __ublk_fetch() to make
the code structure cleaner.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-28 09:20:13 -07:00
Ming Lei
3443bab2f8 ublk: pass const pointer to ublk_queue_is_zoned()
Pass const pointer to ublk_queue_is_zoned() because it is readonly.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-28 09:20:13 -07:00
Ming Lei
0a9beafa7c ublk: refactor auto buffer register in ublk_dispatch_req()
Refactor the auto buffer register code to prepare for supporting the batch
IO feature. The main motivation is to put the 'ublk_io' operation code
together, so that a per-io lock can be applied to the code block.

The key changes are:
- Rename ublk_auto_buf_reg() as ublk_do_auto_buf_reg()
- Introduce an enum `auto_buf_reg_res` to represent the result of
  the buffer registration attempt (FAIL, FALLBACK, OK).
- Split the existing `ublk_do_auto_buf_reg` function into two:
  - `__ublk_do_auto_buf_reg`: Performs the actual buffer registration
    and returns the `auto_buf_reg_res` status.
  - `ublk_do_auto_buf_reg`: A wrapper that calls the internal function
    and handles the I/O preparation based on the result.
- Introduce `ublk_prep_auto_buf_reg_io` to encapsulate the logic for
  preparing the I/O for completion after buffer registration.
- Pass the `tag` directly to `ublk_auto_buf_reg_fallback` to avoid
  recalculating it.

This refactoring makes the control flow clearer and isolates the different
stages of the auto buffer registration process.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-28 09:20:13 -07:00
Ming Lei
8d61ece156 ublk: add union ublk_io_buf with improved naming
Add `union ublk_io_buf` for naming the anonymous union of struct ublk_io's
addr and buf fields, meantime apply it to `struct ublk_io` for storing either
ublk auto buffer register data or ublk server io buffer address.

The union uses clear field names:
- `addr`: for regular ublk server io buffer addresses
- `auto_reg`: for ublk auto buffer registration data

This eliminates confusing access patterns and improves code readability.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-28 09:20:13 -07:00
Ming Lei
3035b9b46b ublk: add parameter struct io_uring_cmd * to ublk_prep_auto_buf_reg()
Add parameter `struct io_uring_cmd *` to ublk_prep_auto_buf_reg() and
prepare for reusing this helper for the coming UBLK_BATCH_IO feature,
which can fetch & commit one batch of io commands via single uring_cmd.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-28 09:20:13 -07:00
Ming Lei
9574b21e95 kfifo: add kfifo_alloc_node() helper for NUMA awareness
Add __kfifo_alloc_node() by refactoring and reusing __kfifo_alloc(),
and define kfifo_alloc_node() macro to support NUMA-aware memory
allocation.

The new __kfifo_alloc_node() function accepts a NUMA node parameter
and uses kmalloc_array_node() instead of kmalloc_array() for
node-specific allocation. The existing __kfifo_alloc() now calls
__kfifo_alloc_node() with NUMA_NO_NODE to maintain backward
compatibility.

This enables users to allocate kfifo buffers on specific NUMA nodes,
which is important for performance in NUMA systems where the kfifo
will be primarily accessed by threads running on specific nodes.

Cc: Stefani Seibold <stefani@seibold.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-28 09:20:13 -07:00
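A hedged sketch of the refactoring pattern (the power-of-two rounding of
'size' and other details of the real __kfifo_alloc() are elided, and the
node-variant's exact signature is assumed):

  #include <linux/kfifo.h>
  #include <linux/slab.h>
  #include <linux/numa.h>

  static int example_kfifo_alloc_node(struct __kfifo *fifo, unsigned int size,
                                      size_t esize, gfp_t gfp_mask, int node)
  {
          fifo->in = 0;
          fifo->out = 0;
          fifo->esize = esize;

          /* Node-aware buffer allocation is the only real difference. */
          fifo->data = kmalloc_array_node(esize, size, gfp_mask, node);
          if (!fifo->data) {
                  fifo->mask = 0;
                  return -ENOMEM;
          }
          fifo->mask = size - 1;
          return 0;
  }

  /* The non-node variant keeps its behaviour by passing NUMA_NO_NODE. */
  static int example_kfifo_alloc(struct __kfifo *fifo, unsigned int size,
                                 size_t esize, gfp_t gfp_mask)
  {
          return example_kfifo_alloc_node(fifo, size, esize, gfp_mask,
                                          NUMA_NO_NODE);
  }
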
Fengnan Chang
89e1fb7cef blk-mq: fix potential uaf for 'queue_hw_ctx'
This just applies Kuai's patch in [1] with minor changes.

blk_mq_realloc_hw_ctxs() will free the 'queue_hw_ctx' (e.g. update
submit_queues through configfs for null_blk), while it might still be
used from another context (e.g. switch elevator to none):

t1					t2
elevator_switch
 blk_mq_unquiesce_queue
  blk_mq_run_hw_queues
   queue_for_each_hw_ctx
    // assembly code for hctx = (q)->queue_hw_ctx[i]
    mov    0x48(%rbp),%rdx -> read old queue_hw_ctx

					__blk_mq_update_nr_hw_queues
					 blk_mq_realloc_hw_ctxs
					  hctxs = q->queue_hw_ctx
					  q->queue_hw_ctx = new_hctxs
					  kfree(hctxs)
    movslq %ebx,%rax
    mov    (%rdx,%rax,8),%rdi ->uaf

This problem was found by code review, and I confirmed that the concurrent
scenario does exist (specifically, 'q->queue_hw_ctx' can be changed during
blk_mq_run_hw_queues()); however, the uaf problem hasn't been reproduced yet
without hacking the kernel.

Since the queue is frozen in __blk_mq_update_nr_hw_queues(), fix the
problem by protecting 'queue_hw_ctx' through rcu where it can be accessed
without grabbing 'q_usage_counter'.

[1] https://lore.kernel.org/all/20220225072053.2472431-1-yukuai3@huawei.com/

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-28 09:09:19 -07:00
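Conceptually the fix is the standard RCU publish-then-free pattern; a
self-contained sketch (not the actual blk-mq code, with the __rcu sparse
annotation and locking details omitted for brevity) looks like:

  #include <linux/rcupdate.h>
  #include <linux/slab.h>
  #include <linux/mutex.h>

  struct example_hctx { int id; };

  struct example_queue {
          struct example_hctx **hw_ctx;   /* RCU-managed pointer array */
          struct mutex update_lock;
  };

  /* Reader: look up a hw context without holding the update lock. */
  static int example_hctx_id(struct example_queue *q, unsigned int i)
  {
          struct example_hctx *hctx;
          int id;

          rcu_read_lock();
          hctx = rcu_dereference(q->hw_ctx)[i];
          id = hctx->id;
          rcu_read_unlock();
          return id;
  }

  /* Updater: publish the new array, free the old one after a grace period. */
  static void example_swap_hctx_array(struct example_queue *q,
                                      struct example_hctx **new_array)
  {
          struct example_hctx **old;

          mutex_lock(&q->update_lock);
          old = rcu_dereference_protected(q->hw_ctx,
                                          lockdep_is_held(&q->update_lock));
          rcu_assign_pointer(q->hw_ctx, new_array);
          mutex_unlock(&q->update_lock);

          synchronize_rcu();
          kfree(old);
  }
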
Fengnan Chang
d0c98769ee blk-mq: use array manage hctx map instead of xarray
After commit 4e5cc99e1e ("blk-mq: manage hctx map via xarray"), we use
an xarray instead of an array to store hctx, but in poll mode, each time
in blk_mq_poll, we need to use xa_load to find the corresponding hctx,
which introduces some cost. In my test, xa_load may cost 3.8% cpu.

This patch reverts the previous change, eliminating the overhead of xa_load,
and can result in a 3% performance improvement.

Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-28 09:09:19 -07:00
Gao Xiang
3a991f784c erofs: enable error reporting for z_erofs_stream_switch_bufs()
Enable propagation of detailed errors to callers.

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
2025-11-28 22:00:08 +08:00
Gao Xiang
83564b06b2 erofs: improve Zstd, LZMA and DEFLATE error strings
Enable better, more detailed, and unique error reporting.

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
2025-11-28 22:00:08 +08:00
Gao Xiang
831faabed8 erofs: improve decompression error reporting
Change the return type of decompress() from `int` to `const char *` to
provide more informative error diagnostics:

 - A NULL return indicates successful decompression;

 - If IS_ERR(ptr) is true, the return value encodes a standard negative
   errno (e.g., -ENOMEM, -EOPNOTSUPP) identifying the specific error;

 - Otherwise, a non-NULL return points to a human-readable error string,
   and the corresponding error code should be treated as -EFSCORRUPTED.

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
2025-11-28 22:00:07 +08:00
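A sketch of how a caller can tell the three kinds of return value apart
(the surrounding names are approximate, not lifted from the erofs code):

  #include <linux/err.h>
  #include <linux/fs.h>
  #include <linux/printk.h>

  /* 'res' is the value returned by a decompressor's ->decompress() method. */
  static int example_handle_decompress_result(const char *res)
  {
          if (!res)
                  return 0;                /* success */
          if (IS_ERR(res))
                  return PTR_ERR(res);     /* standard negative errno */

          /* Otherwise 'res' is a human-readable reason string and the
           * data should be treated as corrupted. */
          pr_err("erofs: decompression failed: %s\n", res);
          return -EFSCORRUPTED;
  }
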
Gao Xiang
9ae77198d4 erofs: tidy up z_erofs_lz4_handle_overlap()
 - Add some useful comments to explain inplace I/Os and decompression;

 - Rearrange the code to get rid of one unnecessary goto.

Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
2025-11-28 21:59:51 +08:00
Jakub Kicinski
ed01d2069e Merge branch 'bnxt_en-updates-for-net-next'
Michael Chan says:

====================
bnxt_en: Updates for net-next (part)

This series includes an enhancement to the priority TX counters,
an enhancement to a PHY module error extack message, cleanup of
unneeded MSIX logic in bnxt_ulp.c, adding CQ dump during TX timeout,
LRO/HW_GRO performance improvement by enabling Relaxed Ordering,
and improved SRIOV admin link state support.
====================

Link: https://patch.msgid.link/20251126215648.1885936-1-michael.chan@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:59:34 -08:00
Rob Miller
72405298e9 bnxt_en: Add Virtual Admin Link State Support for VFs
The firmware can now cache the virtual link admin state (auto/on/off) of
all VFs and as such, the PF driver no longer has to intercept the VF
driver's port_phy_qcfg() call and then provide the link admin state.

If the FW does not have this capability, fall back to the existing
interception method.

The initial default link admin state (auto) is also set when the VFs
are created.

Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Reviewed-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
Signed-off-by: Rob Miller <rmiller@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Link: https://patch.msgid.link/20251126215648.1885936-7-michael.chan@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:59:29 -08:00
Michael Chan
30f253f8d9 bnxt_en: Do not set EOP on RX AGG BDs on 5760X chips
With End-of-Packet padding (EOP) set, the chip will disable Relaxed
Ordering (RO) of TPA data packets.  A TPA segment with EOP set will be
padded to the next cache boundary and can potentially overwrite the
beginning bytes of the next TPA segment when RO is enabled on 5760X.
To prevent that, the chip disables RO for TPA when EOP is set.

To take advantage of RO and higher performance, do not set EOP on
5760X chips when TPA is enabled.  Define a proper RX_BD_FLAGS_AGG_EOP
constant to make it clear that we are setting EOP.

Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Link: https://patch.msgid.link/20251126215648.1885936-6-michael.chan@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:59:28 -08:00
Michael Chan
b1e7f9566f bnxt_en: Add CQ ring dump to bnxt_dump_cp_sw_state()
On newer chips that use NQs and CQs, add the CQ ring dump to
bnxt_dump_cp_sw_state() to make it more complete.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Link: https://patch.msgid.link/20251126215648.1885936-5-michael.chan@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:59:28 -08:00
Kalesh AP
bba2a0577e bnxt_en: Remove the redundant BNXT_EN_FLAG_MSIX_REQUESTED flag
MSIX is always requested when the RoCE driver calls bnxt_register_dev().
We already check bnxt_ulp_registered(), so checking the flag is
redundant.  It was a left-over flag after converting to auxbus, so
remove it.

Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Link: https://patch.msgid.link/20251126215648.1885936-4-michael.chan@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:59:28 -08:00
Gautam R A
f3d88fe635 bnxt_en: Enhance log message in bnxt_get_module_status()
Return early with -EOPNOTSUPP and an extack message if the PHY type is
BaseT, since module status is not available for BaseT.

Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Gautam R A <gautam-r.a@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Link: https://patch.msgid.link/20251126215648.1885936-3-michael.chan@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:59:28 -08:00
Michael Chan
caa343e9a4 bnxt_en: Enhance TX pri counters
The priority packet and byte counters in ethtool -S are returned by
the driver based on the pri2cos mapping.  The assumption is that each
priority is mapped to one and only one hardware CoS queue.  In a
special RoCE configuration, the FW uses combined CoS queue 0 and CoS
queue 1 for the priority mapped to CoS queue 0.  In this special
case, we need to add the CoS queue 0 and CoS queue 1 counters for
the priority packet and byte counters.

Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Link: https://patch.msgid.link/20251126215648.1885936-2-michael.chan@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:59:28 -08:00
Jakub Kicinski
61dbc61a34 Merge branch 'intel-wired-lan-driver-updates-2025-11-25-ice-idpf-iavf-ixgbe-ixgbevf-e1000e'
Tony Nguyen says:

====================
Intel Wired LAN Driver Updates 2025-11-25 (ice, idpf, iavf, ixgbe, ixgbevf, e1000e)

Natalia cleans up ixgbevf_q_vector struct removing an unused field.

Emil converts vport state tracking from enum to bitmap and removes
unneeded states for idpf.

Tony removes an unneeded check from e1000e.

Alok Tiwari removes an unnecessary second call to
ixgbe_non_sfp_link_config() and adjusts the checked member, in idpf, to
reflect the member that is later used. He also fixes various typos and
messages for better clarity in misc Intel drivers.
====================

Link: https://patch.msgid.link/20251125223632.1857532-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:34:22 -08:00
Alok Tiwari
57bb13d7eb iavf: clarify VLAN add/delete log messages and lower log level
The current dev_warn messages for too many VLAN changes are confusing
and one place incorrectly references "add" instead of "delete" VLANs
due to copy-paste errors.

- Use dev_info instead of dev_warn to lower the log level.
- Rephrase the message to: "virtchnl: Too many VLAN [add|delete]
  ([v1|v2]) requests; splitting into multiple messages to PF\n".

Suggested-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://patch.msgid.link/20251125223632.1857532-12-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:34:21 -08:00
Alok Tiwari
1105a7a120 ice: fix comment typo and correct module format string
- Fix a typo in the ice_fdir_has_frag() kernel-doc comment ("is" -> "if")

- Correct the NVM erase error message format string from "0x02%x" to
  "0x%02x" so the module value is printed correctly.

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://patch.msgid.link/20251125223632.1857532-11-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:34:21 -08:00
Alok Tiwari
79bb84758f idpf: correct queue index in Rx allocation error messages
The error messages in idpf_rx_desc_alloc_all() used the group index i
when reporting memory allocation failures for individual Rx and Rx buffer
queues. This is incorrect.

Update the messages to use the correct queue index j and include the
queue group index i for clearer identification of the affected Rx and Rx
buffer queues.

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://patch.msgid.link/20251125223632.1857532-10-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:34:21 -08:00
Alok Tiwari
d89a5c27e4 idpf: use desc_ring when checking completion queue DMA allocation
idpf_compl_queue uses a union for comp, comp_4b, and desc_ring. The
release path should check complq->desc_ring to determine whether the DMA
descriptor ring is allocated. The current check against comp works but is
leftover from a previous commit and is misleading in this context.

Switching the check to desc_ring improves readability and more directly
reflects the intended meaning, since desc_ring is the field representing
the allocated DMA-backed descriptor ring.

No functional change.

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://patch.msgid.link/20251125223632.1857532-9-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:34:21 -08:00
Alok Tiwari
5849b56add ixgbe: avoid redundant call to ixgbe_non_sfp_link_config()
ixgbe_non_sfp_link_config() is called twice in ixgbe_open():
once to assign its return value to err and again in the
conditional check. This patch uses the stored err value
instead of calling the function a second time. This avoids
redundant work and ensures consistent error reporting.

Also fix a small typo in the ixgbe_remove() comment:
"The could be caused" -> "This could be caused".

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Jedrzej Jagielski <jedrzej.jagielski@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://patch.msgid.link/20251125223632.1857532-8-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:34:21 -08:00
Tony Nguyen
954ba97cca e1000e: Remove unneeded checks
The caller, ethtool_set_eeprom(), already performs the same checks so
these are unnecessary in the driver. This reverts commit
90fb7db49c ("e1000e: fix heap overflow in e1000_set_eeprom"), however,
corrections for RCT have been kept.

Link: https://lore.kernel.org/all/db92fcc8-114d-4e85-9d15-7860545bc65e@suse.de/
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://patch.msgid.link/20251125223632.1857532-7-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:34:21 -08:00
Emil Tantilov
8dd72ebc73 idpf: convert vport state to bitmap
Convert vport state to a bitmap and remove the DOWN state which is
redundant in the existing logic. There are no functional changes aside
from the use of bitwise operations when setting and checking the states.
Removed the double underscore to be consistent with the naming of other
bitmaps in the header and renamed current_state to vport_is_up to match
the meaning of the new variable.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Chittim Madhu <madhu.chittim@intel.com>
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Tested-by: Samuel Salin <Samuel.salin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://patch.msgid.link/20251125223632.1857532-6-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:34:21 -08:00
Natalia Wochtman
1645759a04 ixgbevf: ixgbevf_q_vector clean up
The flex array should be at the end of the structure and use the [] syntax.

Remove unused fields of ixgbevf_q_vector.
They aren't used since busy poll was moved to core code in commit
508aac6dee ("ixgbevf: get rid of custom busy polling code").

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Natalia Wochtman <natalia.wochtman@intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Rafal Romanowski <rafal.romanowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://patch.msgid.link/20251125223632.1857532-5-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:34:21 -08:00
Heiko Carstens
4636b4e797 dibs: Remove KMSG_COMPONENT macro
The KMSG_COMPONENT macro is a leftover of the s390 specific "kernel message
catalog" from 2008 [1] which never made it upstream.

The macro was added to s390 code to allow for an out-of-tree patch which
used this to generate unique message ids. Also this out-of-tree patch
doesn't exist anymore.

The pattern of how the KMSG_COMPONENT macro is used was also partially
adopted by non s390 specific code, for whatever reasons.

Remove the macro in order to get rid of a pointless indirection.

[1] https://lwn.net/Articles/292650/

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Acked-by: Alexandra Winter <wintera@linux.ibm.com>
Link: https://patch.msgid.link/20251126142242.2124317-1-hca@linux.ibm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:11:43 -08:00
Breno Leitao
73880e66b7 net: thunder: convert to use .get_rx_ring_count
Convert the Cavium Thunder NIC VF driver to use the new .get_rx_ring_count
ethtool operation instead of implementing .get_rxnfc solely for handling
ETHTOOL_GRXRINGS command. This simplifies the code by removing the
switch statement and replacing it with a direct return of the queue
count.

The new callback provides the same functionality in a more direct way,
following the ongoing ethtool API modernization.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251126-gxring_cavium-v1-1-a066c0c9e0c6@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:09:53 -08:00
Alexey Kodanev
8048168df5 net: stmmac: fix rx limit check in stmmac_rx_zc()
The extra "count >= limit" check in stmmac_rx_zc() is redundant and
has no effect because the value of "count" doesn't change after the
while condition at this point.

However, it can change after "read_again:" label:

        while (count < limit) {
            ...

            if (count >= limit)
                break;
    read_again:
            ...
            /* XSK pool expects RX frame 1:1 mapped to XSK buffer */
            if (likely(status & rx_not_ls)) {
                xsk_buff_free(buf->xdp);
                buf->xdp = NULL;
                dirty++;
                count++;
                goto read_again;
            }
            ...

This patch addresses the same issue previously resolved in stmmac_rx()
by commit fa02de9e75 ("net: stmmac: fix rx budget limit check").
The fix is the same: move the check after the label to ensure that it
bounds the goto loop.

Fixes: bba2556efa ("net: stmmac: Enable RX via AF_XDP zero-copy")
Signed-off-by: Alexey Kodanev <aleksei.kodanev@bell-sw.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/20251126104327.175590-1-aleksei.kodanev@bell-sw.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 18:07:30 -08:00
Jakub Kicinski
ebb2eaeb05 Merge branch 'net-dsa-yt921x-fix-parsing-mib-attributes'
David Yang says:

====================
net: dsa: yt921x: Fix parsing MIB attributes
====================

Link: https://patch.msgid.link/20251126084024.2843851-1-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:55:59 -08:00
David Yang
fbce7b36c8 net: dsa: yt921x: Use macros for MIB locations
Extract MIB constants into the header file to improve code style. This
patch will not change the behavior of the function.

Signed-off-by: David Yang <mmyangfl@gmail.com>
Link: https://patch.msgid.link/20251126084024.2843851-3-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:55:54 -08:00
David Yang
510026a398 net: dsa: yt921x: Fix parsing MIB attributes
There are hard-to-find unused fields in the MIB table that I didn't notice
in the example driver code, causing wrong interpretation of the MIB data.

For some 64-bit attributes, the current (wrong) implementation took the
correct lower 32 bits, but messed up the upper 32 bits, so it would work
accidentally until 32-bit overflows happen. Fix that too.

Fixes: 186623f4aa ("net: dsa: yt921x: Add support for Motorcomm YT921x")
Signed-off-by: David Yang <mmyangfl@gmail.com>
Link: https://patch.msgid.link/20251126084024.2843851-2-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:55:53 -08:00
Javen Xu
17e9f841dd r8169: add DASH support for RTL8127AP
This adds DASH support for chip RTL8127AP. Its mac version is
RTL_GIGA_MAC_VER_80. DASH is a standard for remote management of network
devices, allowing out-of-band control.

Signed-off-by: Javen Xu <javen_xu@realsil.com.cn>
Link: https://patch.msgid.link/20251126055950.2050-1-javen_xu@realsil.com.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:53:40 -08:00
Antoine Tenart
1f73a56f98 net: vxlan: prevent NULL deref in vxlan_xmit_one
Neither the sock4 nor the sock6 pointer is guaranteed to be non-NULL in
vxlan_xmit_one, e.g. if the iface is brought down. This can lead to the
following NULL dereference:

  BUG: kernel NULL pointer dereference, address: 0000000000000010
  Oops: Oops: 0000 [#1] SMP NOPTI
  RIP: 0010:vxlan_xmit_one+0xbb3/0x1580
  Call Trace:
   vxlan_xmit+0x429/0x610
   dev_hard_start_xmit+0x55/0xa0
   __dev_queue_xmit+0x6d0/0x7f0
   ip_finish_output2+0x24b/0x590
   ip_output+0x63/0x110

The commits mentioned below changed the code path in vxlan_xmit_one and, as
a side effect, the sock4/6 pointer validity checks in vxlan(6)_get_route
were lost. Fix this by adding the checks back.

Since both commits being fixed were released in the same version (v6.7)
and are strongly related, bundle the fixes in a single commit.
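
As a rough sketch only (the exact placement and error value in
vxlan(6)_get_route may differ), the restored check has this shape:

        /* Bail out before dereferencing the UDP tunnel socket, which may
         * be NULL e.g. when the interface is brought down. The error
         * value here is illustrative. */
        if (!sock4)
                return ERR_PTR(-EIO);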

Reported-by: Liang Li <liali@redhat.com>
Fixes: 6f19b2c136 ("vxlan: use generic function for tunnel IPv4 route lookup")
Fixes: 2aceb896ee ("vxlan: use generic function for tunnel IPv6 route lookup")
Cc: Beniamino Galvani <b.galvani@gmail.com>
Signed-off-by: Antoine Tenart <atenart@kernel.org>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Tested-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20251126102627.74223-1-atenart@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:48:57 -08:00
Michal Schmidt
1e43ebcd51 iavf: Implement settime64 with -EOPNOTSUPP
ptp_clock_settime() assumes every ptp_clock has implemented settime64().
Stub it with -EOPNOTSUPP to prevent a NULL dereference.

The fix is similar to commit 329d050bbe ("gve: Implement settime64
with -EOPNOTSUPP").

Fixes: d734223b2f ("iavf: add initial framework for registering PTP clock")
Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Tim Hostetler <thostet@google.com>
Link: https://patch.msgid.link/20251126094850.2842557-1-mschmidt@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:47:28 -08:00
Peter Enderborg
6557cae0a2 if_ether.h: Clarify ethertype validity for gsw1xx dsa
This 0x88C3 ethertype is registered to Infineon Technologies Corporate
Research ST and is used by MaxLinear.
Infineon made a spin-off called Lantiq.
Lantiq was acquired by Intel.
MaxLinear acquired Intel's Connected Home division.

The product FAQ from MaxLinear describes its history from the F24S.
The driver for the gsw1xx is based on the Lantiq one, showing their similarities.

Ref https://standards-oui.ieee.org/ethertype/eth.txt

Signed-off-by: Peter Enderborg <Peter.Enderborg@axis.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:46:54 -08:00
Gustavo A. R. Silva
eeecf5d3a3 net: wwan: mhi_wwan_mbim: Avoid -Wflex-array-member-not-at-end warning
Use DEFINE_RAW_FLEX() to avoid a -Wflex-array-member-not-at-end warning.

Remove fixed-size array struct usb_cdc_ncm_dpe16 dpe16[2]; from struct
mbim_tx_hdr, so that flex-array member struct mbim_tx_hdr::ndp16.dpe16[]
ends last in this structure.

Compensate for this by using the DEFINE_RAW_FLEX() helper to declare the
on-stack struct instance that contains struct usb_cdc_ncm_ndp16 as a
member. Adjust the rest of the code, accordingly.

So, with these changes, fix the following warning:

drivers/net/wwan/mhi_wwan_mbim.c:81:34: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
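
A schematic use of the helper (the variable name and element count are
illustrative, not the exact mhi_wwan_mbim code):

#include <linux/overflow.h>
#include <linux/usb/cdc.h>

/* Declare an on-stack instance of a struct whose last member is a
 * flexible array, with storage for two trailing dpe16 entries. */
DEFINE_RAW_FLEX(struct usb_cdc_ncm_ndp16, ndp16, dpe16, 2);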

Reviewed-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:45:18 -08:00
Max Yuan
858b1d07e4 gve: Fix race condition on tx->dropped_pkt update
The tx->dropped_pkt counter is a 64-bit integer that is incremented
directly. On 32-bit architectures, this operation is not atomic and
can lead to read/write tearing if a reader accesses the counter during
the update. This can result in incorrect values being reported for
dropped packets.

To prevent this potential data corruption, wrap the increment
operation with u64_stats_update_begin() and u64_stats_update_end().
This ensures that updates to the 64-bit counter are atomic, even on
32-bit systems, by using a sequence lock.

The u64_stats_sync API requires the writer to have exclusive access,
which is already provided in this context by the network stack's
serialization of the transmit path (net_device_ops::ndo_start_xmit
[1]) for a given queue.

[1]: https://www.kernel.org/doc/Documentation/networking/netdevices.txt
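
A minimal sketch of the pattern (the structure and field names are
illustrative; the actual gve layout may differ):

#include <linux/u64_stats_sync.h>

struct example_tx_stats {
        u64 dropped_pkt;
        struct u64_stats_sync syncp;
};

static void example_count_drop(struct example_tx_stats *stats)
{
        /* Writer side; readers pair this with u64_stats_fetch_begin()/
         * u64_stats_fetch_retry() so the 64-bit counter cannot be
         * observed torn on 32-bit CPUs. */
        u64_stats_update_begin(&stats->syncp);
        stats->dropped_pkt++;
        u64_stats_update_end(&stats->syncp);
}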

Signed-off-by: Max Yuan <maxyuan@google.com>
Reviewed-by: Jordan Rhee <jordanrhee@google.com>
Signed-off-by: Harshitha Ramamurthy <hramamurthy@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:44:52 -08:00
Jakub Kicinski
4c03592689 net: restore napi_consume_skb()'s NULL-handling
Commit e20dfbad8a ("net: fix napi_consume_skb() with alien skbs")
added a skb->cpu check to napi_consume_skb(), before the point where
napi_consume_skb() validated that skb is not NULL.

Add an explicit check to the early exit condition.
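
Schematically, the early-exit path now validates the pointer before any
field access (a sketch, not the exact core code):

        if (unlikely(!skb))
                return;
        /* only now is it safe to look at skb fields such as the
         * originating CPU */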

Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:44:22 -08:00
Jakub Kicinski
362a161b25 eth: bnxt: make use of napi_consume_skb()
As those following recent changes from Eric know very well, using the
NAPI skb cache is crucial to achieving good perf, at least on recent AMD
platforms. Make sure bnxt feeds the skb cache with Tx skbs.

Reviewed-by: Michael Chan <michael.chan@broadcom.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:44:01 -08:00
Byungchul Park
df59bb5b9a netmem, devmem, tcp: access pp fields through @desc in net_iov
Convert all the legacy code directly accessing the pp fields in net_iov
to access them through @desc in net_iov.

Signed-off-by: Byungchul Park <byungchul@sk.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 17:41:51 -08:00
Randy Dunlap
d3a439e55c netfilter: nf_tables: improve UAPI kernel-doc comments
In include/uapi/linux/netfilter/nf_tables.h,
correct the kernel-doc comments for mistyped enum names and enum values to
avoid these kernel-doc warnings and improve the documentation:

nf_tables.h:896: warning: Enum value 'NFT_EXTHDR_OP_TCPOPT' not described
 in enum 'nft_exthdr_op'
nf_tables.h:896: warning: Excess enum value 'NFT_EXTHDR_OP_TCP' description
 in 'nft_exthdr_op'

nf_tables.h:1210: warning: expecting prototype for enum
 nft_flow_attributes. Prototype was for enum nft_offload_attributes instead

nf_tables.h:1428: warning: expecting prototype for enum nft_reject_code.
 Prototype was for enum nft_reject_inet_code instead

(add beginning '@' to each enum value description:)
nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_FAMILY' not described
 in enum 'nft_tproxy_attributes'
nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_REG_ADDR' not described
 in enum 'nft_tproxy_attributes'
nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_REG_PORT' not described
 in enum 'nft_tproxy_attributes'

nf_tables.h:1796: warning: expecting prototype for enum
 nft_device_attributes. Prototype was for enum
 nft_devices_attributes instead

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:07:19 +00:00
Randy Dunlap
c4f0ab06e1 netfilter: ip6t_srh: fix UAPI kernel-doc comments format
Fix the kernel-doc format for struct members to be "@member" instead of
"@ member" to avoid kernel-doc warnings.

Warning: ip6t_srh.h:60 struct member 'next_hdr' not described in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'hdr_len' not described in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'segs_left' not described
 in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'last_entry' not described
 in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'tag' not described in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'mt_flags' not described in 'ip6t_srh'
Warning: ip6t_srh.h:60 struct member 'mt_invflags' not described
 in 'ip6t_srh'
Warning: ip6t_srh.h:93 struct member 'next_hdr' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'hdr_len' not described in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'segs_left' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'last_entry' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'tag' not described in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'psid_addr' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'nsid_addr' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'lsid_addr' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'psid_msk' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'nsid_msk' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'lsid_msk' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'mt_flags' not described
 in 'ip6t_srh1'
Warning: ip6t_srh.h:93 struct member 'mt_invflags' not described
 in 'ip6t_srh1'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:07:19 +00:00
Lorenzo Bianconi
c0bd21682a selftests: netfilter: nft_flowtable.sh: Add the capability to send IPv6 TCP traffic
Introduce the capability to send TCP traffic over IPv6 to
nft_flowtable netfilter selftest.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:07:19 +00:00
Fernando Fernandez Mancera
c4cbe4a4df netfilter: nft_connlimit: add support to object update operation
This is useful to update the limit or flags without clearing the tracked
connections. Use READ_ONCE() in the packet path as these values can be
modified from the control plane.
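
As an illustration of the pattern (the field name is hypothetical), the
packet path reads the updatable values through READ_ONCE() while the
control plane publishes updates, e.g. with WRITE_ONCE():

        /* packet path: tolerate a concurrent control-plane update */
        u32 limit = READ_ONCE(priv->limit);

        /* control plane (object update): publish the new value */
        WRITE_ONCE(priv->limit, new_limit);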

Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:06:43 +00:00
Fernando Fernandez Mancera
69894e5b4c netfilter: nft_connlimit: update the count if add was skipped
The connlimit expression can be used for all kinds of packets, not only
for packets with connection state new. See this ruleset as an example:

table ip filter {
        chain input {
                type filter hook input priority filter; policy accept;
                tcp dport 22 ct count over 4 counter
        }
}

Currently, if the connection count goes over the limit the counter will
count the packets. When a connection is closed, the connection count
won't decrement as it should, because it is only updated for new
connections due to an optimization in __nf_conncount_add() that prevents
updating the list if the connection is duplicated.

To solve this problem, check whether the connection was skipped and if
so, update the list. Adjust count_tree() too so the same fix is applied
for xt_connlimit.

Fixes: 976afca1ce ("netfilter: nf_conncount: Early exit in nf_conncount_lookup() and cleanup")
Closes: https://lore.kernel.org/netfilter/trinity-85c72a88-d762-46c3-be97-36f10e5d9796-1761173693813@3c-app-mailcom-bs12/
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:05:52 +00:00
Fernando Fernandez Mancera
c0362b5748 netfilter: nf_conncount: make nf_conncount_gc_list() to disable BH
For convenience when performing GC over the connection list, make
nf_conncount_gc_list() disable BH. This unifies the behavior with
nf_conncount_add() and nf_conncount_count().

Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:05:52 +00:00
Fernando Fernandez Mancera
be102eb6a0 netfilter: nf_conncount: rework API to use sk_buff directly
When using the nf_conncount infrastructure for non-confirmed connections,
a duplicated track is possible due to an optimization introduced in
commit d265929930 ("netfilter: nf_conncount: reduce unnecessary GC").

In order to fix this, introduce a new conncount API that receives an
sk_buff struct directly. It fetches the tuple, the zone and the
corresponding ct from it. It comes in both existing conncount variants,
nf_conncount_count_skb() and nf_conncount_add_skb(). In addition, remove
the old API and adjust all the users to use the new one.

This way, for each sk_buff struct it is possible to check if there is a
ct present and already confirmed. If so, skip the add operation.

Fixes: d265929930 ("netfilter: nf_conncount: reduce unnecessary GC")
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:05:49 +00:00
Lorenzo Bianconi
fe8313316e selftests: netfilter: nft_flowtable.sh: Add IPIP flowtable selftest
Introduce specific selftest for IPIP flowtable SW acceleration in
nft_flowtable.sh

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:00:51 +00:00
Lorenzo Bianconi
d30301ba4b netfilter: flowtable: Add IPIP tx sw acceleration
Introduce sw acceleration for the tx path of IPIP tunnels, relying on the
netfilter flowtable infrastructure.
This patch introduces basic infrastructure to accelerate other tunnel
types (e.g. IP6IP6).
IPIP sw tx acceleration can be tested by running the following scenario,
where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP
tunnel is used to access a remote site (using eth1 as the underlay device):

ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)

$ip addr show
6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet 192.168.0.2/24 scope global eth0
       valid_lft forever preferred_lft forever
7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet 192.168.1.1/24 scope global eth1
       valid_lft forever preferred_lft forever
8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
    link/ipip 192.168.1.1 peer 192.168.1.2
    inet 192.168.100.1/24 scope global tun0
       valid_lft forever preferred_lft forever

$ip route show
default via 192.168.100.2 dev tun0
192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2
192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1
192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1

$nft list ruleset
table inet filter {
        flowtable ft {
                hook ingress priority filter
                devices = { eth0, eth1 }
        }

        chain forward {
                type filter hook forward priority filter; policy accept;
                meta l4proto { tcp, udp } flow add @ft
        }
}

Reproducing the scenario described above using veths I got the following
results:
- TCP stream transmitted into the IPIP tunnel:
  - net-next (baseline):                ~ 85Gbps
  - net-next + IPIP flowtable support:  ~102Gbps

Co-developed-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:00:45 +00:00
Lorenzo Bianconi
ab427db178 netfilter: flowtable: Add IPIP rx sw acceleration
Introduce sw acceleration for the rx path of IPIP tunnels, relying on the
netfilter flowtable infrastructure. Subsequent patches will add sw
acceleration for the IPIP tunnel tx path.
This series introduces basic infrastructure to accelerate other tunnel
types (e.g. IP6IP6).
IPIP rx sw acceleration can be tested by running the following scenario,
where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP
tunnel is used to access a remote site (using eth1 as the underlay device):

ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)

$ip addr show
6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet 192.168.0.2/24 scope global eth0
       valid_lft forever preferred_lft forever
7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet 192.168.1.1/24 scope global eth1
       valid_lft forever preferred_lft forever
8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
    link/ipip 192.168.1.1 peer 192.168.1.2
    inet 192.168.100.1/24 scope global tun0
       valid_lft forever preferred_lft forever

$ip route show
default via 192.168.100.2 dev tun0
192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2
192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1
192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1

$nft list ruleset
table inet filter {
        flowtable ft {
                hook ingress priority filter
                devices = { eth0, eth1 }
        }

        chain forward {
                type filter hook forward priority filter; policy accept;
                meta l4proto { tcp, udp } flow add @ft
        }
}

Reproducing the scenario described above using veths I got the following
results:
- TCP stream received from the IPIP tunnel:
  - net-next (baseline):                ~ 71Gbps
  - net-next + IPIP flowtable support:  ~101Gbps

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:00:38 +00:00
Pablo Neira Ayuso
a0d98b641d netfilter: flowtable: use tuple address to calculate next hop
This simplifies IPIP tunnel support coming in follow-up patches.

No functional changes are intended.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:00:30 +00:00
Pablo Neira Ayuso
030feea309 netfilter: flowtable: remove hw_ifidx
hw_ifidx was originally introduced to store the real netdevice as a
requirement for the hardware offload support in:

 73f97025a9 ("netfilter: nft_flow_offload: use direct xmit if hardware offload is enabled")

Since ("netfilter: flowtable: consolidate xmit path"), both ifidx and
hw_ifidx point to the real device in the xmit path, so remove it.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:00:22 +00:00
Pablo Neira Ayuso
18d27bed08 netfilter: flowtable: inline pppoe encapsulation in xmit path
Push the pppoe header from the flowtable xmit path; inlining is faster
than the original xmit path because it can avoid some locking.

This is based on a patch originally written by wenxu.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:00:14 +00:00
Pablo Neira Ayuso
c653d5a78f netfilter: flowtable: inline vlan encapsulation in xmit path
Push the vlan header from the flowtable xmit path, instead of passing
the packet to the vlan device.

This is based on a patch originally written by wenxu.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-28 00:00:04 +00:00
Pablo Neira Ayuso
b5964aac51 netfilter: flowtable: consolidate xmit path
Use dev_queue_xmit() for the XMIT_NEIGH case. Store the interface index
of the real device behind the vlan/pppoe device; this introduces an
extra lookup for the real device in the xmit path because rt->dst.dev
provides the vlan/pppoe device.

XMIT_NEIGH now looks more similar to XMIT_DIRECT, but the check for stale
dst and the neighbour lookup remain in place, which is convenient for
dealing with network topology changes.

Note that nft_flow_route() needs to relax the check for _XMIT_NEIGH so
the existing basic xfrm offload (which only works in one direction) does
not break.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-27 23:59:56 +00:00
Pablo Neira Ayuso
93d7a7ed07 netfilter: flowtable: move path discovery infrastructure to its own file
This file contains the path discovery that is run from the forward chain
for the packet that offloads the flow into the flowtable. This consists
of a series of calls to dev_fill_forward_path() for each device stack.

More topologies may be supported in the future, so move this code to its
own file to separate it from the nftables flow_offload expression.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-27 23:59:43 +00:00
Pablo Neira Ayuso
634f3853cc netfilter: flowtable: check for maximum number of encapsulations in bridge vlan
Add a sanity check to skip path discovery if the maximum number of
encapsulations is reached. While at it, check for underflow too.

Fixes: 26267bf9bb ("netfilter: flowtable: bridge vlan hardware offload and switchdev")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2025-11-27 23:51:31 +00:00
Jakub Kicinski
db4029859d Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Conflicts:

net/xdp/xsk.c
  0ebc27a4c6 ("xsk: avoid data corruption on cq descriptor number")
  8da7bea7db ("xsk: add indirect call for xsk_destruct_skb")
  30ed05adca ("xsk: use a smaller new lock for shared pool case")
https://lore.kernel.org/20251127105450.4a1665ec@canb.auug.org.au
https://lore.kernel.org/eb4eee14-7e24-4d1b-b312-e9ea738fefee@kernel.org

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-27 12:19:08 -08:00
Darrick J. Wong
69ceb8a2d6 docs: remove obsolete links in the xfs online repair documentation
Online repair is now merged in upstream, no need to point to patchset
links anymore.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-27 16:09:14 +01:00
Paolo Abeni
73f784b2c9 Merge tag 'linux-can-next-for-6.19-20251126' of git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next
Marc Kleine-Budde says:

====================
pull-request: can-next 2025-11-26

this is a pull request of 27 patches for net-next/main.

The first 17 patches are by Vincent Mailhol and Oliver Hartkopp and
add CAN XL support to the CAN netlink interface.

Geert Uytterhoeven and Biju Das provide 7 patches for the rcar_canfd
driver to add suspend/resume support.

The next 2 patches are by Markus Schneider-Pargmann and add them as
the m_can maintainer.

Conor Dooley's patch updates the mpfs-can DT bindings.

linux-can-next-for-6.19-20251126

* tag 'linux-can-next-for-6.19-20251126' of git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next: (27 commits)
  dt-bindings: can: mpfs: document resets
  MAINTAINERS: Simplify m_can section
  MAINTAINERS: Add myself as m_can maintainer
  can: rcar_canfd: Add suspend/resume support
  can: rcar_canfd: Convert to DEFINE_SIMPLE_DEV_PM_OPS()
  can: rcar_canfd: Invert CAN clock and close_candev() order
  can: rcar_canfd: Extract rcar_canfd_global_{,de}init()
  can: rcar_canfd: Use devm_clk_get_optional() for RAM clk
  can: rcar_canfd: Invert global vs. channel teardown
  can: rcar_canfd: Invert reset assert order
  can: dev: print bitrate error with two decimal digits
  can: raw: instantly reject unsupported CAN frames
  can: add dummy_can driver
  can: calc_bittiming: add can_calc_sample_point_pwm()
  can: calc_bittiming: add can_calc_sample_point_nrz()
  can: calc_bittiming: replace misleading "nominal" by "reference"
  can: netlink: add PWM netlink interface
  can: calc_bittiming: add PWM calculation
  can: bittiming: add PWM validation
  can: bittiming: add PWM parameters
  ...
====================

Link: https://patch.msgid.link/20251126120106.154635-1-mkl@pengutronix.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 15:45:17 +01:00
Rohan G Thomas
45d100ee0d net: stmmac: dwmac: Disable flushing frames on Rx Buffer Unavailable
In Store and Forward mode, flushing frames when the receive buffer is
unavailable can cause the MTL Rx FIFO to go out of sync. This results
in a few frames being buffered in the FIFO without triggering the Rx DMA
to transfer the data to system memory until another packet
is received. Once the issue happens, for a ping request, the packet is
forwarded to system memory only after we receive another packet and
hence we observe a latency equivalent to the ping interval.

64 bytes from 192.168.2.100: seq=1 ttl=64 time=1000.344 ms

Also, we can observe a constant gmacgrp_debug register value of
0x00000120, which indicates "Reading frame data".

The issue is not reproducible after disabling frame flushing when the Rx
buffer is unavailable. But in that case, the Rx DMA enters a suspend
state due to buffer unavailability. To resume operation, software
must write to the receive_poll_demand register after adding new
descriptors, which reactivates the Rx DMA.

This issue is observed on socfpga platforms which have the dwmac1000 IP,
such as Arria 10, Cyclone V and Agilex 7. The issue is reproducible by
running an iperf3 server on the DUT with small UDP packet sizes.

Signed-off-by: Rohan G Thomas <rohan.g.thomas@altera.com>
Reviewed-by: Matthew Gerlach <matthew.gerlach@altera.com>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/20251126-a10_ext_fix-v1-1-d163507f646f@altera.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 13:05:24 +01:00
Sunday Adelodun
5c9c1e78de selftests: af_unix: remove unused stdlib.h include
The unix_connreset.c test included <stdlib.h>, but no symbol from that
header is used. This causes a fatal build error under certain
linux-next configurations where stdlib.h is not available.

Remove the unused include to fix the build.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/r/202511221800.hcgCKvVa-lkp@intel.com/
Signed-off-by: Sunday Adelodun <adelodunolaoluwa@yahoo.com>
Link: https://patch.msgid.link/20251125113648.25903-1-adelodunolaoluwa@yahoo.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 12:42:51 +01:00
Paolo Abeni
ed245fe9c1 Merge branch 'net-phy-add-support-for-fbnic-phy-w-25g-50g-and-100g-support'
Alexander Duyck says:

====================
net: phy: Add support for fbnic PHY w/ 25G, 50G, and 100G support

To transition the fbnic driver to using the XPCS driver, we need a
representation for the FW-managed PMD, which is actually a SerDes PHY,
so that we can handle link bouncing during link training.

This patch set introduces the necessary bits to the XPCS driver code to
enable it to read 25G, 50G, and 100G speeds from the PCS ctrl1 register,
and adds support for the appropriate interfaces.

The rest of this patch set enables the changes to fbnic to make use of
these interfaces and expose a PMD that can provide a necessary link delay
to avoid link flapping in the event that a cable is disconnected and
reconnected, and to correctly expose the count for the link down events.

With this the basic groundwork is laid, as all the bits and pieces are in
place in terms of reading the configuration. The general plan for
follow-on patch sets is to start enabling changes to the configuration in
environments where that is supported.
====================

Link: https://patch.msgid.link/176374310349.959489.838154632023183753.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 10:41:51 +01:00
Alexander Duyck
d0fe7104c7 fbnic: Replace use of internal PCS w/ Designware XPCS
As we have exposed the PCS registers via the SWMII, we can now start
looking at connecting the XPCS driver to those registers and letting it
manage the PCS instead of us doing it directly from the fbnic driver.

For now this just gets us the ability to detect link. The hope is in the
future to add some of the vendor specific registers to begin enabling XPCS
configuration of the interface.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374325295.959489.14521115864034905277.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 10:41:31 +01:00
Alexander Duyck
d0ce9fd7ea fbnic: Add SW shim for MDIO interface to PMD and PCS
In order for us to support a PCS device we need to add an MDIO bus to allow
the drivers to have access to the registers for the device.  This change
adds such an interface.

The interface will consist of two PHY addrs, the first one consisting of a
PMD and PCS, and the second just being a PCS. There is a need for two PHY
addrs due to the fact that in order to support the 50GBase-CR2 mode we will
need to access and configure the PCS vendor registers and RSFEC registers
from the second lane identically to the first.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374324532.959489.15389723111560978054.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 10:41:31 +01:00
Alexander Duyck
1fe7978329 fbnic: Add handler for reporting link down event statistics
We were previously not displaying the number of link_down_events tracked by
the device. With this change we should now be able to display the value.
The value itself tracks the calls from the phylink interface to the
mac_link_down call.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374323824.959489.6915296616773178954.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 10:41:31 +01:00
Alexander Duyck
9963117a2b fbnic: Add logic to track PMD state via MAC/PCS signals
One complication with the design of our part is that the PMD doesn't
provide a direct signal to the host. Instead we have visibility into the
signals that the PCS provides to the MAC, which allow us to infer the link
state.

We will need to account for several things in the PMD and firmware when
managing the link. Specifically when the link first starts to come up the
PMD will cause the link to flap. This is due to the firmware starting a
training cycle when the link is first detected. This will cause link
flapping if we were to immediately report link up when the PCS first
detects it.

To address that we are adding a pmd_state variable that is meant to be a
countdown of sorts indicating the state of the PMD. If the link is down or
has been reconfigured the PMD will start out in the initialize state. By
default the link is assumed to be in the SEND_DATA state if it is available
on initial link inspection. If link is detected while in the initialize
state the PMD state will switch to training, and if after 4 seconds the
link is still stable we will transition to link_ready, and finally the
send_data state.  With this we can avoid link flapping when a cable is
first connected to the NIC.
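
As a sketch of the progression described above (the names are illustrative,
not the actual fbnic definitions):

/* PMD bring-up tracked as a small state machine advanced by link polling. */
enum example_pmd_state {
        EXAMPLE_PMD_INITIALIZE, /* link down or just reconfigured */
        EXAMPLE_PMD_TRAINING,   /* link seen, firmware is training */
        EXAMPLE_PMD_LINK_READY, /* link stable for ~4 seconds */
        EXAMPLE_PMD_SEND_DATA,  /* link reported up to phylink */
};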

One side effect of this is that we need to pull the link state away from
the PCS. For now we use a union of the PCS link state register value and
the pmd_state. The plan is to add a PMD register to report the pmd_state
to the phylink interface. With that we can then look at switching over to
the use of the XPCS driver for fbnic instead of having an internal one.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374323107.959489.14951134213387615059.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 10:41:31 +01:00
Alexander Duyck
f18dd1b15f fbnic: Rename PCS IRQ to MAC IRQ as it is actually a MAC interrupt
Throughout several spots in the code I had called out the IRQ as being
related to the PCS. However, the actual IRQ is part of the MAC and it is
just exposing PCS data. To more accurately reflect the owner of the calls,
this change renames the functions and values that take in the interrupt
value and process it, to reflect that it is a MAC call and not a PCS one.

This change is mostly motivated by the fact that we will be moving the
handling of this interrupt from being PCS focused to being more PMA/PMD
focused as this will drive the phydev driver that I am adding instead of
driving the PCS directly.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374322373.959489.12018231545479053860.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 10:41:31 +01:00
Alexander Duyck
3f29dd34f7 net: pcs: xpcs: Add support for FBNIC 25G, 50G, 100G PMD
The fbnic driver is planning to make use of the XPCS driver to enable
support for PCS and better integration with phylink. To do this though we
will need to enable several workarounds, since the PMD interface for fbnic
is likely to be unique: it is a mix of two different vendor products
with a unique wrapper around the IP.

I have generated a PHY identifier based on IEEE 802.3-2022 22.2.4.3.1 using
an OUI belonging to Meta Platforms and used with our NICs. Using this we
will provide it as the PMD ID via the SW based MDIO interface so that
the fbnic device can be identified and necessary workarounds enabled in the
XPCS driver.

As an initial workaround this change adds an exception so that soft_reset
is not set when the driver is initially bound to the PCS.

In addition I have added logic to integrate the PMD Rx signal detect state
into the link state for the PCS. With this we can avoid the link coming up
too soon while the FBNIC PMD is still in the training state, and so avoid
link flaps.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374321695.959489.6648161125012056619.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 10:41:31 +01:00
Alexander Duyck
39e138173a net: pcs: xpcs: Fix PMA identifier handling in XPCS
The XPCS driver was mangling the PMA identifier, as the original code
appears to have been focused on just capturing the OUI. Rather than store a
mangled ID, it is better to work with the actual PMA ID and just mask out
the values that don't apply, rather than shifting and reordering them; you
still don't get the original OUI for the NIC without having to bitswap the
values as per the definition of the layout in IEEE 802.3-2022 22.2.4.3.1.

By laying it out as it is in the hardware, we are also less likely to have
an unintentional collision, as the enum values will occupy the revision
number area while the OUI occupies the upper 22 bits.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374320920.959489.17267159479370601070.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 10:41:31 +01:00
Alexander Duyck
7622d55276 net: pcs: xpcs: Add support for 25G, 50G, and 100G interfaces
With this change we are adding support for 25G, 50G, and 100G interface
types to the XPCS driver. This had supposedly been enabled with the
addition of XLGMII, but I don't see any capability for configuration there,
so I suspect it may need to be refactored in the future.

With this change we can enable the XPCS driver with the selected interface
and it should be able to detect link, speed, and report the link status to
the phylink interface.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374320248.959489.11649590675011158859.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 10:41:30 +01:00
Alexander Duyck
e6c43c9500 net: phy: Add MDIO_PMA_CTRL1_SPEED for 2.5G and 5G to reflect PMA values
The 2.5G and 5G values are not consistent between the PCS CTRL1 and PMA
CTRL1 registers. In order to avoid confusion between the two, I am updating
the names to include "PMA", similar to the naming used in similar places.

To avoid breaking UAPI I have retained the original macros and just defined
them as the new PMA based defines.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/176374319569.959489.6610469879021800710.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-27 10:41:30 +01:00
Raju Rangoju
f93505f357 amd-xgbe: let the MAC manage PHY PM
Use the MAC-managed PM flag to indicate that the MAC driver takes care of
suspending/resuming the PHY, and reset it when the device is brought up.

Signed-off-by: Raju Rangoju <Raju.Rangoju@amd.com>
Link: https://patch.msgid.link/20251123163721.442162-1-Raju.Rangoju@amd.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 18:53:24 -08:00
Jakub Kicinski
ce69978ae8 Merge branch 'net-hibmcge-add-support-for-tracepoint-and-pagepool-on-hibmcge-driver'
Jijie Shao says:

====================
net: hibmcge: Add support for tracepoint and pagepool on hibmcge driver

In this patch set:
1: add support for tracepoint for rx descriptor
2: double the rx queue depth to reduce packet drop
3: add support for pagepool on rx
====================

Link: https://patch.msgid.link/20251122034657.3373143-1-shaojijie@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 18:22:44 -08:00
Jijie Shao
c305959175 net: hibmcge: add support for pagepool on rx
add support for pagepool on rx, and remove the legacy path

Signed-off-by: Jijie Shao <shaojijie@huawei.com>
Link: https://patch.msgid.link/20251122034657.3373143-4-shaojijie@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 18:22:40 -08:00
Jijie Shao
2e68bb2e0f net: hibmcge: reduce packet drop under stress testing
Under stress test scenarios, the hibmcge driver may not receive packets
in a timely manner, which can lead to the hardware queue's buffer
being exhausted, resulting in packet drops.

This patch doubles the software queue depth and uses half of the buffer
to fill the hardware queue before receiving packets, thus preventing
packet loss caused by the hardware queue buffer being exhausted.

Signed-off-by: Jijie Shao <shaojijie@huawei.com>
Link: https://patch.msgid.link/20251122034657.3373143-3-shaojijie@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 18:22:40 -08:00
Tao Lan
91f3305b97 net: hibmcge: add support for tracepoint to dump some fields of rx_desc
add support for tracepoint to dump some fields of rx_desc

Signed-off-by: Tao Lan <lantao5@huawei.com>
Signed-off-by: Jijie Shao <shaojijie@huawei.com>
Link: https://patch.msgid.link/20251122034657.3373143-2-shaojijie@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 18:22:40 -08:00
Vladimir Oltean
37a96c2009 net: fman_memac: report structured ethtool counters
The FMan driver has support for 2 MACs: mEMAC (newer, present on
Layerscape and PowerPC T series) and dTSEC/TGEC (older, present on
PowerPC P series). I only have handy access to the mEMAC, and this adds
support for MAC counters for those platforms.

MAC counters are necessary for any kind of low-level debugging, and
currently there is no mechanism to dump them.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251122115931.151719-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 18:10:41 -08:00
Vladimir Oltean
7241d80e77 net: dpaa: fman_memac: complete phylink support with 2500base-x
The DPAA phylink conversion in the following commits partially developed
code for handling the 2500base-x host interface mode (called "2.5G
SGMII" in LS1043A/LS1046A reference manuals).

- 0fc83bd795 ("net: fman: memac: Add serdes support")
- 5d93cfcf73 ("net: dpaa: Convert to phylink")

In principle, having phy-interface-mode = "2500base-x" and a pcsphy-handle
(unnamed or with pcs-handle-names = "sgmii") in the MAC device tree node
results in PHY_INTERFACE_MODE_2500BASEX being set in phylink_config ::
supported_interfaces, but this isn't sufficient.

Because memac_select_pcs() returns no PCS for PHY_INTERFACE_MODE_2500BASEX,
the Lynx PCS code never engages. There's a chance the PCS driver doesn't
have any configuration to change for 2500base-x fixed-link (based on
bootloader pre-initialization), but there's an even higher chance that
this is not the case, and the PCS remains misconfigured.

More importantly, memac_if_mode() does not handle
PHY_INTERFACE_MODE_2500BASEX, and it should be telling the mEMAC to
configure itself in GMII mode (which is upclocked by the PCS). Currently
it prints a WARN_ON() and returns zero, aka IF_MODE_10G (incorrect).

The additional case statement in memac_prepare() for calling
phy_set_mode_ext() does not make any difference, because there is no
generic PHY driver for the Lynx 10G SerDes from LS1043A/LS1046A. But we
add it nonetheless, for consistency.

Regarding the question "did 2500base-x ever work with the FMan mEMAC
mainline code prior to the phylink conversion?" - the answer is more
nuanced.

For context, the previous phylib-based implementation was unable to
describe the fixed-link speed as 2500, because the software PHY
implementation is limited to 1G. However, improperly describing the link
as an sgmii fixed-link with speed = <1000> would have resulted in a
functional 2.5G speed, because there is no other difference than the
SerDes lane clock net frequency (3.125 GHz for 2500base-x) - all the
other higher-level settings are the same, and the SerDes lane frequency
is currently handled by the RCW.

But this hack cannot be extended towards a phylib PHY such as Aquantia
operating in OCSGMII, because the latter requires phy-mode = "2500base-x",
which the mEMAC driver did not support prior to the phylink conversion.
So I do not really consider this a regression, just completing support
for a missing feature.

The FMan mEMAC driver sets phylink's "default_an_inband" property to
true, making it as if the device tree node had the managed =
"in-band-status" property anyway. This default made sense for SGMII,
where it was added to avoid regressions, but for 2500base-x we learned
only recently how to enable in-band autoneg:
https://lore.kernel.org/netdev/20251122113433.141930-1-vladimir.oltean@nxp.com/

so the driver needs to opt out of this default in-band enabled
behaviour, and only enable in-band based on the device tree property.

Suggested-by: Russell King (Oracle) <linux@armlinux.org.uk>
Link: https://lore.kernel.org/netdev/aIyx0OLWGw5zKarX@shell.armlinux.org.uk/#t
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251122115523.150260-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 18:06:10 -08:00
Vladimir Oltean
002373a8b0 net: phy: dp83867: implement configurability for SGMII in-band auto-negotiation
Implement the inband_caps() and config_inband() PHY driver methods, to
allow working with PCS devices that do not support or want in-band to be
used.

There is a complication due to existing logic from commit c76acfb7e1
("net: phy: dp83867: retrigger SGMII AN when link change") which might
re-enable what dp83867_config_inband() has disabled. So we need to
modify dp83867_link_change_notify() to use phy_modify_changed() when
temporarily disabling in-band autoneg. If the return code is 0, it means
the original in-band was disabled and we need to keep it disabled.
If the return code is 1, the original was enabled and we need to
re-enable it. If negative, there was an error, which was silent before,
and remains silent now.

dp83867_config_inband() and dp83867_link_change_notify() are serialized
by the phydev->lock.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251122110427.133035-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 18:04:17 -08:00
Hangbin Liu
308b7dee3e tools: ynl: add YNL test framework
Add a test framework for YAML Netlink (YNL) tools, covering both CLI and
ethtool functionality. The framework includes:

1) cli: family listing, netdev, ethtool, rt-* families, and nlctrl
   operations
2) ethtool: device info, statistics, ring/coalesce/pause parameters, and
   feature queries

The current YNL syntax is a bit obscure, and end users may not always know
how to use it. This test framework provides usage examples and also serves
as a regression test to catch potential breakages caused by future changes.

Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Acked-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Link: https://patch.msgid.link/20251124022055.33389-1-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:58:49 -08:00
Hangbin Liu
651765e8d5 netlink: specs: add big-endian byte-order for u32 IPv4 addresses
The fix commit converted several IPv4 address attributes from binary
to u32, but forgot to specify byte-order: big-endian. Without this,
YNL tools display IPv4 addresses incorrectly due to host-endian
interpretation.

Add the missing byte-order: big-endian to all affected u32 IPv4
address fields to ensure correct parsing and display.

Fixes: 1064d521d1 ("netlink: specs: support ipv4-or-v6 for dual-stack fields")
Reported-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Link: https://patch.msgid.link/20251125112048.37631-1-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:14:17 -08:00
Jakub Kicinski
a8080c2f0e Merge branch 'net-intel-migrate-to-get_rx_ring_count-ethtool-callback'
Breno Leitao says:

====================
net: intel: migrate to .get_rx_ring_count() ethtool callback

This series migrates Intel network drivers to use the new .get_rx_ring_count()
ethtool callback introduced in commit 84eaf4359c ("net: ethtool: add
get_rx_ring_count callback to optimize RX ring queries").

The new callback simplifies the .get_rxnfc() implementation by removing
ETHTOOL_GRXRINGS handling and moving it to a dedicated callback. This provides
a cleaner separation of concerns and aligns these drivers with the modern
ethtool API.

The series updates the following Intel drivers:
  - idpf
  - igb
  - igc
  - ixgbevf
  - fm10k
====================

Link: https://patch.msgid.link/20251125-gxring_intel-v2-0-f55cd022d28b@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:09:13 -08:00
Breno Leitao
73d834cd17 fm10k: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns fm10k with the new
ethtool API for querying RX ring parameters.

Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Link: https://patch.msgid.link/20251125-gxring_intel-v2-8-f55cd022d28b@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:09:10 -08:00
Breno Leitao
3399fd519d ixgbevf: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns ixgbevf with the new
ethtool API for querying RX ring parameters.

Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Link: https://patch.msgid.link/20251125-gxring_intel-v2-7-f55cd022d28b@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:09:10 -08:00
Breno Leitao
768ce58ddd igc: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns igc with the new
ethtool API for querying RX ring parameters.

Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Link: https://patch.msgid.link/20251125-gxring_intel-v2-6-f55cd022d28b@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:09:10 -08:00
Breno Leitao
d6c744f468 igb: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns igb with the new
ethtool API for querying RX ring parameters.

Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Link: https://patch.msgid.link/20251125-gxring_intel-v2-5-f55cd022d28b@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:09:10 -08:00
Breno Leitao
873a1942fb idpf: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns idpf with the new
ethtool API for querying RX ring parameters.

I was not totally convinced I needed to have the lock, but I decided to
be on the safe side and keep the exact same behaviour as before.

Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Link: https://patch.msgid.link/20251125-gxring_intel-v2-4-f55cd022d28b@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:09:09 -08:00
Breno Leitao
8e8c00e1d2 ice: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns ice with the new
ethtool API for querying RX ring parameters.

Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Link: https://patch.msgid.link/20251125-gxring_intel-v2-3-f55cd022d28b@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:09:09 -08:00
Breno Leitao
fe0a3d7d1d iavf: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns iavf with the new
ethtool API for querying RX ring parameters.

Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Link: https://patch.msgid.link/20251125-gxring_intel-v2-2-f55cd022d28b@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:09:09 -08:00
Breno Leitao
a8acbcbaf6 i40e: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns i40e with the new
ethtool API for querying RX ring parameters.

Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Link: https://patch.msgid.link/20251125-gxring_intel-v2-1-f55cd022d28b@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:09:09 -08:00
Jakub Kicinski
4585847fdd Merge branch 'unify-platform-suspend-resume-routines-for-pci-dwmac-glue'
Yao Zi says:

====================
Unify platform suspend/resume routines for PCI DWMAC glue

There are currently three PCI-based DWMAC glue drivers in tree:
stmmac_pci.c, dwmac-intel.c, and dwmac-loongson.c. Both stmmac_pci.c and
dwmac-intel.c implement the same, duplicated platform suspend/resume
routines.

This series introduces a new PCI helper library, stmmac_libpci.c,
providing a pair of helpers, stmmac_pci_plat_{suspend,resume}, and
replaces the driver-specific implementations with the helpers to reduce
code duplication. The helpers will also simplify the Motorcomm DWMAC glue
driver which I'm working on.

The glue driver for Intel controllers isn't covered by the series, since
its suspend routine doesn't call pci_disable_device() and thus is a
little different from the new generic helpers.

I only have Loongson hardware on hand, thus the series is only tested on a
Loongson 3A5000 machine. I could confirm the controller works after
resume, and WoL works as expected. This shouldn't break stmmac_pci.c,
either, since the new helpers have exactly the same code as the old
driver-specific suspend/resume hooks.
====================

Link: https://patch.msgid.link/20251124160417.51514-1-ziyao@disroot.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:07:42 -08:00
Yao Zi
b35e94edf2 net: stmmac: pci: Use generic PCI suspend/resume routines
Convert the STMMAC PCI glue driver to use the generic platform
suspend/resume routines for PCI controllers, instead of implementing its
own.

Signed-off-by: Yao Zi <ziyao@disroot.org>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Yanteng Si <siyanteng@cqsoftware.com.cn>
Link: https://patch.msgid.link/20251124160417.51514-4-ziyao@disroot.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:07:42 -08:00
Yao Zi
c4064af1c7 net: stmmac: loongson: Use generic PCI suspend/resume routines
Convert the glue driver for the Loongson DWMAC controller to use the
generic platform suspend/resume routines for PCI controllers, instead of
implementing its own.

Signed-off-by: Yao Zi <ziyao@disroot.org>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Acked-by: Yanteng Si <siyanteng@cqsoftware.com.cn>
Link: https://patch.msgid.link/20251124160417.51514-3-ziyao@disroot.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:07:42 -08:00
Yao Zi
4440bf5f2e net: stmmac: Add generic suspend/resume helper for PCI-based controllers
Most glue drivers for PCI-based DWMAC controllers utilize similar
platform suspend/resume routines. Add a generic implementation to reduce
duplicated code.

Signed-off-by: Yao Zi <ziyao@disroot.org>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Yanteng Si <siyanteng@cqsoftware.com.cn>
Link: https://patch.msgid.link/20251124160417.51514-2-ziyao@disroot.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 17:07:38 -08:00
Jakub Kicinski
fdaf715b1a Merge branch 'add-hwtstamp_get-callback-to-phy-drivers'
Vadim Fedorenko says:

====================
add hwtstamp_get callback to phy drivers

PHY drivers are able to configure HW timestamping but are not able to
report the configuration back to user space. Add a callback to report the
configuration, like it's done for net_device, and add implementations to
the drivers.
====================

Link: https://patch.msgid.link/20251124181151.277256-1-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 16:56:36 -08:00
Vadim Fedorenko
dadc51871d ptp: ptp_ines: add HW timestamp configuration reporting
The driver partially stores the HW timestamping configuration, but the
missing pieces can be read from HW. Add a callback to report the
configuration.

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251124181151.277256-8-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 16:56:34 -08:00
Vadim Fedorenko
d51de60b8e net: phy: nxp-c45-tja11xx: add HW timestamp configuration reporting
The driver stores the HW timestamping configuration and can technically
report it. Add a callback to do so.

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251124181151.277256-7-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 16:56:34 -08:00
Vadim Fedorenko
ab95392ab5 phy: mscc: add HW timestamp configuration reporting
The driver stores the HW configuration and can technically report it.
Add a callback to do so.

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251124181151.277256-6-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 16:56:34 -08:00
Vadim Fedorenko
036bb4a537 net: phy: dp83640: add HW timestamp configuration reporting
The driver stores the TX timestamping configuration and can technically
report it. Adjust the RX timestamp configuration storage to make reporting
more precise, and add a callback to actually report it.

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251124181151.277256-5-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 16:56:34 -08:00
Vadim Fedorenko
1cff8392df net: phy: broadcom: add HW timestamp configuration reporting
The driver stores configuration information and can technically report
it. Implement the hwtstamp_get callback to report the configuration.

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251124181151.277256-4-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 16:56:34 -08:00
Vadim Fedorenko
f467777efb phy: add hwtstamp_get callback to phy drivers
PHY devices lacked a hwtstamp_get callback even though most of them
track configuration info. Introduce a new callback in
mii_timestamper.
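
As a rough illustration only (the driver struct and field names below are
made up, and the exact callback signature may differ), a driver-side
implementation typically just hands back the configuration saved at set time:

    static int foo_phy_hwtstamp_get(struct mii_timestamper *mii_ts,
                                    struct kernel_hwtstamp_config *config)
    {
            struct foo_phy_priv *priv =
                    container_of(mii_ts, struct foo_phy_priv, mii_ts);

            *config = priv->hwtstamp_config;    /* stored by hwtstamp_set() */
            return 0;
    }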

Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251124181151.277256-3-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 16:56:33 -08:00
Vadim Fedorenko
6aac2aa2df phy: rename hwtstamp callback to hwtstamp_set
PHY devices have a hwtstamp callback which actually performs a set
operation. Rename it to better reflect the action.

Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251124181151.277256-2-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 16:56:33 -08:00
Willem de Bruijn
c01a6e5b2e selftests/net: packetdrill: pass send_omit_free to MSG_ZEROCOPY tests
The --send_omit_free flag is needed for TCP zero copy tests, to ensure
that packetdrill doesn't free the send() buffer after the send() call.

Fixes: 1e42f73fd3 ("selftests/net: packetdrill: import tcp/zerocopy")
Closes: https://lore.kernel.org/netdev/20251124071831.4cbbf412@kernel.org/
Suggested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251125234029.1320984-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 15:57:55 -08:00
Andreas Gruenbacher
83348905e4 gfs2: Clean up SDF_JOURNAL_LIVE flag handling
Change do_withdraw() to clear the SDF_JOURNAL_LIVE flag under the log
flush lock.  In addition, change __gfs2_trans_begin() to check if the
filesystem is already known to be withdrawn using gfs2_withdrawn().
Then, once we are holding the log flush lock, check if the
SDF_JOURNAL_LIVE flag is still set.  This second check ensures that the
filesystem will remain live until the transaction is submitted.

With these changes, it is no longer useful to clear SDF_JOURNAL_LIVE in
gfs2_end_log_write() after calling gfs2_withdraw().

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:28 +00:00
Andreas Gruenbacher
16c3197984 gfs2: No longer thaw filesystems during a withdraw
Previously, when a withdraw occurred, we would wait for another node to
recover our journal.  This also meant that a frozen filesystem needed to
be thawed because otherwise, other nodes wouldn't be able to recover the
filesystem.  With the reversal of commit 601ef0d52e ("gfs2: Force
withdraw to replay journals and wait for it to finish"), we are no
longer waiting for journal recovery during a withdraw, so we no longer
need to thaw frozen filesystems, either.  This also fixes a potential
deadlock reported by lockdep when running xfstest generic/108.

In addition, there is nothing left in do_withdraw() that would require
taking sd_freeze_mutex, so don't bother taking that lock there anymore.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:28 +00:00
Andreas Gruenbacher
3a88edc165 gfs2: Withdraw immediately in gfs2_trans_add_meta
We can now withdraw while the log is locked.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:28 +00:00
Andreas Gruenbacher
bbbf1529ea gfs2: New gfs2_withdraw_helper
Currently, when a gfs2 filesystem is withdrawn, an "offline" uevent is
triggered that invokes gfs2-utils' gfs2_withdraw_helper script.  The
purpose of this script is to deactivate the filesystem's block device so
that it can be withdrawn immediately, even before all the filesystem's
caches have been discarded.  The script provided by gfs2-utils never did
anything useful, and there was no way for it to report back its status
to the kernel.

To fix that, extend the gfs2_withdraw_helper mechanism so that the
script can report one of the following results by writing the
corresponding value into "/sys$DEVPATH/lock_module/withdraw":

 0 - The shared block device has been marked inactive.  Future write
     operations will fail.

 1 - The shared block device may still be active and carry out
     write operations.

If the "offline" uevent isn't reacted upon within the timeout configured
in /sys$DEVPATH/tune/withdraw_helper_timeout (default 5 seconds), the
event handler is assumed to have failed.

In addition, add a new "errors=deactivate" mount option.

With these changes, if fatal errors are detected on a gfs2 filesystem
and the filesystem is mounted with the "errors=panic" option, the kernel
will panic immediately.  Otherwise, an attempt will be made to
deactivate the underlying block device.  If successful, the kernel will
release all cluster-wide locks immediately so that the rest of the
cluster can continue.  If unsuccessful, the kernel will either panic
("errors=deactivate"), or it will purge all filesystem I/O before
releasing all cluster-wide locks ("errors=withdraw").

Note that the gfs2_withdraw_helper script still needs to be fixed to
take advantage of these improvements.  It could be changed to use a
mechanism like LVM Persistent Reservations.  "dmsetup suspend" is not a
suitable mechanism as it infinitely postpones I/O operations, which may
prevent withdraw from completing.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:27 +00:00
Andreas Gruenbacher
0e10da69d1 gfs2: Clean up properly during a withdraw
During a withdraw, we don't want to write out any more data than we have
to, so in do_xmote(), skip the ->go_sync() glock operation.  We still
want to keep calling ->go_inval() to discard any cached data or
metadata, whether clean or dirty.

We do still allow glocks to transition into state LM_ST_UNLOCKED.  This
has the desired side effect of calling ->go_inval() and invalidating the
glock caches.

Function gfs2_withdraw_glocks() is already used for dequeuing any
left-over waiters.  We still want that to happen, but additionally, we
want all glocks to be unlocked.

Finally, we change function do_promote() to refuse any further
promotions.

This commit cleans up the leftovers of commit 86934198ee ("gfs2: Clear
flags when withdraw prevents xmote").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:27 +00:00
Andreas Gruenbacher
473678ccb9 gfs2: Rename gfs2_{gl_dq_holders => withdraw_glocks}
Rename function gfs2_gl_dq_holders() to gfs2_withdraw_glocks().  This
function will soon be used for more than just dequeuing holders.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:27 +00:00
Andreas Gruenbacher
655531c95b Revert "gfs2: fix infinite loop when checking ail item count before go_inval"
The current withdraw code duplicates the journal recovery code gfs2
already has for dealing with node failures, and it does so poorly.  That
code was added because when releasing a lockspace, we didn't have a way
to indicate that the lockspace needs recovery.  We now do have this
feature, so the current withdraw code can be removed almost entirely.
This is one of several steps towards that.

Reverts commit 33dbd1e41a ("gfs2: fix infinite loop when checking ail
item count before go_inval").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:27 +00:00
Andreas Gruenbacher
af572efef1 Revert "gfs2: Allow some glocks to be used during withdraw"
The current withdraw code duplicates the journal recovery code gfs2
already has for dealing with node failures, and it does so poorly.  That
code was added because when releasing a lockspace, we didn't have a way
to indicate that the lockspace needs recovery.  We now do have this
feature, so the current withdraw code can be removed almost entirely.
This is one of several steps towards that.

Reverts commit a72d2401f5 ("gfs2: Allow some glocks to be used during
withdraw").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:27 +00:00
Andreas Gruenbacher
41ad1f7c8b Revert "gfs2: Check for log write errors before telling dlm to unlock"
The current withdraw code duplicates the journal recovery code gfs2
already has for dealing with node failures, and it does so poorly.  That
code was added because when releasing a lockspace, we didn't have a way
to indicate that the lockspace needs recovery.  We now do have this
feature, so the current withdraw code can be removed almost entirely.
This is one of several steps towards that.

Reverts the rest of d93ae386ef ("gfs2: Check for log write errors
before telling dlm to unlock").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:27 +00:00
Andreas Gruenbacher
6bb7c1bf5a Revert "gfs2: fix a deadlock on withdraw-during-mount"
The current withdraw code duplicates the journal recovery code gfs2
already has for dealing with node failures, and it does so poorly.  That
code was added because when releasing a lockspace, we didn't have a way
to indicate that the lockspace needs recovery.  We now do have this
feature, so the current withdraw code can be removed almost entirely.
This is one of several steps towards that.

Reverts commit 865cc3e9cc ("gfs2: fix a deadlock on
withdraw-during-mount").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:27 +00:00
Andreas Gruenbacher
dcc42d5541 Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (6/6)
The current withdraw code duplicates the journal recovery code gfs2
already has for dealing with node failures, and it does so poorly.  That
code was added because when releasing a lockspace, we didn't have a way
to indicate that the lockspace needs recovery.  We now do have this
feature, so the current withdraw code can be removed almost entirely.
This is one of several steps towards that.

Reverts parts of commit 601ef0d52e ("gfs2: Force withdraw to replay
journals and wait for it to finish").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:27 +00:00
Andreas Gruenbacher
406058184c Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (5/6)
The current withdraw code duplicates the journal recovery code gfs2
already has for dealing with node failures, and it does so poorly.  That
code was added because when releasing a lockspace, we didn't have a way
to indicate that the lockspace needs recovery.  We now do have this
feature, so the current withdraw code can be removed almost entirely.
This is one of several steps towards that.

Reverts parts of commit 601ef0d52e ("gfs2: Force withdraw to replay
journals and wait for it to finish").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:27 +00:00
Andreas Gruenbacher
a07a1e46d2 Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (4/6)
The current withdraw code duplicates the journal recovery code gfs2
already has for dealing with node failures, and it does so poorly.  That
code was added because when releasing a lockspace, we didn't have a way
to indicate that the lockspace needs recovery.  We now do have this
feature, so the current withdraw code can be removed almost entirely.
This is one of several steps towards that.

Reverts parts of commit 601ef0d52e ("gfs2: Force withdraw to replay
journals and wait for it to finish").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:27 +00:00
Andreas Gruenbacher
4cee5b0f7a Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (3/6)
The current withdraw code duplicates the journal recovery code gfs2
already has for dealing with node failures, and it does so poorly.  That
code was added because when releasing a lockspace, we didn't have a way
to indicate that the lockspace needs recovery.  We now do have this
feature, so the current withdraw code can be removed almost entirely.
This is one of several steps towards that.

Reverts parts of commit 601ef0d52e ("gfs2: Force withdraw to replay
journals and wait for it to finish").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:27 +00:00
Andreas Gruenbacher
2aae092dc4 Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (2/6)
The current withdraw code duplicates the journal recovery code gfs2
already has for dealing with node failures, and it does so poorly.  That
code was added because when releasing a lockspace, we didn't have a way
to indicate that the lockspace needs recovery.  We now do have this
feature, so the current withdraw code can be removed almost entirely.
This is one of several steps towards that.

Reverts parts of commit 601ef0d52e ("gfs2: Force withdraw to replay
journals and wait for it to finish").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:26 +00:00
Andreas Gruenbacher
20b44ddbbb Revert "gfs2: Force withdraw to replay journals and wait for it to finish" (1/6)
The current withdraw code duplicates the journal recovery code gfs2
already has for dealing with node failures, and it does so poorly.  That
code was added because when releasing a lockspace, we didn't have a way
to indicate that the lockspace needs recovery.  We now do have this
feature, so the current withdraw code can be removed almost entirely.
This is one of several steps towards that.

Reverts parts of commit 601ef0d52e ("gfs2: Force withdraw to replay
journals and wait for it to finish").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:26 +00:00
Andreas Gruenbacher
833c93caea Revert "gfs2: don't stop reads while withdraw in progress"
The current withdraw code duplicates the journal recovery code gfs2
already has for dealing with node failures, and it does so poorly.  That
code was added because when releasing a lockspace, we didn't have a way
to indicate that the lockspace needs recovery.  We now do have this
feature, so the current withdraw code can be removed almost entirely.
This is one of several steps towards that.

The withdrawing node has no role in recovering from the withdraw
anymore, so it also no longer needs to read metadata blocks after a
withdraw.

We now only need to set a single bit in gfs2_withdraw(), so switch from
try_cmpxchg() to test_and_set_bit().
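
A minimal sketch of that simplification (illustrative, not the exact hunk):

    /* Set a single bit atomically; the return value tells us whether
     * another task already started the withdraw. */
    if (test_and_set_bit(SDF_WITHDRAWN, &sdp->sd_flags))
            return;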

Reverts commit 8cc67f704f ("gfs2: don't stop reads while withdraw in
progress").

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:26 +00:00
Andreas Gruenbacher
1714e8543d gfs2: Rename LM_FLAG_{NOEXP -> RECOVER}
GFS2 sets the LM_FLAG_NOEXP flag on locking requests it makes during
journal recovery, so rename the flag to LM_FLAG_RECOVER for improved
code readability.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:26 +00:00
Andreas Gruenbacher
fab27b4930 gfs2: Kill gfs2_io_error_bh_wd
All callers of gfs2_io_error_bh() call gfs2_withdraw() as well, so
change gfs2_io_error_bh() to call gfs2_withdraw() directly.  This also
brings it in line with other similar error reporting functions.

With that, gfs2_io_error_bh() is the same as gfs2_io_error_bh_wd(), so
remove the latter.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:26 +00:00
Andreas Gruenbacher
0e2038a90c gfs2: Withdraw immediately on log write errors
Now that gfs2_withdraw() is asynchronous, immediately withdraw when
a log write error is detected.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:26 +00:00
Andreas Gruenbacher
1b7d498dca gfs2: Rename gfs2_{withdrawing_or_ => }withdrawn
With delayed withdraws and the SDF_WITHDRAWING flag gone, we can now
rename gfs2_withdrawing_or_withdrawn() back to gfs2_withdrawn().

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:52:23 +00:00
Andreas Gruenbacher
8daf6c2b3d gfs2: Get rid of delayed withdraws
Now that gfs2_withdraw() is asynchronous, it can be called in any
context and there is no more need for gfs2_withdraw_delayed() or for
turning delayed withdraws into actual withdraws.  Remove the
now-obsolete code.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:51:47 +00:00
Andreas Gruenbacher
9c4a3de6cd gfs2: Asynchronous withdraw
So far, withdraws are carried out in the context of the calling task.
When another task tries to withdraw while a withdraw is already
underway, that task blocks as well.  Change that to carry out withdraws
asynchronously in workqueue context and don't block the task triggering
the withdraw anymore.

Fixes: syzbot+6b156e132970e550194c@syzkaller.appspotmail.com
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:51:47 +00:00
Andreas Gruenbacher
9334c73fb1 gfs2: Add clean argument to lm_unmount hook
Add a 'clean' argument to ->lm_unmount() that indicates whether the
filesystem is clean or needs recovery.  Set clean to true for normal
unmounts, and to false for withdraws.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:51:47 +00:00
Andreas Gruenbacher
94f56488c7 gfs2: Clean up quotad timeout handling
Instead of tracking the remaining time, track the deadline of each of
the timeouts.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:51:41 +00:00
Andreas Gruenbacher
dff1fb6d8b gfs2: Fix "gfs2: Switch to wait_event in gfs2_quotad"
Commit e4a8b5481c ("gfs2: Switch to wait_event in gfs2_quotad") broke
cyclic statfs syncing, so the numbers reported by "df" could easily get
completely out of sync with reality.  Fix this by reverting part of
commit e4a8b5481c for now.

A follow-up commit will clean this code up later.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:50:53 +00:00
Andreas Gruenbacher
5b351583a3 gfs2: Minor cosmetic remote delete cleanups
Rename gfs2_try_evict() to gfs2_try_to_evict().  The GIF_DEFER_DELETE
flag has been superseded by the GLF_DEFER_DELETE flag, so fix a
left-over comment.  Add a clarifying comment to delete_work_func().

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:50:53 +00:00
Andreas Gruenbacher
64c10ed927 gfs2: fix remote evict for read-only filesystems
When a node tries to delete an inode, it first requests exclusive access
to the iopen glock.  This triggers demote requests on all remote nodes
currently holding the iopen glock.  To satisfy those requests, the
remote nodes evict the inode in question, or they poke the corresponding
inode glock to signal that the inode is still in active use.

This behavior doesn't depend on whether or not a filesystem is
read-only, so remove the incorrect read-only check.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 23:50:53 +00:00
Ankit Khushwaha
af7273cc7a selftests/net: initialize char variable to null
The char pointer variables in 'so_txtime.c' and 'txtimestamp.c' were left
uninitialized when the switch default case is taken, which raises the
following warnings:

	txtimestamp.c:240:2: warning: variable 'tsname' is used uninitialized
	whenever switch default is taken [-Wsometimes-uninitialized]

	so_txtime.c:210:3: warning: variable 'reason' is used uninitialized
	whenever switch default is taken [-Wsometimes-uninitialized]

Initialize these variables to NULL to fix this.
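
For illustration, the added initializations amount to (types as assumed here):

    char *tsname = NULL;    /* txtimestamp.c */
    char *reason = NULL;    /* so_txtime.c  */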

Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux@gmail.com>
Link: https://patch.msgid.link/20251125165302.20079-1-ankitkhushwaha.linux@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-26 15:23:27 -08:00
Baokun Li
7c11c56eb3 ext4: align max orphan file size with e2fsprogs limit
Kernel commit 0a6ce20c15 ("ext4: verify orphan file size is not too big")
limits the maximum supported orphan file size to 8 << 20.

However, in e2fsprogs, the orphan file size is set to 32–512 filesystem
blocks when creating a filesystem.

With 64k block size, formatting an ext4 fs >32G gives an orphan file bigger
than the kernel allows, so mount prints an error and fails:

    EXT4-fs (vdb): orphan file too big: 8650752
    EXT4-fs (vdb): mount failed

To prevent this issue and allow previously created 64KB filesystems to
mount, update the maximum allowed orphan file size in the kernel to
512 filesystem blocks.
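
For reference, the arithmetic with a 64k block size works out as follows:

    old kernel limit:      8 << 20           =  8388608 bytes
    e2fsprogs maximum:     512 * 65536 bytes = 33554432 bytes
    size from the error:   8650752 bytes     = 132 * 65536 bytes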

Fixes: 0a6ce20c15 ("ext4: verify orphan file size is not too big")
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251120134233.2994147-1-libaokun@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
2025-11-26 17:13:34 -05:00
Daniel Tang
39fc6d4d35 Documentation: ext4: Document casefold and encrypt flags
Based on ext4(5) and fs/ext4/ext4.h.

For INCOMPAT_ENCRYPT, it's possible to create a new filesystem with that
flag without creating any encrypted inodes. ext4(5) says it adds
"support" but doesn't say whether anything's actually present like
COMPAT_RESIZE_INODE does.

Signed-off-by: Daniel Tang <danielzgtg.opensource@gmail.com>
Message-ID: <4506189.9SDvczpPoe@daniel-desktop3>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-26 17:13:34 -05:00
Haodong Tian
4ada1e4f89 fs/ext4: fix typo in comment
Correct 'metdata' -> 'metadata' in comment.

Signed-off-by: Haodong Tian <tianhd25@mails.tsinghua.edu.cn>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Message-ID: <20251112155916.3007639-1-tianhd25@mails.tsinghua.edu.cn>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-26 17:13:34 -05:00
Yang Erkun
cc742fd1d1 ext4: correct the comments place for EXT4_EXT_MAY_ZEROOUT
Move the comments just before we set EXT4_EXT_MAY_ZEROOUT in
ext4_split_convert_extents.

Signed-off-by: Yang Erkun <yangerkun@huawei.com>
Message-ID: <20251112084538.1658232-4-yangerkun@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-26 17:13:34 -05:00
Yang Erkun
a927242231 ext4: cleanup for ext4_map_blocks
A positive return value from ext4_map_create_blocks() means we really
created some blocks, which cannot happen without EXT4_MAP_UNWRITTEN or
EXT4_MAP_MAPPED being set in m_flags.

Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Baokun Li <libaokun1@huawei.com>
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
Message-ID: <20251112084538.1658232-3-yangerkun@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-26 17:13:34 -05:00
Yang Erkun
dac092195b ext4: rename EXT4_GET_BLOCKS_PRE_IO
This flag has been generalized to split an unwritten extent when we do
dio or dioread_nolock writeback, or to avoid merging new extents which
were created by an extent split. Update some related comments too.

Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Baokun Li <libaokun1@huawei.com>
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
Message-ID: <20251112084538.1658232-2-yangerkun@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-26 17:13:33 -05:00
Yongjian Sun
d9ee3ff810 ext4: improve integrity checking in __mb_check_buddy by enhancing order-0 validation
When the MB_CHECK_ASSERT macro is enabled, we found that the
current validation logic in __mb_check_buddy has a gap in
detecting certain invalid buddy states, particularly related
to order-0 (bitmap) bits.

The original logic consists of three steps:
1. Validates higher-order buddies: if a higher-order bit is
set, at most one of the two corresponding lower-order bits
may be free; if a higher-order bit is clear, both lower-order
bits must be allocated (and their bitmap bits must be 0).
2. For any set bit in order-0, ensures all corresponding
higher-order bits are not free.
3. Verifies that all preallocated blocks (pa) in the group
have pa_pstart within bounds and their bitmap bits marked as
allocated.

However, this approach fails to properly validate cases where
order-0 bits are incorrectly cleared (0), allowing some invalid
configurations to pass:

               corrupt            integral

order 3           1                  1
order 2       1       1          1       1
order 1     1   1   1   1      1   1   1   1
order 0    0 0 1 1 1 1 1 1    1 1 1 1 1 1 1 1

The left column shows two adjacent free blocks at order 0 with an
inconsistent higher-order state; the right column shows the correct
scenario.

The root cause is insufficient validation of order-0 zero bits.
To fix this and improve completeness without significant performance
cost, we refine the logic:

1. Maintain the top-down higher-order validation, but we no longer
check the cases where the higher-order bit is 0, as this case will
be covered in step 2.
2. Enhance order-0 checking by examining pairs of bits:
   - If either bit in a pair is set (1), all corresponding
     higher-order bits must not be free.
   - If both bits are clear (0), then exactly one of the
     corresponding higher-order bits must be free
3. Keep the preallocation (pa) validation unchanged.

This change closes the validation gap, ensuring illegal buddy states
involving order-0 are correctly detected, while removing redundant
checks and maintaining efficiency.

Fixes: c9de560ded ("ext4: Add multi block allocator for ext4")
Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Yongjian Sun <sunyongjian1@huawei.com>
Reviewed-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251106060614.631382-3-sunyongjian@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-26 17:13:33 -05:00
Yongjian Sun
3f7a79d05c ext4: fix incorrect group number assertion in mb_check_buddy
When the MB_CHECK_ASSERT macro is enabled, an assertion failure can
occur in __mb_check_buddy when checking preallocated blocks (pa) in
a block group:

Assertion failure in mb_free_blocks() : "groupnr == e4b->bd_group"

This happens when a pa at the very end of a block group (e.g.,
pa_pstart=32765, pa_len=3 in a group of 32768 blocks) becomes
exhausted - its pa_pstart is advanced by pa_len to 32768, which
lies in the next block group. If this exhausted pa (with pa_len == 0)
is still in the bb_prealloc_list during the buddy check, the assertion
incorrectly flags it as belonging to the wrong group. A possible
sequence is as follows:

ext4_mb_new_blocks
  ext4_mb_release_context
    pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len)
    pa->pa_len -= ac->ac_b_ex.fe_len

	                 __mb_check_buddy
                           for each pa in group
                             ext4_get_group_no_and_offset
                             MB_CHECK_ASSERT(groupnr == e4b->bd_group)

To fix this, we modify the check to skip block group validation for
exhausted preallocations (where pa_len == 0). Such entries are in a
transitional state and will be removed from the list soon, so they
should not trigger an assertion. This change prevents the false
positive while maintaining the integrity of the checks for active
allocations.
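
A sketch of the adjusted check, following the loop shape described above
(variable declarations omitted, names illustrative):

    list_for_each(cur, &grp->bb_prealloc_list) {
            pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
            if (pa->pa_len == 0)    /* exhausted, about to be removed */
                    continue;       /* skip the group assertion */
            ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
            MB_CHECK_ASSERT(groupnr == e4b->bd_group);
    }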

Fixes: c9de560ded ("ext4: Add multi block allocator for ext4")
Signed-off-by: Yongjian Sun <sunyongjian1@huawei.com>
Reviewed-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251106060614.631382-2-sunyongjian@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
2025-11-26 17:13:33 -05:00
Alexey Nepomnyashih
0cd8feea87 ext4: add i_data_sem protection in ext4_destroy_inline_data_nolock()
Fix a race between inline data destruction and block mapping.

The function ext4_destroy_inline_data_nolock() changes the inode data
layout by clearing EXT4_INODE_INLINE_DATA and setting EXT4_INODE_EXTENTS.
At the same time, another thread may execute ext4_map_blocks(), which
tests EXT4_INODE_EXTENTS to decide whether to call ext4_ext_map_blocks()
or ext4_ind_map_blocks().

Without i_data_sem protection, ext4_ind_map_blocks() may receive an
inode with the EXT4_INODE_EXTENTS flag set, triggering the assertion:

kernel BUG at fs/ext4/indirect.c:546!
EXT4-fs (loop2): unmounting filesystem.
invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
RIP: 0010:ext4_ind_map_blocks.cold+0x2b/0x5a fs/ext4/indirect.c:546

Call Trace:
 <TASK>
 ext4_map_blocks+0xb9b/0x16f0 fs/ext4/inode.c:681
 _ext4_get_block+0x242/0x590 fs/ext4/inode.c:822
 ext4_block_write_begin+0x48b/0x12c0 fs/ext4/inode.c:1124
 ext4_write_begin+0x598/0xef0 fs/ext4/inode.c:1255
 ext4_da_write_begin+0x21e/0x9c0 fs/ext4/inode.c:3000
 generic_perform_write+0x259/0x5d0 mm/filemap.c:3846
 ext4_buffered_write_iter+0x15b/0x470 fs/ext4/file.c:285
 ext4_file_write_iter+0x8e0/0x17f0 fs/ext4/file.c:679
 call_write_iter include/linux/fs.h:2271 [inline]
 do_iter_readv_writev+0x212/0x3c0 fs/read_write.c:735
 do_iter_write+0x186/0x710 fs/read_write.c:861
 vfs_iter_write+0x70/0xa0 fs/read_write.c:902
 iter_file_splice_write+0x73b/0xc90 fs/splice.c:685
 do_splice_from fs/splice.c:763 [inline]
 direct_splice_actor+0x10f/0x170 fs/splice.c:950
 splice_direct_to_actor+0x33a/0xa10 fs/splice.c:896
 do_splice_direct+0x1a9/0x280 fs/splice.c:1002
 do_sendfile+0xb13/0x12c0 fs/read_write.c:1255
 __do_sys_sendfile64 fs/read_write.c:1323 [inline]
 __se_sys_sendfile64 fs/read_write.c:1309 [inline]
 __x64_sys_sendfile64+0x1cf/0x210 fs/read_write.c:1309
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x35/0x80 arch/x86/entry/common.c:81
 entry_SYSCALL_64_after_hwframe+0x6e/0xd8
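
The essence of the fix is to switch the inode's data layout flags while
holding i_data_sem for write, so that ext4_map_blocks() sees a consistent
state. A minimal, illustrative sketch (not the exact hunk):

    down_write(&EXT4_I(inode)->i_data_sem);
    ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
    ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
    up_write(&EXT4_I(inode)->i_data_sem);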

Fixes: c755e25135 ("ext4: fix deadlock between inline_data and ext4_expand_extra_isize_ea()")
Cc: stable@vger.kernel.org # v4.11+
Signed-off-by: Alexey Nepomnyashih <sdl@nppct.ru>
Message-ID: <20251104093326.697381-1-sdl@nppct.ru>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-26 17:06:24 -05:00
Haibo Chen
4091c8206c ext4: clear i_state_flags when alloc inode
i_state_flags is used on 32-bit archs and needs to be cleared when
allocating an inode.
This issue was found when unmounting ext4: the inode is sometimes
accidentally tracked as an orphan, causing an ext4 message dump.
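
A minimal sketch of the initialization being added (the surrounding
ext4_alloc_inode() context is assumed, not shown):

    #if (BITS_PER_LONG < 64)
            ei->i_state_flags = 0;  /* the field only exists on 32-bit builds */
    #endif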

Fixes: acf943e976 ("ext4: fix checks for orphan inodes")
Signed-off-by: Haibo Chen <haibo.chen@nxp.com>
Reviewed-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251104-ext4-v1-1-73691a0800f9@nxp.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
2025-11-26 17:06:09 -05:00
Ye Bin
6abfe10789 jbd2: fix the inconsistency between checksum and data in memory for journal sb
Copying the file system while it is mounted as read-only results in
a mount failure:
[~]# mkfs.ext4 -F /dev/sdc
[~]# mount /dev/sdc -o ro /mnt/test
[~]# dd if=/dev/sdc of=/dev/sda bs=1M
[~]# mount /dev/sda /mnt/test1
[ 1094.849826] JBD2: journal checksum error
[ 1094.850927] EXT4-fs (sda): Could not load journal inode
mount: mount /dev/sda on /mnt/test1 failed: Bad message

The process described above is just an abstracted way I came up with to
reproduce the issue. In the actual scenario, the file system was mounted
read-only and then copied while it was still mounted. It was found that
the mount operation failed. The user intended to verify the data or use
it as a backup, and this action was performed during a version upgrade.
Above issue may happen as follows:
ext4_fill_super
 set_journal_csum_feature_set(sb)
  if (ext4_has_metadata_csum(sb))
   incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
  if (test_opt(sb, JOURNAL_CHECKSUM)
   jbd2_journal_set_features(sbi->s_journal, compat, 0, incompat);
    lock_buffer(journal->j_sb_buffer);
    sb->s_feature_incompat  |= cpu_to_be32(incompat);
    //The data in the journal sb was modified, but the checksum was not
      updated, so the data remaining in memory has a mismatch between the
      data and the checksum.
    unlock_buffer(journal->j_sb_buffer);

In this case, the journal sb copied over is in a state where the checksum
and data are inconsistent, so mounting fails.
To solve the above issue, update the checksum in memory after modifying
the journal sb.
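
A minimal sketch of the idea (the checksum helper shown here is an
assumption based on existing jbd2 code):

    lock_buffer(journal->j_sb_buffer);
    sb->s_feature_incompat |= cpu_to_be32(incompat);
    /* keep the in-memory copy self-consistent */
    sb->s_checksum = jbd2_superblock_csum(journal, sb);
    unlock_buffer(journal->j_sb_buffer);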

Fixes: 4fd5ea43bc ("jbd2: checksum journal superblock")
Signed-off-by: Ye Bin <yebin10@huawei.com>
Reviewed-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251103010123.3753631-1-yebin@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
2025-11-26 17:05:47 -05:00
Fedor Pchelkin
3db63d2c2d ext4: check if mount_opts is NUL-terminated in ext4_ioctl_set_tune_sb()
params.mount_opts may come in as a potentially non-NUL-terminated string.
Userspace is expected to pass a NUL-terminated string.  Add an extra check
to ensure this holds true.  Note that further code utilizes strscpy_pad(),
so this is just for properly informing the user that incorrect data was
provided.
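
A minimal sketch of such a check (field and size are illustrative):

    /* reject input without a terminating NUL anywhere in the buffer */
    if (!memchr(params.mount_opts, '\0', sizeof(params.mount_opts)))
            return -EINVAL;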

Found by Linux Verification Center (linuxtesting.org).

Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Reviewed-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251101160430.222297-2-pchelkin@ispras.ru>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
2025-11-26 17:05:39 -05:00
Fedor Pchelkin
ee5a977b4e ext4: fix string copying in parse_apply_sb_mount_options()
strscpy_pad() can't be used to copy a non-NUL-term string into a NUL-term
string of possibly bigger size.  Commit 0efc5990bc ("string.h: Introduce
memtostr() and memtostr_pad()") provides additional information in that
regard.  So if this happens, the following warning is observed:

strnlen: detected buffer overflow: 65 byte read of buffer size 64
WARNING: CPU: 0 PID: 28655 at lib/string_helpers.c:1032 __fortify_report+0x96/0xc0 lib/string_helpers.c:1032
Modules linked in:
CPU: 0 UID: 0 PID: 28655 Comm: syz-executor.3 Not tainted 6.12.54-syzkaller-00144-g5f0270f1ba00 #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
RIP: 0010:__fortify_report+0x96/0xc0 lib/string_helpers.c:1032
Call Trace:
 <TASK>
 __fortify_panic+0x1f/0x30 lib/string_helpers.c:1039
 strnlen include/linux/fortify-string.h:235 [inline]
 sized_strscpy include/linux/fortify-string.h:309 [inline]
 parse_apply_sb_mount_options fs/ext4/super.c:2504 [inline]
 __ext4_fill_super fs/ext4/super.c:5261 [inline]
 ext4_fill_super+0x3c35/0xad00 fs/ext4/super.c:5706
 get_tree_bdev_flags+0x387/0x620 fs/super.c:1636
 vfs_get_tree+0x93/0x380 fs/super.c:1814
 do_new_mount fs/namespace.c:3553 [inline]
 path_mount+0x6ae/0x1f70 fs/namespace.c:3880
 do_mount fs/namespace.c:3893 [inline]
 __do_sys_mount fs/namespace.c:4103 [inline]
 __se_sys_mount fs/namespace.c:4080 [inline]
 __x64_sys_mount+0x280/0x300 fs/namespace.c:4080
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x64/0x140 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

Since userspace is expected to provide s_mount_opts field to be at most 63
characters long with the ending byte being NUL-term, use a 64-byte buffer
which matches the size of s_mount_opts, so that strscpy_pad() does its job
properly.  Return with error if the user still managed to provide a
non-NUL-term string here.
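
A sketch of the resulting pattern (the local buffer name is illustrative;
the 64-byte size matches the on-disk s_mount_opts field):

    char s_mount_opts[64];

    /* strscpy_pad() returns a negative value when the source has no NUL
     * within the destination size, i.e. a non-NUL-terminated string was
     * provided; treat that as invalid input. */
    if (strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts,
                    sizeof(s_mount_opts)) < 0)
            return -EINVAL;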

Found by Linux Verification Center (linuxtesting.org) with Syzkaller.

Fixes: 8ecb790ea8 ("ext4: avoid potential buffer over-read in parse_apply_sb_mount_options()")
Cc: stable@vger.kernel.org
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Reviewed-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251101160430.222297-1-pchelkin@ispras.ru>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-26 17:05:39 -05:00
Wengang Wang
80d05f640a jbd2: store more accurate errno in superblock when possible
When jbd2_journal_abort() is called, the provided error code is stored
in the journal superblock. Some existing calls hard-code -EIO even when
the actual failure is not I/O related.

This patch updates those calls to pass more accurate error codes,
allowing the superblock to record the true cause of failure. This helps
improve diagnostics and debugging clarity when analyzing journal aborts.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Message-ID: <20251031210501.7337-1-wen.gang.wang@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-26 17:05:39 -05:00
Ye Bin
986835bf4d jbd2: avoid bug_on in jbd2_journal_get_create_access() when file system corrupted
There's an issue when the file system is corrupted:
------------[ cut here ]------------
kernel BUG at fs/jbd2/transaction.c:1289!
Oops: invalid opcode: 0000 [#1] SMP KASAN PTI
CPU: 5 UID: 0 PID: 2031 Comm: mkdir Not tainted 6.18.0-rc1-next
RIP: 0010:jbd2_journal_get_create_access+0x3b6/0x4d0
RSP: 0018:ffff888117aafa30 EFLAGS: 00010202
RAX: 0000000000000000 RBX: ffff88811a86b000 RCX: ffffffff89a63534
RDX: 1ffff110200ec602 RSI: 0000000000000004 RDI: ffff888100763010
RBP: ffff888100763000 R08: 0000000000000001 R09: ffff888100763028
R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000000
R13: ffff88812c432000 R14: ffff88812c608000 R15: ffff888120bfc000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f91d6970c99 CR3: 00000001159c4000 CR4: 00000000000006f0
Call Trace:
 <TASK>
 __ext4_journal_get_create_access+0x42/0x170
 ext4_getblk+0x319/0x6f0
 ext4_bread+0x11/0x100
 ext4_append+0x1e6/0x4a0
 ext4_init_new_dir+0x145/0x1d0
 ext4_mkdir+0x326/0x920
 vfs_mkdir+0x45c/0x740
 do_mkdirat+0x234/0x2f0
 __x64_sys_mkdir+0xd6/0x120
 do_syscall_64+0x5f/0xfa0
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

The above issue occurred for us in errors=continue mode, accompanied by
storage failures; there were already many inconsistencies in the file
system data.
In the case of file system data inconsistency, for example, if the block
bitmap of a referenced block is not set, it can lead to a situation where
a block being committed is allocated and used again. As a result, the
following condition will not be satisfied, which then triggers the
BUG_ON. Of course,
it is entirely possible to construct a problematic image that can trigger
this BUG_ON through specific operations. In fact, I have constructed such
an image and easily reproduced this issue.
Therefore, J_ASSERT() holds true only under ideal conditions, but it may
not necessarily be satisfied in exceptional scenarios. Using J_ASSERT()
directly in abnormal situations would cause the system to crash, which is
clearly not what we want. So here we directly trigger a JBD abort instead
of immediately invoking BUG_ON.
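
A minimal sketch of the replacement (the condition is a placeholder for the
state that J_ASSERT() used to demand):

    if (inconsistent) {
            pr_err("JBD2: unexpected journal_head state, aborting journal\n");
            jbd2_journal_abort(journal, -EIO);
            return -EIO;
    }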

Fixes: 470decc613 ("[PATCH] jbd2: initial copy of files from jbd")
Signed-off-by: Ye Bin <yebin10@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251025072657.307851-1-yebin@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
2025-11-26 17:05:02 -05:00
Gabriel Krisman Bertazi
5d24321e4c io_uring: Introduce getsockname io_uring cmd
Introduce a socket-specific io_uring_cmd to support
getsockname/getpeername via io_uring.  I made this an io_uring_cmd
instead of a new operation to avoid polluting the command namespace with
what is exclusively a socket operation.  In addition, since we don't
need to conform to existing interfaces, this merges the
getsockname/getpeername in a single operation, since the implementation
is pretty much the same.

This has been frequently requested, for instance at [1] and more
recently in the project Discord channel. The main use-case is to support
fixed socket file descriptors.

[1] https://github.com/axboe/liburing/issues/1356

Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-26 13:45:23 -07:00
Gabriel Krisman Bertazi
d73c167708 socket: Split out a getsockname helper for io_uring
Similar to getsockopt, split out a helper to check security and issue
the operation from the main handler that can be used by io_uring.

Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-26 13:45:23 -07:00
Gabriel Krisman Bertazi
4677e78800 socket: Unify getsockname and getpeername implementation
They are already implemented by the same get_name hook in the protocol
level.  Bring the unification one level up to reduce code duplication
in preparation to supporting these as io_uring operations.

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-26 13:45:23 -07:00
Edward Adam Davis
688b745401 bpf: Fix exclusive map memory leak
When excl_prog_hash is 0 and excl_prog_hash_size is non-zero, the map also
needs to be freed. Otherwise, the map memory will not be reclaimed, just
like the memory leak problem reported by syzbot [1].

syzbot reported:
BUG: memory leak
  backtrace (crc 7b9fb9b4):
    map_create+0x322/0x11e0 kernel/bpf/syscall.c:1512
    __sys_bpf+0x3556/0x3610 kernel/bpf/syscall.c:6131
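
A minimal sketch of the shape of the fix (the error-path label is
illustrative):

    if (!attr->excl_prog_hash && attr->excl_prog_hash_size) {
            err = -EINVAL;
            goto free_map;  /* free the map allocated above */
    }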

Fixes: baefdbdf68 ("bpf: Implement exclusive map creation")
Reported-by: syzbot+cf08c551fecea9fd1320@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=cf08c551fecea9fd1320
Tested-by: syzbot+cf08c551fecea9fd1320@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/tencent_3F226F882CE56DCC94ACE90EED1ECCFC780A@qq.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-26 11:23:27 -08:00
Kevin Brodsky
c6a45ee760 ublk: prevent invalid access with DEBUG
ublk_ch_uring_cmd_local() may jump to the out label before
initialising the io pointer. This will cause trouble if DEBUG is
defined, because the pr_devel() call dereferences io. Clang reports:

drivers/block/ublk_drv.c:2403:6: error: variable 'io' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized]
 2403 |         if (tag >= ub->dev_info.queue_depth)
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/block/ublk_drv.c:2492:32: note: uninitialized use occurs here
 2492 |                         __func__, cmd_op, tag, ret, io->flags);
      |

Fix this by initialising io to NULL and checking it before
dereferencing it.
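
A minimal sketch of the pattern (the message format is illustrative):

    struct ublk_io *io = NULL;      /* may remain NULL on early exit */

    if (tag >= ub->dev_info.queue_depth)
            goto out;               /* jumps past the io assignment */
    /* ... io is looked up and assigned on the normal path ... */
 out:
    if (io)                         /* only dereference when it was set */
            pr_devel("%s: cmd_op %x tag %d ret %d io_flags %x\n",
                     __func__, cmd_op, tag, ret, io->flags);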

Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Fixes: 71f28f3136 ("ublk_drv: add io_uring based userspace block driver")
Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-26 10:59:18 -07:00
Jan Höppner
a857d99201 s390/dasd: Use scnprintf() instead of sprintf()
Use scnprintf() instead of sprintf() for those cases where the
destination is an array and the size of the array is known at compile
time.

This prevents theoretical buffer overflows, but also avoids that people
again and again spend time to figure out if the code is actually safe.
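
For illustration (the buffer and format string here are made up), the safe
pattern looks like:

    char name[16];

    /* scnprintf() never writes more than sizeof(name) bytes, including
     * the terminating NUL, and returns the number of characters written. */
    scnprintf(name, sizeof(name), "dasd%c", 'a' + i);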

Signed-off-by: Jan Höppner <hoeppner@linux.ibm.com>
Reviewed-by: Stefan Haberland <sth@linux.ibm.com>
Signed-off-by: Stefan Haberland <sth@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-26 10:14:17 -07:00
Jan Höppner
43198756ee s390/dasd: Move device name formatting into separate function
The device name formatting can be generalized and made more readable
compared to the current state. SCSI already provides a generalized way
to format many devices in the same naming scheme as DASD does, which was
introduced with commit 3e1a7ff8a0 ("block: allow disk to have extended
device number").

Use this much cleaner code from drivers/scsi/sd.c to handle the legacy
naming scheme in DASD as a replacement for the current implementation.

For easier error handling for the new function, move the gendisk free
portion of dasd_gendisk_free() out into a new function dasd_gd_free().

Signed-off-by: Jan Höppner <hoeppner@linux.ibm.com>
Reviewed-by: Stefan Haberland <sth@linux.ibm.com>
Signed-off-by: Stefan Haberland <sth@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-26 10:14:17 -07:00
Stefan Haberland
764def9e8e s390/dasd: Remove unnecessary debugfs_create() return checks
The DASD driver only uses the dentry pointers when removing debugfs
entries, and debugfs_remove() can safely handle both NULL and ERR_PTR.
There is therefore no need to check debugfs_create() return values.

This simplifies the debugfs setup code without changing functionality.
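
A minimal sketch of the simplified pattern (the directory name and dentry
variable are illustrative):

    struct dentry *root;

    root = debugfs_create_dir("dasd", NULL);
    /* No return-value check needed: debugfs_remove() copes with both a
     * NULL dentry and an ERR_PTR() one, so teardown is always safe. */
    debugfs_remove(root);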

Suggested-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Jan Hoeppner <hoeppner@linux.ibm.com>
Signed-off-by: Stefan Haberland <sth@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-26 10:14:17 -07:00
Stefan Haberland
c943bfc6af s390/dasd: Fix gendisk parent after copy pair swap
After a copy pair swap the block device's "device" symlink points to
the secondary CCW device, but the gendisk's parent remained the
primary, leaving /sys/block/<dasdx> under the wrong parent.

Move the gendisk to the secondary's device with device_move(), keeping
the sysfs topology consistent after the swap.
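
A minimal sketch of the mechanism (pointer names are illustrative):

    /* re-parent /sys/block/<dasdx> under the secondary CCW device */
    rc = device_move(disk_to_dev(gdp), &secondary_cdev->dev, DPM_ORDER_NONE);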

Fixes: 413862caad ("s390/dasd: add copy pair swap capability")
Cc: stable@vger.kernel.org #6.1
Reviewed-by: Jan Hoeppner <hoeppner@linux.ibm.com>
Signed-off-by: Stefan Haberland <sth@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-26 10:14:16 -07:00
Caleb Sander Mateos
1e93de9205 io_uring/query: drop unused io_handle_query_entry() ctx arg
io_handle_query_entry() doesn't use its struct io_ring_ctx *ctx
argument. So remove it from the function and its callers.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-26 09:37:10 -07:00
Alexey Velichayshiy
4cfc7d5a4a gfs2: fix freeze error handling
After commit b77b4a4815 ("gfs2: Rework freeze / thaw logic"),
the freeze error handling is broken because gfs2_do_thaw()
overwrites the 'error' variable, causing incorrect processing
of the original freeze error.

Fix this by calling gfs2_do_thaw() when gfs2_lock_fs_check_clean()
fails but ignoring its return value to preserve the original
freeze error for proper reporting.

Found by Linux Verification Center (linuxtesting.org) with SVACE.

Fixes: b77b4a4815 ("gfs2: Rework freeze / thaw logic")
Cc: stable@vger.kernel.org # v6.5+
Signed-off-by: Alexey Velichayshiy <a.velichayshiy@ispras.ru>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 13:01:07 +00:00
Andreas Gruenbacher
2c5f4a5347 gfs2: Prevent recursive memory reclaim
Function new_inode() returns a new inode with inode->i_mapping->gfp_mask
set to GFP_HIGHUSER_MOVABLE.  This value includes the __GFP_FS flag, so
allocations in that address space can recurse into filesystem memory
reclaim.  We don't want that to happen because it can consume a
significant amount of stack memory.

Worse than that is that it can also deadlock: for example, in several
places, gfs2_unstuff_dinode() is called inside filesystem transactions.
This calls filemap_grab_folio(), which can allocate a new folio, which
can trigger memory reclaim.  If memory reclaim recurses into the
filesystem and starts another transaction, a deadlock will ensue.

To fix these kinds of problems, prevent memory reclaim from recursing
into filesystem code by making sure that the gfp_mask of inode address
spaces doesn't include __GFP_FS.
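
A minimal sketch of that masking, as applied to a freshly created inode's
mapping (illustrative, not the exact gfs2 hunk):

    /* keep the default GFP_HIGHUSER_MOVABLE mask, minus __GFP_FS, so page
     * cache allocations for this mapping cannot recurse into the fs */
    mapping_set_gfp_mask(inode->i_mapping,
                         mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);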

The "meta" and resource group address spaces were already using GFP_NOFS
as their gfp_mask (which doesn't include __GFP_FS).  The default value
of GFP_HIGHUSER_MOVABLE is less restrictive than GFP_NOFS, though.  To
avoid being overly limiting, use the default value and only knock off
the __GFP_FS flag.  I'm not sure if this will actually make a
difference, but it also shouldn't hurt.

This patch is loosely based on commit ad22c7a043 ("xfs: prevent stack
overflows from page cache allocation").

Fixes xfstest generic/273.

Fixes: dc0b943523 ("gfs: Don't use GFP_NOFS in gfs2_unstuff_dinode")
Reviewed-by: Andrew Price <anprice@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-26 12:57:10 +00:00
Conor Dooley
9aea35eb98 dt-bindings: can: mpfs: document resets
The CAN cores on PolarFire SoC both have a reset. The platform firmware
brings both cores out of reset, but the Linux driver must use them
during normal operation. The resets should have been made required, but
this is one of the things that can happen when the binding is written
without driver support.

Fixes: c878d518d7 ("dt-bindings: can: mpfs: document the mpfs CAN controller")
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@oss.qualcomm.com>
Link: https://patch.msgid.link/20251121-sample-footsore-743d81772efc@spud
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:30:37 +01:00
Marc Kleine-Budde
4715d930f3 Merge patch series "MAINTAINERS: Add myself as m_can maintainer"
Markus Schneider-Pargmann <msp@baylibre.com> says:

these two patches are updating the m_can maintainer entry, replacing the
current maintainer and simplifying the files mentioned.

Link: https://patch.msgid.link/20251119-topic-mcan-reviewer-v6-18-v2-0-f842c3094b18@baylibre.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:24:59 +01:00
Markus Schneider-Pargmann
d20103d8f8 MAINTAINERS: Simplify m_can section
Simplify the section by using the whole m_can directory. This includes a
few new files, e.g. Kconfig, Makefile, m_can_pci.c and tcan4x5x* which
are all closely coupled to the m_can driver core.

Signed-off-by: Markus Schneider-Pargmann <msp@baylibre.com>
Link: https://patch.msgid.link/20251119-topic-mcan-reviewer-v6-18-v2-2-f842c3094b18@baylibre.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:24:58 +01:00
Markus Schneider-Pargmann
07688a882f MAINTAINERS: Add myself as m_can maintainer
As I have contributed to the m_can driver over the past years, I would
like to continue maintaining the driver. As Chandrasekar is currently
not responsive, I will replace him as the maintainer of the driver.

Signed-off-by: Markus Schneider-Pargmann <msp@baylibre.com>
Link: https://patch.msgid.link/20251119-topic-mcan-reviewer-v6-18-v2-1-f842c3094b18@baylibre.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:24:58 +01:00
Marc Kleine-Budde
4718d39e72 Merge patch series "Add R-Car CAN-FD suspend/resume support"
Biju <biju.das.au@gmail.com> says:

From: Biju Das <biju.das.jz@bp.renesas.com>

This patch series adds proper suspend/resume support to the Renesas
R-Car CAN-FD controller driver, after the customary cleanups and fixes.
It aims to fix CAN-FD operation after resume from s2ram on systems where
PSCI powers down the SoC.

This patch series has been tested on RZ/G3E SMARC EVK and RZ/G2L SMARC
EVK.

This patch series depends upon [1]
[1] https://lore.kernel.org/all/20251123112326.128448-1-biju.das.jz@bp.renesas.com/

v2->v3:
 * Updated commit header and description for patch#3
 * Collected tags.
v1->v2:
 * Added logs from RZ/G3E
 * Collected tags.
 * Moved enabling of RAM clk from probe().
 * Added RAM clk handling in rcar_canfd_global_{,de}init().
 * Fixed the typo in error path of rcar_canfd_resume().

Logs from RZ/G3E:
root@smarc-rzg3e:~# /canfd_t_003_all.sh
 [INFO] Testing can0<->can1 with bitrate 1000000 and dbitrate 4000000
 [INFO] Bringing down can0 can1
 [INFO] Bringing up can0 can1
 [INFO] Testing can1 as producer and can0 as consumer
[  541.705921] can: controller area network core
[  541.710369] NET: Registered PF_CAN protocol family
[  541.753974] can: raw protocol
 [INFO] Testing can0 as producer and can1 as consumer
 [INFO] Testing can0<->can1 with bitrate 500000 and dbitrate 2000000
 [INFO] Bringing down can0 can1
 [INFO] Bringing up can0 can1
 [INFO] Testing can1 as producer and can0 as consumer
 [INFO] Testing can0 as producer and can1 as consumer
 [INFO] Testing can0<->can1 with bitrate 250000 and dbitrate 1000000
 [INFO] Bringing down can0 can1
 [INFO] Bringing up can0 can1
 [INFO] Testing can1 as producer and can0 as consumer
 [INFO] Testing can0 as producer and can1 as consumer

EXIT|PASS|canfd_t_003.sh|[00:00:25] ||

bind/unbind
----------
[  566.821475] rcar_canfd 12440000.can: can_clk rate is 80000000
[  566.828076] rcar_canfd 12440000.can: device registered (channel 1)
[  566.834361] rcar_canfd 12440000.can: can_clk rate is 80000000
[  566.841842] rcar_canfd 12440000.can: device registered (channel 4)
[  566.848093] rcar_canfd 12440000.can: global operational state (canfd clk, fd mode)
 [INFO] Testing can0<->can1 with bitrate 1000000 and dbitrate 4000000
 [INFO] Bringing down can0 can1
 [INFO] Bringing up can0 can1
 [INFO] Testing can1 as producer and can0 as consumer
 [INFO] Testing can0 as producer and can1 as consumer
 [INFO] Testing can0<->can1 with bitrate 500000 and dbitrate 2000000
 [INFO] Bringing down can0 can1
 [INFO] Bringing up can0 can1
 [INFO] Testing can1 as producer and can0 as consumer
 [INFO] Testing can0 as producer and can1 as consumer
 [INFO] Testing can0<->can1 with bitrate 250000 and dbitrate 1000000
 [INFO] Bringing down can0 can1
 [INFO] Bringing up can0 can1
 [INFO] Testing can1 as producer and can0 as consumer
 [INFO] Testing can0 as producer and can1 as consumer

EXIT|PASS|canfd_t_003.sh|[00:00:25] ||

s2idle
-----
[  592.182479] PM: suspend entry (s2idle)
[  592.187031] Filesystems sync: 0.000 seconds
[  592.193221] Freezing user space processes
[  592.199425] Freezing user space processes completed (elapsed 0.002 seconds)
[  592.206450] OOM killer disabled.
[  592.209843] Freezing remaining freezable tasks
[  592.215775] Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
[  592.223247] printk: Suspending console(s) (use no_console_suspend to debug)
[  592.260524] sd 0:0:0:0: [sda] Synchronizing SCSI cache
[  592.322759] renesas-gbeth 15c30000.ethernet end0: Link is Down
[  596.070955] dwmac4: Master AXI performs any burst length
[  596.072307] renesas-gbeth 15c30000.ethernet end0: No Safety Features support found
[  596.072376] renesas-gbeth 15c30000.ethernet end0: IEEE 1588-2008 Advanced Timestamp supported
[  596.077470] renesas-gbeth 15c30000.ethernet end0: configuring for phy/rgmii-id link mode
[  596.087503] dwmac4: Master AXI performs any burst length
[  596.088817] renesas-gbeth 15c40000.ethernet end1: No Safety Features support found
[  596.088881] renesas-gbeth 15c40000.ethernet end1: IEEE 1588-2008 Advanced Timestamp supported
[  596.093997] renesas-gbeth 15c40000.ethernet end1: configuring for phy/rgmii-id link mode
[  596.141986] usb usb1: root hub lost power or was reset
[  596.142031] usb usb2: root hub lost power or was reset
[  598.304525] usb 2-1: reset SuperSpeed Plus Gen 2x1 USB device number 2 using xhci-renesas-hcd
[  598.414846] OOM killer enabled.
[  598.418002] Restarting tasks: Starting
[  598.422518] Restarting tasks: Done
[  598.425999] random: crng reseeded on system resumption
[  598.431248] PM: suspend exit
[  598.661875] renesas-gbeth 15c30000.ethernet end0: Link is Up - 1Gbps/Full - flow control rx/tx
 [INFO] Testing can0<->can1 with bitrate 1000000 and dbitrate 4000000
 [INFO] Bringing down can0 can1
 [INFO] Bringing up can0 can1
 [INFO] Testing can1 as producer and can0 as consumer
 [INFO] Testing can0 as producer and can1 as consumer
 [INFO] Testing can0<->can1 with bitrate 500000 and dbitrate 2000000
 [INFO] Bringing down can0 can1
 [INFO] Bringing up can0 can1
 [INFO] Testing can1 as producer and can0 as consumer
 [INFO] Testing can0 as producer and can1 as consumer
 [INFO] Testing can0<->can1 with bitrate 250000 and dbitrate 1000000
 [INFO] Bringing down can0 can1
 [INFO] Bringing up can0 can1
 [INFO] Testing can1 as producer and can0 as consumer
 [INFO] Testing can0 as producer and can1 as consumer

EXIT|PASS|canfd_t_003.sh|[00:00:25] ||

Link: https://patch.msgid.link/20251124102837.106973-1-biju.das.jz@bp.renesas.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:21:57 +01:00
Geert Uytterhoeven
3a34330f63 can: rcar_canfd: Add suspend/resume support
On R-Car Gen3 using PSCI, s2ram powers down the SoC.  After resume, the
CAN-FD interface no longer works.  Trying to bring it up again fails:

    # ip link set can0 up
    RTNETLINK answers: Connection timed out

    # dmesg
    ...
    channel 0 communication state failed

Fix this by populating the (currently empty) suspend and resume
callbacks, to stop/start the individual CAN-FD channels, and
(de)initialize the CAN-FD controller.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Tested-by: Biju Das <biju.das.jz@bp.renesas.com>
Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
Link: https://patch.msgid.link/20251124102837.106973-8-biju.das.jz@bp.renesas.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:21:56 +01:00
Geert Uytterhoeven
161266c754 can: rcar_canfd: Convert to DEFINE_SIMPLE_DEV_PM_OPS()
Convert the Renesas R-Car CAN-FD driver from SIMPLE_DEV_PM_OPS() to
DEFINE_SIMPLE_DEV_PM_OPS() and pm_sleep_ptr().  This lets us drop the
__maybe_unused annotations from its suspend and resume callbacks, and
reduces kernel size in case CONFIG_PM or CONFIG_PM_SLEEP is disabled.
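As a minimal sketch of the resulting shape (with a hypothetical "foo"
driver, not the actual rcar_canfd code):

  #include <linux/platform_device.h>
  #include <linux/pm.h>

  /* No __maybe_unused needed: DEFINE_SIMPLE_DEV_PM_OPS() together with
   * pm_sleep_ptr() lets the compiler drop both the ops structure and the
   * callbacks when CONFIG_PM_SLEEP is disabled.
   */
  static int foo_suspend(struct device *dev)
  {
          return 0;       /* e.g. stop channels, deinit controller */
  }

  static int foo_resume(struct device *dev)
  {
          return 0;       /* e.g. reinit controller, restart channels */
  }

  static DEFINE_SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);

  static struct platform_driver foo_driver = {
          .driver = {
                  .name   = "foo",
                  .pm     = pm_sleep_ptr(&foo_pm_ops),
          },
  };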

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Biju Das <biju.das.jz@bp.renesas.com>
Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
Link: https://patch.msgid.link/20251124102837.106973-7-biju.das.jz@bp.renesas.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:21:56 +01:00
Geert Uytterhoeven
ddf9bbf22b can: rcar_canfd: Invert CAN clock and close_candev() order
The CAN clock is enabled before calling open_candev(), and disabled
before calling close_candev().  Invert the order of the latter, to
restore symmetry.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Biju Das <biju.das.jz@bp.renesas.com>
Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
Link: https://patch.msgid.link/20251124102837.106973-6-biju.das.jz@bp.renesas.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:21:55 +01:00
Geert Uytterhoeven
fa5f4ec8ff can: rcar_canfd: Extract rcar_canfd_global_{,de}init()
Extract the code to (de)initialize global state into separate functions,
for future reuse.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
Link: https://patch.msgid.link/20251124102837.106973-5-biju.das.jz@bp.renesas.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:21:55 +01:00
Biju Das
eda3d6c8d7 can: rcar_canfd: Use devm_clk_get_optional() for RAM clk
Replace devm_clk_get_optional_enabled()->devm_clk_get_optional() as the
RAM clk needs to be enabled in resume for proper operation in STR mode
for RZ/G3E SoC.
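A rough sketch of the idea, with hypothetical field and clock names
rather than the actual driver code:

  /* probe: get the optional RAM clock without enabling it */
  priv->ram_clk = devm_clk_get_optional(dev, "ram");
  if (IS_ERR(priv->ram_clk))
          return PTR_ERR(priv->ram_clk);

  /* resume: enable it explicitly, as required for STR on RZ/G3E */
  err = clk_prepare_enable(priv->ram_clk);
  if (err)
          return err;

  /* suspend: balance the enable */
  clk_disable_unprepare(priv->ram_clk);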

Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://patch.msgid.link/20251124102837.106973-4-biju.das.jz@bp.renesas.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:21:55 +01:00
Geert Uytterhoeven
790ec4c453 can: rcar_canfd: Invert global vs. channel teardown
Global state is initialized and torn down before per-channel state.
Invert the order to restore symmetry.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Vincent Mailhol <mailhol@kernel.org>
Reviewed-by: Biju Das <biju.das.jz@bp.renesas.com>
Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
Link: https://patch.msgid.link/20251124102837.106973-3-biju.das.jz@bp.renesas.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:21:55 +01:00
Geert Uytterhoeven
41c13eaf39 can: rcar_canfd: Invert reset assert order
The two resets are asserted during cleanup in the same order as they
were deasserted during probe.  Invert the order to restore symmetry.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Vincent Mailhol <mailhol@kernel.org>
Reviewed-by: Biju Das <biju.das.jz@bp.renesas.com>
Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
Link: https://patch.msgid.link/20251124102837.106973-2-biju.das.jz@bp.renesas.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:21:55 +01:00
Marc Kleine-Budde
113aa9101a Merge patch series "can: netlink: add CAN XL support"
Marc Kleine-Budde <mkl@pengutronix.de> says:

Similarly to how CAN FD reuses the bittiming logic of Classical CAN, CAN XL
also reuses the entirety of CAN FD features, and, on top of that, adds new
features which are specific to CAN XL.

A so-called 'mixed-mode' is intended to have (XL-tolerant) CAN FD nodes and
CAN XL nodes on one CAN segment, where the FD-controllers can talk CC/FD
and the XL-controllers can talk CC/FD/XL. This mixed-mode utilizes the
known error-signalling (ES) for sending CC/FD/XL frames. For CAN FD and CAN
XL the transceiver delay compensation (TDC) is supported to use common CAN
and CAN-SIG transceivers.

The CANXL-only mode disables the error-signalling in the CAN XL controller.
This mode does not allow CC/FD frames to be sent, but additionally offers
CAN XL transceiver mode switching (TMS) to send CAN XL frames with data
rates of up to 20 Mbit/s. The TMS utilizes a PWM configuration which is added to
the netlink interface.

Configured with CAN_CTRLMODE_FD and CAN_CTRLMODE_XL this leads to:

FD=0 XL=0 CC-only mode         (ES=1)
FD=1 XL=0 FD/CC mixed-mode     (ES=1)
FD=1 XL=1 XL/FD/CC mixed-mode  (ES=1)
FD=0 XL=1 XL-only mode         (ES=0, TMS optional)

Patch #1 prints the defined ctrlmode strings capitalized to increase
readability and to be in line with the 'ip' tool (iproute2).

Patch #2 is a small clean-up which makes can_calc_bittiming() use
NL_SET_ERR_MSG() instead of netdev_err().

Patch #3 adds a check in can_dev_dropped_skb() to drop CAN FD frames
when CAN FD is turned off.

Patch #4 adds CAN_CTRLMODE_RESTRICTED. Note that contrary to the other
CAN_CTRL_MODE_XL_* that are introduced in the later patches, this control
mode is not specific to CAN XL. The nuance is that because this restricted
mode was only added in ISO 11898-1:2024, it is made mandatory for CAN XL
devices but optional for other protocols. This is why this patch is added
as a preparation before introducing the core CAN XL logic.

Patch #5 adds all the CAN XL features which are inherited from CAN FD: the
nominal bittiming, the data bittiming and the TDC.

Patch #6 adds a new CAN_CTRLMODE_XL_TMS control mode, specific to CAN XL,
to enable the transceiver mode switching (TMS) in XL-only mode.

Patch #7 adds a check in can_dev_dropped_skb() to drop CAN CC/FD frames
when the CAN XL controller is in CAN XL-only mode. The introduced
can_dev_in_xl_only_mode() function also determines the error-signalling
configuration for the CAN XL controllers.

Patches #8 to #11 add the PWM logic for the CAN XL TMS mode.

Patches #12 to #14 add different default sample points for standard CAN and
CAN SIG transceivers (with TDC) and for CAN XL transceivers using PWM in the
CAN XL TMS mode.

Patch #15 adds a dummy_can driver for netlink testing and debugging.

Patch #16 checks the CAN frame type (CC/FD/XL) when writing such frames to
a CAN_RAW socket and rejects them if the type is not supported by the CAN
interface.

Patch #17 increases the resolution when printing the bitrate error and
rounds the value up to 0.01% in case the increased resolution would still
yield 0.00%.

Link: https://patch.msgid.link/20251126-canxl-v8-0-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:45 +01:00
Oliver Hartkopp
b360a13d44 can: dev: print bitrate error with two decimal digits
Increase the resolution when printing the bitrate error and round the
value up to 0.01% in case the increased resolution would still yield
0.00%.

Suggested-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-17-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:44 +01:00
Oliver Hartkopp
1a620a7238 can: raw: instantly reject unsupported CAN frames
For real CAN interfaces the CAN_CTRLMODE_FD and CAN_CTRLMODE_XL control
modes indicate whether an interface can handle those CAN FD/XL frames.

In the case that a CAN XL interface is configured in CANXL-only mode with
error-signalling disabled, neither CAN CC nor CAN FD frames can be sent.

The checks are performed on CAN_RAW sockets to give an instant feedback
to the user when writing unsupported CAN frames to the interface.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-16-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:44 +01:00
Vincent Mailhol
816cf430e8 can: add dummy_can driver
During the development of CAN XL, we found the need to create a
dummy CAN XL driver in order to test the new netlink interface. While
this code was initially intended to be throwaway, it received some
positive feedback.

Add the dummy_can driver. This driver acts similarly to the vcan
interface in the sense that it will echo back any packet it receives.
The difference is that it exposes a set of bittiming parameters as a
real device would and thus must be configured as if it were a real
physical interface.

The driver comes with a debug mode. If debug messages are enabled (for
example by enabling CONFIG_CAN_DEBUG_DEVICES), it will print in the
kernel log all the bittiming values, similar to what a:

  ip --details link show can0

would do.

This driver is mostly intended for debugging and testing, but some
developers also may want to look at it as a simple reference
implementation.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-15-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:44 +01:00
Vincent Mailhol
f5de373ae4 can: calc_bittiming: add can_calc_sample_point_pwm()
The optimum sample point value depends on the bit symmetry. The more
asymmetric the bit is, the more the sample point would be located
towards the end of the bit. On the contrary, if the transceiver only
has a small asymmetry, the optimal sample point would be slightly
after the centre of the bit.

For NRZ encoding (used by Classical CAN, CAN FD and CAN XL with TMS
off), the optimum sample points values are above 70% as implemented in
can_calc_sample_point_nrz().

When TMS is on, CAN XL optimum sample points are near 50% or
60% [1]. Add can_calc_sample_point_pwm() which returns a sample point
which is suitable for PWM encoding. We crafted the formula to make it
return the same values as below table (source: table 3 of [1]).

       Bit rate (Mbits/s)	Sample point
       -------------------------------------
         2.0			 51.3%
         5.0			 53.1%
         8.0			 55.0%
        10.0			 56.3%
        12.3			 53.8%
        13.3			 58.3%
        14.5			 54.5%
        16.0			 60.0%
        17.7			 55.6%
        20.0			 62.5%

The calculation simply consists of setting a slightly too high sample
point and then letting can_update_sample_point() correct the values.

For now, it is just a formula up our sleeve which matches the
empirical observations of [1]. Once CiA recommendations become
available, can_calc_sample_point_pwm() should be updated accordingly.

[1] CAN XL system design: Clock tolerances and edge deviations
Link: https://www.can-cia.org/fileadmin/cia/documents/publications/cnlm/december_2024/cnlm_24-4_p18_can_xl_system_design_clock_tolerances_and_edge_deviations_dr_arthur_mutter_bosch.pdf

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-14-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:44 +01:00
Vincent Mailhol
a6ddf91a4f can: calc_bittiming: add can_calc_sample_point_nrz()
The CAN XL optimal sample point for PWM encoding (when TMS is on)
differs from the NRZ optimal one. There is thus a need to calculate a
different sample point depending on whether TMS is on or off.

This is a preparation change: move the sample point calculation from
can_calc_bittiming() into the new can_calc_sample_point_nrz()
function.

In an upcoming change, a function will be added to calculate the
sample point for PWM encoding.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-13-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:44 +01:00
Vincent Mailhol
1d147cb7c5 can: calc_bittiming: replace misleading "nominal" by "reference"
The functions can_update_sample_point() and can_calc_bittiming() are
generic and meant to be used for both the nominal and the data bittiming
calculation.

However, those functions use misleading terminologies such as "bitrate
nominal" or "sample point nominal". Replace all places where the word
"nominal" appears with "reference" in order to better distinguish it from
the calculated values.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-12-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:44 +01:00
Vincent Mailhol
46552323fa can: netlink: add PWM netlink interface
When the TMS is switched on, the node uses PWM (Pulse Width
Modulation) during the data phase instead of the classic NRZ (Non
Return to Zero) encoding.

PWM is configured by three parameters:

  - PWMS: Pulse Width Modulation Short phase
  - PWML: Pulse Width Modulation Long phase
  - PWMO: Pulse Width Modulation Offset time

For each of these parameters, define three IFLA symbols:

  - IFLA_CAN_PWM_PWM*_MIN: the minimum allowed value.
  - IFLA_CAN_PWM_PWM*_MAX: the maximum allowed value.
  - IFLA_CAN_PWM_PWM*: the runtime value.

This results in a total of nine IFLA symbols which are all nested in a
parent IFLA_CAN_XL_PWM symbol.
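Expanding the wildcard from the text, the nesting is (illustration only):

  IFLA_CAN_XL_PWM (nest)
    IFLA_CAN_PWM_PWMS_MIN, IFLA_CAN_PWM_PWMS_MAX, IFLA_CAN_PWM_PWMS
    IFLA_CAN_PWM_PWML_MIN, IFLA_CAN_PWM_PWML_MAX, IFLA_CAN_PWM_PWML
    IFLA_CAN_PWM_PWMO_MIN, IFLA_CAN_PWM_PWMO_MAX, IFLA_CAN_PWM_PWMO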

IFLA_CAN_PWM_PWM*_MIN and IFLA_CAN_PWM_PWM*_MAX define the range of
allowed values and will match the value statically configured by the
device in struct can_pwm_const.

IFLA_CAN_PWM_PWM* match the runtime values stored in struct can_pwm.
Those parameters may only be configured when the tms mode is on. If
the PWMS, PWML and PWMO parameters are provided, check that all the
needed parameters are present using can_validate_pwm(), then check
their value using can_validate_pwm_bittiming(). PWMO defaults to zero
if omitted. Otherwise, if CAN_CTRLMODE_XL_TMS is true but none of the
PWM parameters are provided, calculate them using can_calc_pwm().

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-11-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:43 +01:00
Vincent Mailhol
9892339cf0 can: calc_bittiming: add PWM calculation
Perform the PWM calculation according to CiA recommendations.

Note that for data bitrates greater than 5 Mbit/s, tqmin is less than
CAN_PWM_NS_MAX (which is defined as 200 nanoseconds); consequently,
the result of the division:

  DIV_ROUND_UP(xl_ns, CAN_PWM_NS_MAX)

is one and thus the for loop automatically stops on the first
iteration, giving a single PWM symbol per bit as expected. Because of
that, there is no actual need for a separate conditional branch for
when the data bitrate is greater than 5 Mbit/s.
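As a numeric illustration (assuming xl_ns is the XL data-bit duration in
nanoseconds, which is how the text reads): at 10 Mbit/s the bit lasts
100 ns, so DIV_ROUND_UP(100, 200) == 1 and a single PWM symbol covers the
bit; at 2 Mbit/s the bit lasts 500 ns, so DIV_ROUND_UP(500, 200) == 3 and
the loop would presumably emit three PWM symbols per bit.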

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-10-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:43 +01:00
Vincent Mailhol
8e2a2885a2 can: bittiming: add PWM validation
Add can_validate_pwm() to validate the values pwms, pwml and pwmo.
Error messages are added to each of the checks to inform the user on
what went wrong. Refer to those error messages to understand the
validation logic.

The boundary values CAN_PWM_DECODE_NS (the transceiver minimum
decoding margin) and CAN_PWM_NS_MAX (the maximum PWM symbol duration)
are hardcoded for the moment. Note that a transceiver capable of
bitrates higher than 20 Mbps may be able to handle a CAN_PWM_DECODE_NS
below 5 ns. If such transceivers become commercially available, this
code could be revisited to make this parameter configurable. For now,
leave it static.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-9-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:43 +01:00
Vincent Mailhol
f6ccc2b293 can: bittiming: add PWM parameters
In CAN XL, higher data bit rates require the CAN transceiver to switch
its operation mode to use Pulse-Width Modulation (PWM) transmission
mode instead of the classic dominant/recessive transmission mode.

The PWM parameters are:

  - PWMS: pulse width modulation short phase
  - PWML: pulse width modulation long phase
  - PWMO: pulse width modulation offset

CiA 612-2 specifies PWMS and PWML to be at least 1 (arguably, PWML
shall be at least 2 to respect the PWMS < PWML rule). PWMO's minimum
is expected to always be zero. It is added more for consistency than
anything else.

Add struct can_pwm_const so that the different devices can provide
their minimum and maximum values.

When TMS is on, the runtime PWMS, PWML and PWMO are needed (either
calculated or provided by the user): add struct can_pwm to store
these.

TDC and PWM can not be used at the same time (TDC can only be used
when TMS is off and PWM only when TMS is on). struct can_pwm is thus
put together with struct can_tdc inside a union to save some space.

The netlink logic will be added in an upcoming change.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-8-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:43 +01:00
Oliver Hartkopp
6df01533e5 can: dev: can_dev_dropped_skb: drop CC/FD frames in CANXL-only mode
The error-signalling (ES) is a mandatory functionality for CAN CC and
CAN FD to report CAN frame format violations by sending an error-frame
signal on the bus.

A so-called 'mixed-mode' is intended to have (XL-tolerant) CAN FD nodes
and CAN XL nodes on one CAN segment, where the FD-controllers can talk
CC/FD and the XL-controllers can talk CC/FD/XL. This mixed-mode
utilizes the error-signalling for sending CC/FD/XL frames.

The CANXL-only mode disables the error-signalling in the CAN XL
controller. This mode does not allow CC/FD frames to be sent, but
additionally offers CAN XL transceiver mode switching (TMS).

Configured with CAN_CTRLMODE_FD and CAN_CTRLMODE_XL this leads to:

FD=0 XL=0 CC-only mode         (ES=1)
FD=1 XL=0 FD/CC mixed-mode     (ES=1)
FD=1 XL=1 XL/FD/CC mixed-mode  (ES=1)
FD=0 XL=1 XL-only mode         (ES=0, TMS optional)

The helper function can_dev_in_xl_only_mode() determines the required
value to disable error signalling in the CAN XL controller.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-7-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:43 +01:00
Vincent Mailhol
233134af20 can: netlink: add CAN_CTRLMODE_XL_TMS flag
The Transceiver Mode Switching (TMS) indicates whether the CAN XL
controller shall use the PWM or NRZ encoding during the data phase.

The term "transceiver mode switching" is used in both ISO 11898-1 and
CiA 612-2 (although only the latter one uses the abbreviation TMS). We
adopt the same naming convention here for consistency.

Add the CAN_CTRLMODE_XL_TMS flag to the list of the CAN control modes.

Add can_validate_xl_flags() to check the coherency of the TMS flag.
That function will be reused in upcoming changes to validate the other
CAN XL flags.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-6-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:43 +01:00
Vincent Mailhol
e632816147 can: netlink: add initial CAN XL support
CAN XL uses bittiming parameters different from Classical CAN and CAN
FD. Thus, all the data bittiming parameters, including TDC, need to be
duplicated for CAN XL.

Add the CAN XL netlink interface for all the features which are common
with CAN FD. Any new CAN XL specific features are added later on.

The first time CAN XL is activated, the MTU is set by default to
CANXL_MAX_MTU. The user may then configure a custom MTU within the
CANXL_MIN_MTU to CANXL_MAX_MTU range, in which case, the custom MTU
value will be kept as long as CAN XL remains active.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-5-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:43 +01:00
Vincent Mailhol
60f511f443 can: netlink: add CAN_CTRLMODE_RESTRICTED
ISO 11898-1:2024 adds a new restricted operation mode. This mode is
added as a mandatory feature for nodes which support CAN XL and is
retrofitted as optional for legacy nodes (i.e. the ones which only
support Classical CAN and CAN FD).

The restricted operation mode is nearly the same as the listen only
mode: the node can not send data frames or remote frames and can not
send dominant bits if an error occurs. The only exception is that the
node shall still send the acknowledgment bit. A second niche exception
is that the node may still send a data frame containing a time
reference message if the node is a primary time provider, but because
the time provider feature is not yet implemented in the kernel, this
second exception is not relevant to us at the moment.

Add the CAN_CTRLMODE_RESTRICTED control mode flag and update the
can_dev_dropped_skb() helper function accordingly.

Finally, bail out if both CAN_CTRLMODE_LISTENONLY and
CAN_CTRLMODE_RESTRICTED are provided.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-4-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:43 +01:00
Vincent Mailhol
d037d05c2e can: dev: can_dev_dropped_skb: drop CAN FD skbs if FD is off
Currently, the CAN FD skb validation logic is based on the MTU: the
interface is deemed FD capable if and only if its MTU is greater than
or equal to CANFD_MTU.

This logic is showing its limit with the introduction of CAN XL. For
example, consider the two scenarios below:

  1. An interface configured with CAN FD on and CAN XL on

  2. An interface configured with CAN FD off and CAN XL on

In those two scenarios, the interfaces would have the same MTU:

  CANXL_MTU

making it impossible to differentiate which one has CAN FD turned on
and which one has it off.

Because of this limitation, the only non-UAPI-breaking workaround is to
do the check at the device level using the can_priv->ctrlmode flags.
Unfortunately, the virtual interfaces (vcan, vxcan), which do not have
a can_priv, are left behind.

Add a check on the CAN_CTRLMODE_FD flag in can_dev_dropped_skb() and
drop FD frames whenever the feature is turned off.
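A rough sketch of such a check (not the actual helper body; names taken
from the CAN core as used elsewhere in this series):

  /* Drop CAN FD frames when CAN FD is switched off in the device's
   * ctrlmode, independent of the MTU-based check.
   */
  if (can_is_canfd_skb(skb) && !(priv->ctrlmode & CAN_CTRLMODE_FD))
          goto drop;      /* hypothetical label */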

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-3-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:43 +01:00
Vincent Mailhol
585a4f22c4 can: bittiming: apply NL_SET_ERR_MSG() to can_calc_bittiming()
When CONFIG_CAN_CALC_BITTIMING is disabled, the can_calc_bittiming()
function cannot be used and the user needs to provide all the
bittiming parameters.

Currently, can_calc_bittiming() prints an error message to the kernel
log. Instead use NL_SET_ERR_MSG() to make it return the error message
through the netlink interface so that the user can directly see it.
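A minimal sketch of the difference, assuming a struct netlink_ext_ack
*extack is available at that point and with a made-up message text:

  /* before: only visible in the kernel log */
  netdev_err(dev, "bit-timing calculation not available\n");

  /* after: returned through the netlink extended ack, so the 'ip'
   * command prints it directly to the user
   */
  NL_SET_ERR_MSG(extack, "bit-timing calculation not available");
  return -EINVAL;         /* error code illustrative */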

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-2-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:43 +01:00
Oliver Hartkopp
66e75b2758 can: dev: can_get_ctrlmode_str: use capitalized ctrlmode strings
Align the ctrlmode-related strings with the command line options of the
'ip' tool from the iproute2 package. The capitalized strings are also
shown when the detailed interface configuration is printed by 'ip'.

Suggested-by: Stephane Grosjean <stephane.grosjean@hms-networks.com>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://patch.msgid.link/20251126-canxl-v8-1-e7e3eb74f889@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-26 11:20:43 +01:00
Lachlan Hodges
f9e788c5fd wifi: mac80211: allow sharing identical chanctx for S1G interfaces
Introduce support for sharing identical channel contexts for S1G
interfaces. Additionally, do not downgrade channel requests for
S1G interfaces.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20251126015758.149034-1-lachlan.hodges@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-26 10:34:51 +01:00
Dimitri Daskalakis
ab084f0b8d drivers: net: fbnic: Return the true error in fbnic_alloc_napi_vectors.
The error case in fbnic_alloc_napi_vectors defaulted to returning
ENOMEM. This can mask the true error case, causing confusion.

Signed-off-by: Dimitri Daskalakis <dimitri.daskalakis1@gmail.com>
Link: https://patch.msgid.link/20251124200518.1848029-1-dimitri.daskalakis1@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:52:58 -08:00
Jakub Kicinski
7a57b32523 Merge branch 'selftest-af_unix-misc-updates'
Kuniyuki Iwashima says:

====================
selftest: af_unix: Misc updates.

Patch 1 adds .gitignore under tools/testing/selftests/net/af_unix/.

Patch 2 makes so_peek_off.c less flaky.
====================

Link: https://patch.msgid.link/20251124212805.486235-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:52:30 -08:00
Kuniyuki Iwashima
ebe2f0b3cf selftest: af_unix: Extend recv() timeout in so_peek_off.c.
so_peek_off.c is reported to be flaky on NIPA:

  # # so_peek_off.c:149:two_chunks_overlap_blocking:Expected -1 (-1) != bytes (-1)
  # # two_chunks_overlap_blocking: Test terminated by assertion
  # #          FAIL  so_peek_off.stream.two_chunks_overlap_blocking

The test fork()s a child process to send() data after 1ms to
wake up the parent process being blocked (up to 3ms) on recv().

But, from the log, the parent woke up after the 3ms timeout, so it
could be too short when the host is overloaded.

Let's extend it to 5s.
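For reference, one common way to put an upper bound on a blocking recv()
is SO_RCVTIMEO; a hedged sketch only, not necessarily how so_peek_off.c
implements its timeout:

  struct timeval tv = { .tv_sec = 5 };    /* 5 s instead of a few ms */

  if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
          perror("setsockopt(SO_RCVTIMEO)");

  /* recv() now returns -1 with errno EAGAIN/EWOULDBLOCK on timeout */
  ssize_t bytes = recv(fd, buf, sizeof(buf), 0);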

Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/netdev/20251124070722.1e828c53@kernel.org/
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251124212805.486235-3-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:52:28 -08:00
Kuniyuki Iwashima
adb6b68c50 selftest: af_unix: Create its own .gitignore.
Somehow AF_UNIX tests have reused ../.gitignore,
but now NIPA warns about it.

Let's create .gitignore under af_unix/.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251124212805.486235-2-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:52:28 -08:00
Liming Wu
cfeb7cd80f virtio_net: enhance wake/stop tx queue statistics accounting
This patch refines and strengthens the statistics collection of TX queue
wake/stop events introduced by commit c39add9b24 ("virtio_net: Add TX
stopped and wake counters").

Previously, the driver only recorded partial wake/stop statistics
for TX queues. Some wake events triggered by 'skb_xmit_done()' or resume
operations were not counted, which made the per-queue metrics incomplete.

Signed-off-by: Liming Wu <liming.wu@jaguarmicro.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://patch.msgid.link/20251120015320.1418-1-liming.wu@jaguarmicro.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:38:01 -08:00
Jakub Kicinski
8ccd116016 Merge branch 'tcp-provide-better-locality-for-retransmit-timer'
Eric Dumazet says:

====================
tcp: provide better locality for retransmit timer

TCP stack uses three timers per flow, currently spread this way:

- sk->sk_timer : keepalive timer
- icsk->icsk_retransmit_timer : retransmit timer
- icsk->icsk_delack_timer : delayed ack timer

This series moves the retransmit timer to the sk->sk_timer location,
to increase data locality in TX paths.

Keepalive timers are not often used, so this change should be neutral
for them.

After the series we have the following fields:

- sk->tcp_retransmit_timer : retransmit timer, in sock_write_tx group
- icsk->icsk_delack_timer : delayed ack timer
- icsk->icsk_keepalive_timer : keepalive timer

Moving icsk_delack_timer to a better location would also be welcome.
====================

Link: https://patch.msgid.link/20251124175013.1473655-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:28:33 -08:00
Eric Dumazet
9a5e5334ad tcp: remove icsk->icsk_retransmit_timer
Now that sk->sk_timer is no longer used by TCP keepalive, we can use
its storage for TCP and MPTCP retransmit timers for better
cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251124175013.1473655-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:28:29 -08:00
Eric Dumazet
08dfe37023 tcp: introduce icsk->icsk_keepalive_timer
sk->sk_timer has been used for TCP keepalives.

Keepalive timers are not in fast path, we want to use sk->sk_timer
storage for retransmit timers, for better cache locality.

Create icsk->icsk_keepalive_timer and change keepalive
code to no longer use sk->sk_timer.

Added space is reclaimed in the following patch.

This includes changes to MPTCP, which was also using sk_timer.

Alias icsk->mptcp_tout_timer and icsk->icsk_keepalive_timer
for inet_sk_diag_fill() sake.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251124175013.1473655-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:28:29 -08:00
Eric Dumazet
27e8257a86 net: move sk_dst_pending_confirm and sk_pacing_status to sock_read_tx group
These two fields are mostly read in the TCP TX path; move them
to a more appropriate group for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251124175013.1473655-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:28:29 -08:00
Eric Dumazet
3a6e8fd0bf tcp: rename icsk_timeout() to tcp_timeout_expires()
In preparation for the sk->tcp_timeout_timer introduction,
rename icsk_timeout() helper and change its argument to plain
'const struct sock *sk'.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251124175013.1473655-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:28:28 -08:00
Alexander Lobakin
436fa8e7d1 ice: fix broken Rx on VFs
Since the tagged commit, ice stopped respecting the Rx buffer length
passed from VFs.
At that point, the buffer length was hardcoded in ice, so VFs still
worked up to some point (until, for example, a VF wanted an MTU
larger than its PF's).
The next commit 93f53db9f9 ("ice: switch to Page Pool"), broke
Rx on VFs completely since ice started accounting per-queue buffer
lengths again, but now VF queues always had their length zeroed, as
ice was already ignoring what iavf was passing to it.

Restore the line that initializes the buffer length on VF queues
based on the virtchnl messages.

Fixes: 3a4f419f75 ("ice: drop page splitting and recycling")
Reported-by: Jakub Slepecki <jakub.slepecki@intel.com>
Reviewed-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Jakub Slepecki <jakub.slepecki@intel.com>
Link: https://patch.msgid.link/20251124170735.3077425-1-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:26:27 -08:00
Gustavo A. R. Silva
d696c73716 chtls: Avoid -Wflex-array-member-not-at-end warning
-Wflex-array-member-not-at-end was introduced in GCC-14, and we are
getting ready to enable it globally.

Use the `DEFINE_RAW_FLEX()` helper for on-stack definitions of
a flexible structure where the size of the flexible-array member
is known at compile-time, and refactor the rest of the code,
accordingly.

So, with these changes, fix the following warning:

drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c:163:36: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
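A minimal sketch of the helper with a made-up structure (not the chtls
code):

  /* DEFINE_RAW_FLEX(TYPE, NAME, MEMBER, COUNT) reserves on-stack storage
   * sized for TYPE plus COUNT elements of its flexible-array MEMBER and
   * declares NAME as a TYPE pointer to it, so no struct containing a
   * flexible array ends up embedded in the middle of another object.
   */
  struct msg {
          u32 len;
          u8  data[];     /* flexible array member */
  };

  DEFINE_RAW_FLEX(struct msg, m, data, 16);

  m->len = 16;
  m->data[0] = 0xab;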

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://patch.msgid.link/aSQocKoJGkN0wzEj@kspp
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:25:24 -08:00
Jakub Kicinski
864f3eda00 Merge branch 'tools-ynl-gen-regeneration-comment-function-prefix'
Asbjørn Sloth Tønnesen says:

====================
tools: ynl-gen: regeneration comment + function prefix

It looks like these two patches are the last ones needed
for YNL, before the WireGuard patches can go in.

These patches were both requested by Jason during review
of the WireGuard YNL conversion patchset[1].
====================

Link: https://patch.msgid.link/20251120174429.390574-1-ast@fiberby.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:20:47 -08:00
Asbjørn Sloth Tønnesen
68e83f3472 tools: ynl-gen: add regeneration comment
Add a comment on regeneration to the generated files.

The comment is placed after the YNL-GEN line[1], so as not to interfere
with ynl-regen.sh's detection logic.

[1] and after the optional YNL-ARG line.

Link: https://lore.kernel.org/r/aR5m174O7pklKrMR@zx2c4.com/
Suggested-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Acked-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251120174429.390574-3-ast@fiberby.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:20:42 -08:00
Asbjørn Sloth Tønnesen
17fa6ee35b tools: ynl-gen: add function prefix argument
This patch adds a new CLI argument for overriding the default
function prefix, as used for naming the doit/dumpit functions
in the generated kernel code.

When not specified, the default "$(FAMILY)-nl" is used.

This can also be specified persistently in generated files:
  /* YNL-ARG --function-prefix wg */

In the above example it causes the following changes:
  wireguard_nl_get_device_dumpit() -> wg_get_device_dumpit()
  wireguard_nl_get_device_doit()   -> wg_get_device_doit()

The variable name fn_prefix was chosen as it relates to op_prefix,
which is used to prefix the UAPI commands enum entries.

Link: https://lore.kernel.org/r/aRvWzC8qz3iXDAb3@zx2c4.com/
Suggested-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Link: https://patch.msgid.link/20251120174429.390574-2-ast@fiberby.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:20:42 -08:00
Jakub Kicinski
97a88d9e2a Merge branch 'ptp-ocp-a-fix-and-refactoring'
Andy Shevchenko says:

====================
ptp: ocp: A fix and refactoring

Here is the fix for incorrect use of %ptT with the associated
refactoring and additional cleanups.

Note that %ptS, which is introduced in another series that is already
applied to the PRINTK tree, doesn't fit here; that's why this fix
is separated from that series.
====================

Link: https://patch.msgid.link/20251124084816.205035-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:17:23 -08:00
Andy Shevchenko
648282e2d1 ptp: ocp: Reuse META's PCI vendor ID
META's PCI vendor ID is already listed in pci_ids.h.
Reuse it here.

Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20251124084816.205035-5-andriy.shevchenko@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:17:22 -08:00
Andy Shevchenko
4c84a5c7b0 ptp: ocp: Apply standard pattern for cleaning up loop
The while (i--) loop is a standard pattern for cleanup.
Apply this pattern where it makes sense in the driver.
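The pattern in question, as a generic sketch with hypothetical helpers:

  /* register items 0..n-1; on failure, unwind the ones already done
   * in reverse order
   */
  for (i = 0; i < n; i++) {
          err = register_item(&items[i]);
          if (err)
                  goto unwind;
  }
  return 0;

  unwind:
  while (i--)
          unregister_item(&items[i]);
  return err;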

Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20251124084816.205035-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:17:21 -08:00
Andy Shevchenko
590f5d1fa6 ptp: ocp: Make ptp_ocp_unregister_ext() NULL-aware
It's a common practice to make resource release functions NULL-aware.
Make ptp_ocp_unregister_ext() NULL-aware.
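The conventional shape of such a helper, as a generic sketch (hypothetical
names, teardown body elided):

  static void unregister_ext(struct ext_src *ext)
  {
          /* like kfree(): accept NULL so callers can release
           * unconditionally in error and teardown paths
           */
          if (!ext)
                  return;

          /* ... actual teardown ... */
  }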

Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20251124084816.205035-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:17:21 -08:00
Andy Shevchenko
622cc66ed7 ptp: ocp: Refactor signal_show() and fix %ptT misuse
Refactor signal_show() to avoid sequential calls to sysfs_emit*()
and use the same pattern to get the index of a signal as it's done
in signal_store().

While at it, fix wrong use of %ptT against struct timespec64.
It is somewhat lucky that it worked, only because the first member
there is 64-bit and of type time64_t. Now, with %ptS, it can be used
correctly.

Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20251124084816.205035-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:17:21 -08:00
Michal Luczaj
b796632fc8 vsock/test: Extend transport change null-ptr-deref test
syzkaller reported a lockdep lock order inversion warning[1] due to
commit 687aa0c558 ("vsock: Fix transport_* TOCTOU"). This was fixed in
commit f7c877e753 ("vsock: fix lock inversion in
vsock_assign_transport()").

Redo syzkaller's repro by piggybacking on a somewhat related test
implemented in commit 3a764d9338 ("vsock/test: Add test for null ptr
deref when transport changes").

[1]: https://lore.kernel.org/netdev/68f6cdb0.a70a0220.205af.0039.GAE@google.com/

Signed-off-by: Michal Luczaj <mhal@rbox.co>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20251123-vsock_test-linger-lockdep-warn-v1-1-4b1edf9d8cdc@rbox.co
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:16:21 -08:00
Heiner Kallweit
87ad869fea r8169: improve MAC EEE handling
Let phydev->enable_tx_lpi control whether MAC enables TX LPI, instead of
enabling it unconditionally. This way TX LPI is disabled if e.g. the
link partner doesn't support EEE. This helps to avoid potential issues like
link flaps.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/91bcb837-3fab-4b4e-b495-038df0932e44@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:14:34 -08:00
Daniel Golle
de1e5c9333 net: phy: mxl-gpy: add support for MxL86252 and MxL86282
Add PHY driver support for Maxlinear MxL86252 and MxL86282 switches.
The PHYs built into those switches are just like any other GPY 2.5G PHYs,
with the exception of the temperature sensor data being encoded in a
different way.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/a6cd7fe461b011cec2b59dffaf34e9c8b0819059.1763818120.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:11:38 -08:00
Chad Monroe
9d844da693 net: phy: mxl-gpy: add support for MxL86211C
MxL86211C is a smaller and more efficient version of the GPY211C.
Add the PHY ID and phy_driver instance to the mxl-gpy driver.

Signed-off-by: Chad Monroe <chad@monroe.io>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/cabf3559d6511bed6b8a925f540e3162efc20f6b.1763818120.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 19:11:38 -08:00
Buday Csaba
ce28e333d6 net: mdio: remove redundant fwnode cleanup
Remove redundant fwnode cleanup in of_mdiobus_register_device()
and xpcs_plat_init_dev().

mdio_device_free() eventually calls mdio_device_release(),
which already performs fwnode_handle_put(), making the manual
cleanup unnecessary.

Combine fwnode_handle_get() with device_set_node() in
of_mdiobus_register_device() for clarity.
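A rough sketch of the combined call mentioned above (variable names are
illustrative):

  /* take the fwnode reference right where it is attached to the device;
   * mdio_device_release() later drops it via fwnode_handle_put(), so no
   * manual put is needed on the unwind path
   */
  device_set_node(&mdiodev->dev, fwnode_handle_get(fwnode));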

Signed-off-by: Buday Csaba <buday.csaba@prolan.hu>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/00847693daa8f7c8ff5dfa19dd35fc712fa4e2b5.1763995734.git.buday.csaba@prolan.hu
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 18:45:37 -08:00
Buday Csaba
a11e0d467d net: mdio: eliminate kdoc warnings in mdio_device.c and mdio_bus.c
Fix all warnings reported by scripts/kernel-doc in
mdio_device.c and mdio_bus.c

Signed-off-by: Buday Csaba <buday.csaba@prolan.hu>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/7ef7b80669da2b899d38afdb6c45e122229c3d8c.1763968667.git.buday.csaba@prolan.hu
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 18:43:56 -08:00
Jakub Kicinski
652eb5afce Merge branch 'net-enetc-add-port-mdio-support-for-both-i-mx94-and-i-mx95'
Wei Fang says:

====================
net: enetc: add port MDIO support for both i.MX94 and i.MX95

The NETC IP has one external master MDIO interface (eMDIO) for managing
external PHYs, all ENETC ports share this eMDIO. The EMDIO function and
the ENETC port MDIO are the virtual ports of this eMDIO, ENETC can use
these virtual ports to access their PHYs. The difference is that EMDIO
function is a 'global port', it can access all the PHYs on the eMDIO, so
it provides a means for different software modules to share a single set
of MDIO signals to access their PHYs.

The ENETC port MDIO can only access its own external PHY. Furthermore,
its PHY address must be set in its corresponding LaBCR register in the
IERB module, which is a 64 KB page containing registers used for
pre-boot initialization of all NETC PCIe functions. This IERB is owned
by the host OS and is locked after initialization,
so it cannot be configured at runtime any more. The port MDIO can
only work properly when the PHY address accessed by it matches the value
of its corresponding LaBCR[MDIO_PHYAD_PRTAD]. Otherwise, the MDIO access
by the port MDIO will not take effect.

Note that the same PHY is either controlled by port MDIO or by the EMDIO
function. The netc-blk-ctrl driver will only set the PHY address in the
LaBCR register corresponding to the ENETC when the ENETC node contains
an mdio child node, and the ENETC driver will only create the port MDIO
bus then. An example in DTS is as follows; the EMDIO function will not
access this PHY.

enetc_port0 {
        phy-handle = <&ethphy0>;
        phy-mode = "rgmii-id";

        mdio {
                #address-cells = <1>;
                #size-cells = <0>;

                ethphy0: ethernet-phy@1 {
                        reg = <1>;
                };
        };
};

If users want to use the EMDIO function to manage the PHY, they only need
to place the PHY node in the emdio node. The same PHY must not be placed
simultaneously within the ENETC node. An example in DTS to use EMDIO
is as below.

netc_emdio {
        ethphy0: ethernet-phy@1 {
                reg = <1>;
        };

        ethphy2: ethernet-phy@8 {
                reg = <8>;
        };
};

In the host OS, when there are multiple ENETCs, they can all access their
PHYs using their own port MDIO, or they can all access their PHYs using
the EMDIO function, or they can partially use port MDIO and partially use
the EMDIO function.

Another typical use case of port MDIO is the Jailhouse usage. An ENETC is
assigned to a guest OS. The EMDIO function will be unavailable in the
guest OS because EMDIO is controlled by the host OS. Therefore, the ENETC
can use its port MDIO to manage its external PHY in this situation. In
this use case, the host OS's root dtb will disable the ENETC node, so the
host OS's ENETC driver will not probe the ENETC and its PHY.

In addition, this series also adds the internal MDIO bus support, each
ENETC has an internal MDIO interface for managing on-die PHY (PCS) if it
has PCS layer.
====================

Link: https://patch.msgid.link/20251119102557.1041881-1-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 17:44:50 -08:00
Wei Fang
10ba23a7f6 net: enetc: update the base address of port MDIO registers for ENETC v4
Each ENETC has a set of external MDIO registers to access its external
PHY based on its port EMDIO bus, these registers are used for MDIO bus
access, such as setting the PHY address, PHY register address and value,
read or write operations, C22 or C45 format, etc. The base address of
this set of registers has been modified in ENETC v4 and is different
from that in ENETC v1. So the base address needs to be updated so that
ENETC v4 can use port MDIO to manage its own external PHY.

Additionally, if ENETC has the PCS layer, it also has a set of internal
MDIO registers for managing its on-die PHY (PCS/Serdes). The base address
of this set of registers is also different from that of ENETC v1, so the
base address also needs to be updated so that ENETC v4 can support the
management of on-die PHY through the internal MDIO bus.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Claudiu Manoil <claudiu.manoil@nxp.com>
Link: https://patch.msgid.link/20251119102557.1041881-4-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 17:44:48 -08:00
Wei Fang
50bfd9c06f net: enetc: set external PHY address in IERB for i.MX94 ENETC
NETC IP has only one external master MDIO interface (eMDIO) for managing
the external PHYs. ENETC can use the interfaces provided by the EMDIO
function or its port MDIO to access and manage its external PHY. Both
the EMDIO function and the port MDIO are all virtual ports of the eMDIO.

The difference is that the EMDIO function is a 'global port', it can
access all the PHYs on the eMDIO, but port MDIO can only access its own
PHY. To ensure that ENETC can only access its own PHY through port MDIO,
LaBCR[MDIO_PHYAD_PRTAD] needs to be set, which represents the address of
the external PHY connected to ENETC. If the accessed PHY address is not
consistent with LaBCR[MDIO_PHYAD_PRTAD], then the MDIO access initiated
by port MDIO will be invalid.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Claudiu Manoil <claudiu.manoil@nxp.com>
Link: https://patch.msgid.link/20251119102557.1041881-3-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 17:44:47 -08:00
Wei Fang
6633df05f3 net: enetc: set the external PHY address in IERB for port MDIO usage
The ENETC supports managing its own external PHY through its port MDIO
functionality. To use this function, the PHY address needs to be set in the
corresponding LaBCR register in the Integrated Endpoint Register Block
(IERB), which is used for pre-boot initialization of NETC PCIe functions.
The port MDIO can only work properly when the PHY address accessed by the
port MDIO matches the corresponding LaBCR[MDIO_PHYAD_PRTAD] value.

Because the ENETC driver only registers the MDIO bus (port MDIO bus) when
it detects an MDIO child node in its node, similarly, the netc-blk-ctrl
driver only resolves the PHY address and sets it in the corresponding
LaBCR when it detects an MDIO child node in the ENETC node.

Co-developed-by: Aziz Sellami <aziz.sellami@nxp.com>
Signed-off-by: Aziz Sellami <aziz.sellami@nxp.com>
Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Claudiu Manoil <claudiu.manoil@nxp.com>
Link: https://patch.msgid.link/20251119102557.1041881-2-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 17:44:47 -08:00
Jakub Kicinski
f0054f7bb9 Merge branch 'improvements-over-dsa-conduit-ethtool-ops'
Vladimir Oltean says:

====================
Improvements over DSA conduit ethtool ops

DSA intercepts 'ethtool -S eth0', where eth0 is the host port of the
switch (called the 'conduit'). It does this because otherwise there is no
way to report port counters for the CPU port, which is a MAC like any
other on that switch, except that Linux exposes no net_device for it and
thus no ethtool hook.

Having understood all the downsides of this debugging interface, when we
need it, we need it, so the proposed changes here make it more useful by
dumping more counters into it: not just those of the switch CPU port,
but those of all other switch ports in the tree which lack a net_device.
Not reinventing any wheel, just putting more output in an existing command.

That is patch 3/3. The other 2 are cleanup.
====================

Link: https://patch.msgid.link/20251122112311.138784-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 17:34:01 -08:00
Vladimir Oltean
f647ed2ca7 net: dsa: append ethtool counters of all hidden ports to conduit
Currently there is no way to see packet counters on cascade ports, and
no clarity on what the API for that would look like.

Because it's something that is currently needed, just extend the hack
where ethtool -S on the conduit interface dumps CPU port counters, and
also use it to dump counters of cascade ports.

Note that the "pXX_" naming convention changes to "sXX_pYY", to
distinguish between ports having the same index but belonging to
different switches. This has a slight chance of causing regressions to
existing tooling:

- grepping for "p04_counter_name" still works, but might return more
  than one string now
- grepping for "    p04_counter_name" no longer works

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251122112311.138784-4-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 17:33:55 -08:00
Vladimir Oltean
8afabd27fe net: dsa: use kernel data types for ethtool ops on conduit
Suppress some checkpatch 'CHECK' messages about u8 being preferable over
uint8_t, etc. No functional change.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251122112311.138784-3-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 17:33:55 -08:00
Vladimir Oltean
eba81b0a6d net: dsa: cpu_dp->orig_ethtool_ops might be NULL
In theory this would have been seen by now, but it seems that all
drivers used as DSA conduit interfaces thus far have had ethtool_ops
set, and it's hard to even find modern Ethernet drivers (and not VF
ones) which don't use ethtool.

Here is the unfiltered list of drivers which register any sort of
net_device but don't set its ethtool_ops pointer. I don't think any of
them 'risks' being used as a DSA conduit, maybe except for moxart,
rnpgbe and icssm; I'm not sure.

- drivers/net/can/dev/dev.c
- drivers/net/wwan/qcom_bam_dmux.c
- drivers/net/wwan/t7xx/t7xx_netdev.c
- drivers/net/arcnet/arcnet.c
- drivers/net/hamradio/
- drivers/net/slip/slip.c
- drivers/net/ethernet/ezchip/nps_enet.c
- drivers/net/ethernet/moxa/moxart_ether.c
- drivers/net/ethernet/wangxun/txgbevf/txgbevf_main.c
- drivers/net/ethernet/wangxun/ngbevf/ngbevf_main.c
- drivers/net/ethernet/huawei/hinic3/hinic3_main.c
- drivers/net/ethernet/i825xx/
- drivers/net/ethernet/ti/icssm/icssm_prueth.c
- drivers/net/ethernet/seeq/
- drivers/net/ethernet/litex/litex_liteeth.c
- drivers/net/ethernet/sunplus/spl2sw_driver.c
- drivers/net/ethernet/mucse/rnpgbe/rnpgbe_main.c
- drivers/net/ipa/
- drivers/net/wireless/microchip/wilc1000/
- drivers/net/wireless/mediatek/mt76/dma.c
- drivers/net/wireless/ath/ath12k/
- drivers/net/wireless/ath/ath11k/
- drivers/net/wireless/ath/ath6kl/
- drivers/net/wireless/ath/ath10k/
- drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c
- drivers/net/wireless/virtual/mac80211_hwsim.c
- drivers/net/wireless/quantenna/qtnfmac/pcie/pcie.c
- drivers/net/wireless/realtek/rtw89/core.c
- drivers/net/wireless/realtek/rtw88/pci.c
- drivers/net/caif/
- drivers/net/plip/
- drivers/net/wan/
- drivers/net/mctp/
- drivers/net/ppp/
- drivers/net/thunderbolt/

Nonetheless, it's good for the framework not to make such assumptions,
and not to panic when coming across such a host device in the future.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251122112311.138784-2-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 17:33:55 -08:00
Alan Maguire
380d19db6e cxgb4: Rename sched_class to avoid type clash
drivers/net/ethernet/chelsio/cxgb4/sched.h declares a sched_class
struct which has a type name clash with struct sched_class
in kernel/sched/sched.h (a type used in a field in task_struct).

When cxgb4 is built in, we end up with both sched_class types, and as a
result we wind up with DWARF (and BTF derived from it) containing a
duplicate, incorrect task_struct representation. This type clash can
cause kernel builds to fail, as resolve_btfids gets confused about which
task_struct to use. See [1] for more details.

As such, renaming sched_class to ch_sched_class (in line with
other structs like ch_sched_flowc) makes sense.

[1] https://lore.kernel.org/bpf/2412725b-916c-47bd-91c3-c2d57e3e6c7b@acm.org/

Reported-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Potnuri Bharat Teja <bharat@chelsio.com>
Link: https://patch.msgid.link/20251121181231.64337-1-alan.maguire@oracle.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 17:27:11 -08:00
Javen Xu
d6eea0048b r8169: add support for RTL9151A
This adds support for the RTL9151A chip. Its XID is 0x68b. It is basically
based on the one with XID 0x688, but uses a different firmware file.

Signed-off-by: Javen Xu <javen_xu@realsil.com.cn>
Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/20251121090104.3753-1-javen_xu@realsil.com.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-25 17:24:29 -08:00
Alexei Starovoitov
5262cb2339 Merge branch 'general-enhancements-to-rqspinlock-stress-test'
Kumar Kartikeya Dwivedi says:

====================
General enhancements to rqspinlock stress test

Three enhancements; details in the commit messages.

First, the CPU requirements are 2 for AA, 3 for ABBA, and 4 for ABBCCA,
hence relax the check during module initialization. Second, add a
per-CPU histogram to capture lock acquisition times to record which
buckets these acquisitions fall into for the normal task context and NMI
context.  Anything below 10ms is not printed in detail, but above that
displays the full breakdown for each context. Finally, make the delay of
the NMI and task contexts configurable, set to 10 and 20 ms respectively
by default.
====================

Link: https://patch.msgid.link/20251125020749.2421610-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-25 15:30:14 -08:00
Kumar Kartikeya Dwivedi
88337b587b selftests/bpf: Make CS length configurable for rqspinlock stress test
Allow users to configure the critical section delay for both task/normal
and NMI contexts, and set to 20ms and 10ms as before by default.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251125020749.2421610-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-25 15:30:14 -08:00
Kumar Kartikeya Dwivedi
6173c1d620 selftests/bpf: Add lock wait time stats to rqspinlock stress test
Add per-CPU statistics broken down by context and various timing windows
for the time taken to acquire an rqspinlock. Cases where all
acquisitions fit into the 10ms window are skipped when printing;
otherwise the full breakdown is displayed in the summary.
This allows capturing precisely the number of times outlier attempts
happened for a given lock in a given context.

A critical detail is that time is captured regardless of success or
failure, which is important for capturing events for failed but
long-waiting timeout attempts.

Output:

[   64.279459] rqspinlock acquisition latency histogram (ms):
[   64.279472]  cpu1: total 528426 (normal 526559, nmi 1867)
[   64.279477]    0-1ms: total 524697 (normal 524697, nmi 0)
[   64.279480]    2-2ms: total 3652 (normal 1811, nmi 1841)
[   64.279482]    3-3ms: total 66 (normal 47, nmi 19)
[   64.279485]    4-4ms: total 2 (normal 1, nmi 1)
[   64.279487]    5-5ms: total 1 (normal 1, nmi 0)
[   64.279489]    6-6ms: total 1 (normal 0, nmi 1)
[   64.279490]    101-150ms: total 1 (normal 0, nmi 1)
[   64.279492]    >= 251ms: total 6 (normal 2, nmi 4)
...
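
For illustration only, here is a minimal userspace sketch of the bucketing
idea, with made-up names and simplified bucket boundaries (the selftest's
actual code and windows differ):

  #include <stdio.h>

  enum ctx { CTX_NORMAL, CTX_NMI, CTX_MAX };
  enum { NR_BUCKETS = 12 };

  static unsigned long hist[CTX_MAX][NR_BUCKETS];

  /* fine-grained buckets below 10ms, coarse ones above */
  static int bucket(unsigned long ms)
  {
      if (ms < 10)
          return (int)ms;    /* one bucket per millisecond */
      if (ms < 251)
          return 10;         /* 10ms .. 250ms window */
      return 11;             /* >= 251ms outliers */
  }

  /* counted whether the lock attempt succeeded or timed out */
  static void record(enum ctx c, unsigned long ms)
  {
      hist[c][bucket(ms)]++;
  }

  int main(void)
  {
      record(CTX_NORMAL, 1);
      record(CTX_NMI, 300);
      for (int b = 0; b < NR_BUCKETS; b++)
          if (hist[CTX_NORMAL][b] || hist[CTX_NMI][b])
              printf("bucket %2d: normal %lu, nmi %lu\n", b,
                     hist[CTX_NORMAL][b], hist[CTX_NMI][b]);
      return 0;
  }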

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251125020749.2421610-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-25 15:30:14 -08:00
Kumar Kartikeya Dwivedi
224de8d5a3 selftests/bpf: Relax CPU requirements for rqspinlock stress test
Only require 2 CPUs for AA, 3 for ABBA, 4 for ABBCCA, which is
calculated nicely by adding to the mode enum. Enables running single CPU
AA tests.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251125020749.2421610-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-25 15:30:13 -08:00
Leon Hwang
8f6ddc0587 bpf: Introduce internal bpf_map_check_op_flags helper function
Introduce it to unify map flags checking for the lookup_elem, update_elem,
lookup_batch and update_batch APIs.
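
As a hedged sketch of what such a unified check can look like (the flag
values below are the UAPI ones, but the helper name, signature and exact
policy here are assumptions, not the kernel implementation):

  #include <errno.h>
  #include <stdbool.h>
  #include <stdio.h>

  #define BPF_ANY     0
  #define BPF_NOEXIST 1
  #define BPF_EXIST   2
  #define BPF_F_LOCK  4

  /* one place to validate flags for both lookup- and update-style ops */
  static int map_check_op_flags(unsigned long long flags, bool is_lookup)
  {
      if (is_lookup)
          return (flags & ~(unsigned long long)BPF_F_LOCK) ? -EINVAL : 0;
      if (flags & ~(unsigned long long)(BPF_ANY | BPF_NOEXIST | BPF_EXIST | BPF_F_LOCK))
          return -EINVAL;
      return 0;
  }

  int main(void)
  {
      printf("%d %d\n",
             map_check_op_flags(BPF_F_LOCK, true),    /* valid for lookup */
             map_check_op_flags(BPF_NOEXIST, true));  /* invalid for lookup */
      return 0;
  }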

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20251125145857.98134-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-25 15:27:48 -08:00
Viacheslav Dubeyko
ec95cd103c hfs/hfsplus: move on-disk layout declarations into hfs_common.h
Currently, HFS declares its on-disk layout metadata structures
in fs/hfs/hfs.h and HFS+ declares them in fs/hfsplus/hfsplus_raw.h.
However, the HFS and HFS+ on-disk layouts have some similarity and
overlap in their declarations. As a result, fs/hfs/hfs.h and
fs/hfsplus/hfsplus_raw.h contain multiple duplicated declarations.
Moreover, both the HFS and HFS+ drivers implement essentially the same
functionality in multiple places.

This patch moves the on-disk layout declarations from
fs/hfs/hfs.h and fs/hfsplus/hfsplus_raw.h into
include/linux/hfs_common.h with the goal of removing
the duplicated declarations. It also prepares
the basis for creating an hfslib that can aggregate common
functionality without having to duplicate the same code
in the HFS and HFS+ drivers.

Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
cc: Yangtao Li <frank.li@vivo.com>
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
2025-11-25 15:16:03 -08:00
Jianyun Gao
8c868a34ea libbpf: Fix some incorrect @param descriptions in the comment of libbpf.h
Fix up some missing or incorrect @param descriptions for libbpf public APIs
in libbpf.h.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251118033025.11804-1-jianyungao89@gmail.com
2025-11-25 14:39:58 -08:00
Menglong Dong
f2cb0660ac selftests/bpf: Call bpf_get_numa_node_id() in trigger_count()
The bench test "trig-kernel-count" can be used as a baseline comparison
for fentry and other benchmarks, and the calling to bpf_get_numa_node_id()
should be considered as composition of the baseline. So, let's call it in
trigger_count(). Meanwhile, rename trigger_count() to
trigger_kernel_count() to make it easier understand.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251116014242.151110-1-dongml2@chinatelecom.cn
2025-11-25 14:32:50 -08:00
Alex Tran
44bf461182 docs: bpf: map_array: Specify BPF_MAP_TYPE_PERCPU_ARRAY value size limit
Specify the value size limit for BPF_MAP_TYPE_PERCPU_ARRAY, which
is PCPU_MIN_UNIT_SIZE (32 KB). In the percpu allocator (mm: percpu),
any request with a size greater than PCPU_MIN_UNIT_SIZE is rejected.

Signed-off-by: Alex Tran <alex.t.tran@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251115063531.2302903-1-alex.t.tran@gmail.com
2025-11-25 14:32:00 -08:00
Viacheslav Dubeyko
3f04ee216b hfsplus: fix volume corruption issue for generic/101
The xfstests' test-case generic/101 leaves HFS+ volume
in corrupted state:

sudo ./check generic/101
FSTYP -- hfsplus
PLATFORM -- Linux/x86_64 hfsplus-testing-0001 6.17.0-rc1+ #4 SMP PREEMPT_DYNAMIC Wed Oct 1 15:02:44 PDT 2025
MKFS_OPTIONS -- /dev/loop51
MOUNT_OPTIONS -- /dev/loop51 /mnt/scratch

generic/101 _check_generic_filesystem: filesystem on /dev/loop51 is inconsistent
(see XFSTESTS-2/xfstests-dev/results//generic/101.full for details)

Ran: generic/101
Failures: generic/101
Failed 1 of 1 tests

sudo fsck.hfsplus -d /dev/loop51
** /dev/loop51
Using cacheBlockSize=32K cacheTotalBlock=1024 cacheSize=32768K.
Executing fsck_hfs (version 540.1-Linux).
** Checking non-journaled HFS Plus Volume.
The volume name is untitled
** Checking extents overflow file.
** Checking catalog file.
** Checking multi-linked files.
** Checking catalog hierarchy.
** Checking extended attributes file.
** Checking volume bitmap.
** Checking volume information.
Invalid volume free block count
(It should be 2614350 instead of 2614382)
Verify Status: VIStat = 0x8000, ABTStat = 0x0000 EBTStat = 0x0000
CBTStat = 0x0000 CatStat = 0x00000000
** Repairing volume.
** Rechecking volume.
** Checking non-journaled HFS Plus Volume.
The volume name is untitled
** Checking extents overflow file.
** Checking catalog file.
** Checking multi-linked files.
** Checking catalog hierarchy.
** Checking extended attributes file.
** Checking volume bitmap.
** Checking volume information.
** The volume untitled was repaired successfully.

This test executes such steps: "Test that if we truncate a file
to a smaller size, then truncate it to its original size or
a larger size, then fsyncing it and a power failure happens,
the file will have the range [first_truncate_size, last_size[ with
all bytes having a value of 0x00 if we read it the next time
the filesystem is mounted.".

HFS+ keeps the volume's free block count in the superblock.
However, hfsplus_file_fsync() doesn't write out the superblock's
content. As a result, the superblock contains an incorrect free
block count if a power failure happens.

This patch saves the superblock's content during the
hfsplus_file_fsync() call.

sudo ./check generic/101
FSTYP         -- hfsplus
PLATFORM      -- Linux/x86_64 hfsplus-testing-0001 6.18.0-rc3+ #96 SMP PREEMPT_DYNAMIC Wed Nov 19 12:47:37 PST 2025
MKFS_OPTIONS  -- /dev/loop51
MOUNT_OPTIONS -- /dev/loop51 /mnt/scratch

generic/101 32s ...  30s
Ran: generic/101
Passed all 1 tests

sudo fsck.hfsplus -d /dev/loop51
** /dev/loop51
	Using cacheBlockSize=32K cacheTotalBlock=1024 cacheSize=32768K.
   Executing fsck_hfs (version 540.1-Linux).
** Checking non-journaled HFS Plus Volume.
   The volume name is untitled
** Checking extents overflow file.
** Checking catalog file.
** Checking multi-linked files.
** Checking catalog hierarchy.
** Checking extended attributes file.
** Checking volume bitmap.
** Checking volume information.
** The volume untitled appears to be OK.

Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
cc: Yangtao Li <frank.li@vivo.com>
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20251119223219.1824434-1-slava@dubeyko.com
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
2025-11-25 11:35:13 -08:00
Chaitanya Kulkarni
7d09a8e251 block: ignore __blkdev_issue_discard() return value
__blkdev_issue_discard() always returns 0, making the error check
in blkdev_issue_discard() dead code.

In blkdev_issue_discard(), initialize ret to 0, remove the ret
assignment from __blkdev_issue_discard(), rely on the bio == NULL check
to call submit_bio_wait(), preserve the submit_bio_wait() error handling,
and preserve the -EOPNOTSUPP to 0 mapping.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-25 12:19:39 -07:00
shechenglong
3a64c46c40 block: fix typos in comments and strings in blk-core
This patch fixes multiple spelling mistakes in comments and documentation
in the file block/blk-core.c.

No functional changes intended.

Suggested-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: shechenglong <shechenglong@xfusion.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-25 10:39:49 -07:00
John Garry
a74de0c366 block: Remove references to __device_add_disk()
Since commit d1254a8749 ("block: remove support for delayed queue
registrations"), function __device_add_disk() has been replaced with
device_add_disk(), so fix up comments.

Signed-off-by: John Garry <john.g.garry@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-25 10:36:37 -07:00
Paolo Abeni
61e628023d Merge branch 'net_sched-speedup-qdisc-dequeue'
Eric Dumazet says:

====================
net_sched: speedup qdisc dequeue

Avoid up to two cache line misses in qdisc dequeue() to fetch
skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held.

The idea is to cache gso_segs at enqueue time, before the spinlock is
acquired, in the first skb cache line, where we already
have qdisc_skb_cb(skb)->pkt_len.

This series gives an 8% improvement in a TX intensive workload.

(120 Mpps -> 130 Mpps on a Turin host, IDPF with 32 TX queues)

v2: https://lore.kernel.org/netdev/20251111093204.1432437-1-edumazet@google.com/
v1: https://lore.kernel.org/netdev/20251110094505.3335073-1-edumazet@google.com/T/#m8f562ed148f807c02fd02c6cd243604d449615b9
====================

Link: https://patch.msgid.link/20251121083256.674562-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:35 +01:00
Eric Dumazet
a6efc273ab net_sched: use qdisc_dequeue_drop() in cake, codel, fq_codel
cake, codel and fq_codel can drop many packets from dequeue().

Use qdisc_dequeue_drop() so that the freeing can happen
outside of the qdisc spinlock scope.

Add TCQ_F_DEQUEUE_DROPS to sch->flags.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-15-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:32 +01:00
Eric Dumazet
191ff13e42 net_sched: add qdisc_dequeue_drop() helper
Some qdiscs like cake, codel and fq_codel might drop packets
in their dequeue() method.

This is currently problematic because dequeue() runs with
the qdisc spinlock held. Freeing skbs can be extremely expensive.

Add a qdisc_dequeue_drop() method and a new TCQ_F_DEQUEUE_DROPS flag
so that these qdiscs can opt in to deferring the skb frees
until after the socket spinlock is released.

TCQ_F_DEQUEUE_DROPS is an attempt to not penalize other qdiscs
with an extra cache line miss.
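
A self-contained userspace sketch of the idea, with stubbed types and
hypothetical names (not the kernel helpers themselves): packets dropped
from dequeue() are chained on a to-free list instead of being freed under
the lock, and the caller frees them once the lock is released.

  #include <stdio.h>
  #include <stdlib.h>

  struct skb { struct skb *next; };

  struct qdisc {
      struct skb *queue;     /* protected by the qdisc "lock" */
      struct skb *to_free;   /* drops deferred to the caller */
  };

  /* called from dequeue(), lock held: just chain, no free here */
  static void dequeue_drop(struct qdisc *q, struct skb *skb)
  {
      skb->next = q->to_free;
      q->to_free = skb;
  }

  /* called by the caller after the lock is released */
  static void free_deferred(struct qdisc *q)
  {
      struct skb *skb;

      while ((skb = q->to_free) != NULL) {
          q->to_free = skb->next;
          free(skb);
      }
  }

  int main(void)
  {
      struct qdisc q = { NULL, NULL };
      struct skb *a = calloc(1, sizeof(*a));
      struct skb *b = calloc(1, sizeof(*b));

      if (a)
          dequeue_drop(&q, a);
      if (b)
          dequeue_drop(&q, b);
      free_deferred(&q);
      puts("skbs freed outside the lock");
      return 0;
  }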

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-14-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:32 +01:00
Eric Dumazet
0170d7f47c net_sched: add tcf_kfree_skb_list() helper
Using kfree_skb_list_reason() to free a list of skbs from qdisc
operations seems wrong, as each skb might have a different drop reason.

Clean up __dev_xmit_skb() to call tcf_kfree_skb_list() once
in preparation for the following patch.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-13-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:32 +01:00
Eric Dumazet
4792c3a4c1 net: annotate a data-race in __dev_xmit_skb()
q->limit is read locklessly, add a READ_ONCE().
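
An illustrative userspace approximation of the annotation (the kernel's
READ_ONCE() is more elaborate; the struct and helper below are made up):
the volatile access forces a single, untorn load and documents that the
field may change concurrently.

  #include <stdio.h>

  #define READ_ONCE(x) (*(const volatile typeof(x) *)&(x))

  struct qdisc_sketch { unsigned int limit; };

  static int over_limit(const struct qdisc_sketch *q, unsigned int qlen)
  {
      return qlen >= READ_ONCE(q->limit);  /* lockless, annotated read */
  }

  int main(void)
  {
      struct qdisc_sketch q = { .limit = 1000 };

      printf("%d\n", over_limit(&q, 1200));
      return 0;
  }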

Fixes: 100dfa74ca ("net: dev_queue_xmit() llist adoption")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-12-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:32 +01:00
Eric Dumazet
b2e9821cff net: prefech skb->priority in __dev_xmit_skb()
Most qdiscs need to read skb->priority at enqueue time.

In commit 100dfa74ca ("net: dev_queue_xmit() llist adoption")
I added a prefetch(next); let's add another one for the second
half of the skb.

Note that skb->priority and skb->hash share a common cache line,
so this patch helps qdiscs needing both fields.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-11-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:32 +01:00
Eric Dumazet
2f9babc04d net_sched: sch_fq: prefetch one skb ahead in dequeue()
Prefetch the skb that we are likely to dequeue at the next dequeue().

Also call fq_dequeue_skb() a bit sooner in fq_dequeue().

This reduces the window between read of q.qlen and
changes of fields in the cache line that could be dirtied
by another cpu trying to queue a packet.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-10-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:32 +01:00
Eric Dumazet
3c1100f042 net_sched: sch_fq: move qdisc_bstats_update() to fq_dequeue_skb()
Group together changes to qdisc fields to reduce chances of false sharing
if another cpu attempts to acquire the qdisc spinlock.

  qdisc_qstats_backlog_dec(sch, skb);
  sch->q.qlen--;
  qdisc_bstats_update(sch, skb);

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-9-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:32 +01:00
Eric Dumazet
ad50d5a3fc net_sched: add Qdisc_read_mostly and Qdisc_write groups
It is possible to reorg Qdisc to avoid always dirtying 2 cache lines in
fast path by reducing this to a single dirtied cache line.

In the current layout, we change only four to six fields in the first cache line:
 - q.spinlock
 - q.qlen
 - bstats.bytes
 - bstats.packets
 - some Qdisc also change q.next/q.prev

In the second cache line we change in the fast path:
 - running
 - state
 - qstats.backlog

        /* --- cacheline 2 boundary (128 bytes) --- */
        struct sk_buff_head        gso_skb __attribute__((__aligned__(64))); /*  0x80  0x18 */
        struct qdisc_skb_head      q;                    /*  0x98  0x18 */
        struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /*  0xb0  0x10 */

        /* --- cacheline 3 boundary (192 bytes) --- */
        struct gnet_stats_queue    qstats;               /*  0xc0  0x14 */
        bool                       running;              /*  0xd4   0x1 */

        /* XXX 3 bytes hole, try to pack */

        unsigned long              state;                /*  0xd8   0x8 */
        struct Qdisc *             next_sched;           /*  0xe0   0x8 */
        struct sk_buff_head        skb_bad_txq;          /*  0xe8  0x18 */
        /* --- cacheline 4 boundary (256 bytes) --- */

Reorganize things to have a first cache line mostly read,
then a mostly written one.

This gives a ~3% increase of performance under tx stress.

Note that there is an additional hole because @qstats now spans over a third cache line.

	/* --- cacheline 2 boundary (128 bytes) --- */
	__u8                       __cacheline_group_begin__Qdisc_read_mostly[0] __attribute__((__aligned__(64))); /*  0x80     0 */
	struct sk_buff_head        gso_skb;              /*  0x80  0x18 */
	struct Qdisc *             next_sched;           /*  0x98   0x8 */
	struct sk_buff_head        skb_bad_txq;          /*  0xa0  0x18 */
	__u8                       __cacheline_group_end__Qdisc_read_mostly[0]; /*  0xb8     0 */

	/* XXX 8 bytes hole, try to pack */

	/* --- cacheline 3 boundary (192 bytes) --- */
	__u8                       __cacheline_group_begin__Qdisc_write[0] __attribute__((__aligned__(64))); /*  0xc0     0 */
	struct qdisc_skb_head      q;                    /*  0xc0  0x18 */
	unsigned long              state;                /*  0xd8   0x8 */
	struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /*  0xe0  0x10 */
	bool                       running;              /*  0xf0   0x1 */

	/* XXX 3 bytes hole, try to pack */

	struct gnet_stats_queue    qstats;               /*  0xf4  0x14 */
	/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
	__u8                       __cacheline_group_end__Qdisc_write[0]; /* 0x108     0 */

	/* XXX 56 bytes hole, try to pack */
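
A hedged sketch of the layout idea in plain C11 (the kernel's
__cacheline_group_begin()/__cacheline_group_end() markers visible in the
pahole output above are stood in for by simple alignment; the field set is
abbreviated): keep the fields only read in the fast path on one 64-byte
line and the fields dirtied in the fast path on another.

  #include <stdalign.h>
  #include <stddef.h>
  #include <stdio.h>

  struct qdisc_sketch {
      /* read mostly in the fast path */
      alignas(64) void *gso_skb;
      void *next_sched;
      void *skb_bad_txq;

      /* written in the fast path */
      alignas(64) unsigned long qlen;
      unsigned long state;
      unsigned long long bytes, packets;
      unsigned char running;
  };

  int main(void)
  {
      printf("write group starts at offset %zu\n",
             offsetof(struct qdisc_sketch, qlen));
      return 0;
  }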

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-8-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:32 +01:00
Eric Dumazet
c5d34f4583 net_sched: cake: use qdisc_pkt_segs()
Use the new qdisc_pkt_segs() helper to avoid a cache line miss in
cake_enqueue() for non-GSO packets.

cake_overhead() does not have to recompute it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-7-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:32 +01:00
Eric Dumazet
2773cb0b31 net_sched: use qdisc_skb_cb(skb)->pkt_segs in bstats_update()
Avoid up to two cache line misses in qdisc dequeue() to fetch
skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held.

This gives a 5% improvement in a TX intensive workload.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-6-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:32 +01:00
Eric Dumazet
f9e00e51e3 net: use qdisc_pkt_len_segs_init() in sch_handle_ingress()
sch_handle_ingress() sets qdisc_skb_cb(skb)->pkt_len.

We also need to initialize qdisc_skb_cb(skb)->pkt_segs.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-5-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:31 +01:00
Eric Dumazet
874c1928d3 net_sched: initialize qdisc_skb_cb(skb)->pkt_segs in qdisc_pkt_len_init()
qdisc_pkt_len_init() is currently initializing qdisc_skb_cb(skb)->pkt_len.

Add qdisc_skb_cb(skb)->pkt_segs initialization and rename this function
to qdisc_pkt_len_segs_init().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-4-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:31 +01:00
Eric Dumazet
be1b70ab21 net: init shinfo->gso_segs from qdisc_pkt_len_init()
Qdiscs use shinfo->gso_segs for their packet stats in bstats_update(),
but this field needs to be initialized for SKB_GSO_DODGY users.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-3-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:31 +01:00
Eric Dumazet
b2a38f6df9 net_sched: make room for (struct qdisc_skb_cb)->pkt_segs
Add a new u16 field next to pkt_len: pkt_segs.

This will cache shinfo->gso_segs to speed up qdisc dequeue().

Move slave_dev_queue_mapping to the end of qdisc_skb_cb,
and move three bits from tc_skb_cb:
- post_ct
- post_ct_snat
- post_ct_dnat
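
A minimal, self-contained sketch of the caching idea with stand-in types
(field and helper names follow the commit text, not necessarily the final
kernel layout): gso_segs is copied next to pkt_len in the per-skb control
block at enqueue time, so dequeue() never has to touch skb_shinfo().

  #include <stdint.h>
  #include <stdio.h>

  struct qdisc_skb_cb_sketch {
      uint32_t pkt_len;    /* already cached today */
      uint16_t pkt_segs;   /* new: cached copy of shinfo->gso_segs */
      uint16_t slave_dev_queue_mapping;
  };

  struct skb_sketch {
      uint32_t len;
      uint16_t gso_segs;   /* lives in a later cache line in reality */
      struct qdisc_skb_cb_sketch cb;
  };

  /* done at enqueue time, before the qdisc spinlock is taken */
  static void qdisc_pkt_len_segs_init(struct skb_sketch *skb)
  {
      skb->cb.pkt_len  = skb->len;
      skb->cb.pkt_segs = skb->gso_segs ? skb->gso_segs : 1;
  }

  int main(void)
  {
      struct skb_sketch skb = { .len = 3000, .gso_segs = 2 };

      qdisc_pkt_len_segs_init(&skb);
      printf("pkt_len=%u pkt_segs=%u\n", (unsigned int)skb.cb.pkt_len,
             (unsigned int)skb.cb.pkt_segs);
      return 0;
  }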

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-2-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 16:10:31 +01:00
Jacky Chou
e3daf0e7fe dt-bindings: net: aspeed: add AST2700 MDIO compatible
Add "aspeed,ast2700-mdio" compatible to the binding schema with a fallback
to "aspeed,ast2600-mdio".

Although the MDIO controller on AST2700 is functionally the same as the
one on AST2600, it's good practice to add a SoC-specific compatible for
new silicon. This allows future driver updates to handle any 2700-specific
integration issues without requiring devicetree changes or complex
runtime detection logic.

For now, the driver continues to bind via the existing
"aspeed,ast2600-mdio" compatible, so no driver changes are needed.

Acked-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Jacky Chou <jacky_chou@aspeedtech.com>
Link: https://patch.msgid.link/20251120-aspeed_mdio_ast2700-v2-1-0d722bfb2c54@aspeedtech.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-25 12:11:47 +01:00
Randy Dunlap
1c6a92a5a5 wifi: nl80211: vendor-cmd: intel: fix a blank kernel-doc line warning
Delete an empty line to prevent a kernel-doc warning:

Warning: ../include/uapi/linux/nl80211-vnd-intel.h:86 bad line:

Fixes: 3d2a2544ea ("nl80211: vendor-cmd: add Intel vendor commands for iwlmei usage")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251125022834.3171742-1-rdunlap@infradead.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-25 10:34:55 +01:00
Lachlan Hodges
cba1ba11c1 wifi: cfg80211: include s1g_primary_2mhz when comparing chandefs
When comparing chandefs, ensure we include s1g_primary_2mhz.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20251125025927.245280-3-lachlan.hodges@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-25 10:31:28 +01:00
Lachlan Hodges
3fc830cd8c wifi: cfg80211: include s1g_primary_2mhz when sending chandef
The chandef now includes a flag denoting the use of a 2MHz primary
channel for S1G interfaces; include this when sending the chandef.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20251125025927.245280-2-lachlan.hodges@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-25 10:31:11 +01:00
Ria Thomas
24d4da5c25 wifi: ieee80211: correct FILS status codes
The FILS status codes are set to 108/109, but the IEEE 802.11-2020
spec defines them as 112/113. Update the enum so it matches the
specification and keeps the kernel consistent with standard values.

Fixes: a3caf7440d ("cfg80211: Add support for FILS shared key authentication offload")
Signed-off-by: Ria Thomas <ria.thomas@morsemicro.com>
Reviewed-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
Link: https://patch.msgid.link/20251124125637.3936154-1-ria.thomas@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-25 10:28:20 +01:00
Johannes Berg
cf1d7dc28c Merge tag 'mt76-next-2025-11-24' of https://github.com/nbd168/wireless
Felix Fietkau says:
===================
mt76 patches for 6.19

- relicense to BSD-3-Clause-Clear
- fixes
- support WED on devices which exceed 32-bit DMA
- airoha NPU support
- mt7925 regd improvements
- more mt7996 MLO work
===================

Link: https://patch.msgid.link/6b6d1033-dddf-4cc9-b240-93da7f8b8773@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-25 10:27:19 +01:00
Jakub Kicinski
cc1b62512a Merge branch 'mptcp-memcg-accounting-for-passive-sockets-backlog-processing'
Matthieu Baerts says:

====================
mptcp: memcg accounting for passive sockets & backlog processing

This series is split in two: the first 4 patches are linked to memcg
accounting for passive sockets, and the rest introduce the backlog
processing. They are sent together because the first part turned out to
be needed to get the second one fully working.

The second part includes RX path improvements built around backlog
processing. The main goals are improving the RX performance _and_
increasing the long term maintainability.

- Patches 1-3: preparation work to ease the introduction of the next
  patch.

- Patch 4: fix memcg accounting for passive sockets. Note that this is a
  (non-urgent) fix, but it depends on material that is currently only in
  net-next, e.g. commit 4a997d49d9 ("tcp: Save lock_sock() for memcg
  in inet_csk_accept().").

- Patches 5-6: preparation of the stack for backlog processing, removing
  assumptions that will not hold true any more after the backlog
  introduction.

- Patches 7,8,10,11,12 are more cleanups that will make the backlog
  patch a little less huge.

- Patch 9: a somewhat unrelated cleanup, included here so it is not
  forgotten.

- Patches 13-14: The real work is done by them. Patch 13 introduces the
  helpers needed to manipulate the msk-level backlog, and the data
  struct itself, without any actual functional change. Patch 14 finally
  uses the backlog for RX skb processing. Note that MPTCP can't use the
  sk_backlog, as the MPTCP release callback can also release and
  re-acquire the msk-level spinlock and core backlog processing works
  under the assumption that such an event is not possible.
  A relevant point is memory accounting for skbs in the backlog. It's
  somewhat "original" due to MPTCP constraints. Such skbs use space from
  the incoming subflow receive buffer and do not explicitly use any
  forward allocated memory, as we can't update the msk fwd mem while
  enqueuing, nor do we want to acquire the ssk socket lock again while
  processing the skbs. Instead the msk borrows memory from the subflow
  and reserves it for the backlog; see patches 5 and 14 for the gory
  details.
====================

Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-0-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 20:23:43 -08:00
Jens Axboe
96f03c8cb2 Revert "Merge branch 'loop-aio-nowait' into for-6.19/block"
This reverts commit f43fdeb9a3, reversing
changes made to 2c6d792d4b.

There are concerns that doing inline submits can cause excessive
stack usage, particularly when going back into the filesystem. Revert
the loop dio nowait change for now.

Link: https://lore.kernel.org/linux-block/aSP3SG_KaROJTBHx@infradead.org/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-24 20:53:19 -07:00
Paolo Abeni
6228efe0cc mptcp: leverage the backlog for RX packet processing
When the msk socket is owned or the msk receive buffer is full,
move the incoming skbs to an msk-level backlog list. This avoids
traversing the joined subflows and acquiring the subflow-level
socket lock at reception time, improving the RX performance.

When processing the backlog, use the fwd alloc memory borrowed from
the incoming subflow. skbs exceeding the msk receive space are
not dropped; instead they are kept in the backlog until the receive
buffer is freed. Dropping packets already acked at the TCP level is
explicitly discouraged by the RFC and would corrupt the data stream
for fallback sockets.

Special care is needed to avoid adding skbs to the backlog of a closed
msk and to avoid leaving dangling references into the backlog
at subflow closing time.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-14-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:43 -08:00
Paolo Abeni
ee458a3f31 mptcp: introduce mptcp-level backlog
We will soon use it for incoming data processing.
MPTCP can't leverage the sk_backlog, as the latter is processed
before the release callback, and for MPTCP such a callback releases
and re-acquires the socket spinlock, breaking the sk_backlog processing
assumption.

Add an skb backlog list inside the mptcp sock struct, and implement
basic helpers to transfer packets to, and purge, such a list.

Packets in the backlog are memory accounted and still use the incoming
subflow receive memory, to allow back-pressure. The backlog size is
implicitly bounded by the sum of the subflows' rcvbuf.

When a subflow is closed, references from the backlog to such a sock
are removed.

No packet is currently added to the backlog, so no functional changes
intended here.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-13-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:43 -08:00
Paolo Abeni
9db5b3cec4 mptcp: borrow forward memory from subflow
In the MPTCP receive path, we release the subflow-allocated fwd
memory just to allocate it again shortly after for the msk.

That increases the chances of failure, especially when we add
backlog processing, as other actions could consume the just-released
memory before the msk socket has a chance to do the
rcv allocation.

Replace the skb_orphan() call with an open-coded variant that
explicitly borrows the fwd memory from the subflow socket instead
of releasing it.

The borrowed memory does not have PAGE_SIZE granularity; rounding to
the page size will make the fwd allocated memory higher than what is
strictly required and could make the incoming subflow fwd mem
consistently negative. Instead, keep track of the accumulated frag and
borrow the full page at subflow close time.

This allows removing the last drop in the TCP to MPTCP transition and
the associated, now unused, MIB.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-12-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:42 -08:00
Paolo Abeni
0eeb372dee mptcp: handle first subflow closing consistently
Currently, as soon as the PM closes a subflow, the msk stops accepting
data from it, even if the TCP socket could still be formally open in the
incoming direction, with the notable exception of the first subflow.

The root cause of such behavior is that the code currently piggybacks two
separate semantics on the subflow->disposable bit: that the subflow
context must be released and that the subflow must stop accepting
incoming data.

The first subflow is never disposed, so it also never stops accepting
incoming data. Use a separate bit to mark the latter status and set that
bit in __mptcp_close_ssk() for all subflows.

Beyond making per-subflow behaviour more consistent, this will also
simplify the next patch.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-11-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:42 -08:00
Paolo Abeni
38a4a469c8 mptcp: drop the __mptcp_data_ready() helper
It adds little clarity and there is a single user of this helper;
just inline it in the caller.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Geliang Tang <geliang@kernel.org>
Tested-by: Geliang Tang <geliang@kernel.org>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-10-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:42 -08:00
Paolo Abeni
9d82959603 mptcp: make mptcp_destroy_common() static
This function is only used inside protocol.c; there is no need
to expose it to the whole stack.

Note that the function definition must be moved earlier to avoid
a forward declaration.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Geliang Tang <geliang@kernel.org>
Tested-by: Geliang Tang <geliang@kernel.org>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-9-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:42 -08:00
Paolo Abeni
48a395605e mptcp: do not miss early first subflow close event notification
The MPTCP protocol is not currently emitting the NL event when the first
subflow is closed before msk accept() time.

By replacing the close helper used in such a scenario, we implicitly
introduce the missing notification. Note that in this scenario we want to
be sure that mptcp_close_ssk() will not trigger any PM work; move the msk
state change update earlier, so that the previous patch offers such a
guarantee.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Geliang Tang <geliang@kernel.org>
Tested-by: Geliang Tang <geliang@kernel.org>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-8-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:42 -08:00
Paolo Abeni
2ca1b8926f mptcp: ensure the kernel PM does not take action too late
The PM hooks can currently take place when the msk is already shutting
down. Subflow creation will fail, thanks to the existing check at join
time, but we can entirely avoid starting operations that are bound to fail.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Geliang Tang <geliang@kernel.org>
Tested-by: Geliang Tang <geliang@kernel.org>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-7-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:42 -08:00
Paolo Abeni
2834f8edd7 mptcp: cleanup fallback dummy mapping generation
MPTCP currently accesses ack_seq outside the msk socket lock scope to
generate the dummy mapping for fallback sockets. Soon we are going
to introduce backlog usage, and even for fallback sockets the ack_seq
value will be significantly off outside of the msk socket lock scope.

Avoid relying on ack_seq for dummy mapping generation, using instead
the subflow sequence number. Note that in case of disconnect() and
(re)connect() we must ensure that any previous state is re-set.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Geliang Tang <geliang@kernel.org>
Tested-by: Geliang Tang <geliang@kernel.org>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-6-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:41 -08:00
Paolo Abeni
85f22b8e1e mptcp: cleanup fallback data fin reception
MPTCP currently generates a dummy data_fin for fallback sockets
when the fallback subflow has completed data reception, using
the current ack_seq.

We are going to introduce backlog usage for the msk soon, even
for fallback sockets: the ack_seq value will not match the most recent
sequence number seen by the fallback subflow socket, as it will ignore
data_seq sitting in the backlog.

Instead use the last map sequence number to set the data_fin,
as fallback (dummy) map sequences are always in sequence.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Tested-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-5-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:41 -08:00
Paolo Abeni
68c7c38671 mptcp: fix memcg accounting for passive sockets
The passive sockets never got proper memcg accounting: the msk
socket is associated with the memcg at accept time, but the
passive subflows never got it right.

At accept time, traverse the subflows list and associate each of them
with the msk memcg, and try to do the same at join completion time, if
the msk has already been accepted.

Fixes: cf7da0d66c ("mptcp: Create SUBFLOW socket for incoming connections")
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/298
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/597
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-4-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:41 -08:00
Paolo Abeni
e777a7fb06 mptcp: grafting MPJ subflow earlier
Later patches need to ensure that all MPJ subflows are grafted to the
msk socket before accept() completion.

Currently the grafting happens under the msk socket lock: potentially
at msk release_cb time, which makes satisfying the above condition a bit
tricky.

Move the MPJ subflow grafting earlier, under the msk data lock, so that
we can use such lock as a synchronization point.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-3-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:41 -08:00
Paolo Abeni
bd92dd8e03 mptcp: factor-out cgroup data inherit helper
MPTCP will soon need the same functionality for passive sockets;
factor it out into a common helper. No functional change intended.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Geliang Tang <geliang@kernel.org>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-2-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:41 -08:00
Paolo Abeni
075b19c211 net: factor-out _sk_charge() helper
Move the code charging a newly accepted socket to the memcg out of
__inet_accept(). MPTCP will soon use it on a per-subflow basis, in
different contexts.

No functional changes intended.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Geliang Tang <geliang@kernel.org>
Acked-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-1-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:40 -08:00
Dmitry Skorodumov
f296b73d17 ipvlan: fix sparse warning about __be32 -> u32
Fix a sparse warning:

ipvlan_core.c:56: warning: incorrect type in argument 1
(different base types) expected unsigned int [usertype] a
got restricted __be32 const [usertype] s_addr

Force cast the s_addr to u32
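
A userspace illustration of the pattern, with stand-ins for the sparse
annotations (__be32 and __force are just stubs here, and the hash helper
is invented): the explicit cast documents that the big-endian value is
intentionally used as a plain u32.

  #include <stdint.h>
  #include <stdio.h>

  typedef uint32_t __be32;   /* stand-in for the sparse-annotated type */
  #define __force            /* no-op outside sparse */

  static unsigned int hash_addr(uint32_t a)
  {
      return a * 2654435761u;   /* any byte-order-agnostic mixing works */
  }

  int main(void)
  {
      __be32 s_addr = 0x0100007f;   /* 127.0.0.1 in network byte order on LE */

      printf("%u\n", hash_addr((__force uint32_t)s_addr));
      return 0;
  }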

Signed-off-by: Dmitry Skorodumov <skorodumov.dmitry@huawei.com>
Link: https://patch.msgid.link/20251121155112.4182007-1-skorodumov.dmitry@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:48:49 -08:00
Breno Leitao
20c20f05cf net: mvpp2: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count() for the mvpp2 driver.

This simplifies the RX ring count retrieval and aligns mvpp2 with the new
ethtool API for querying RX ring parameters, while keeping the other
rxnfc handlers (GRXCLSRLCNT, GRXCLSRULE, GRXCLSRLALL) intact.
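
A hedged, self-contained sketch of the conversion pattern with stubbed
types (the real driver code uses struct net_device and struct
ethtool_rxnfc; the function names here are made up): the ETHTOOL_GRXRINGS
branch of .get_rxnfc is replaced by a direct ring-count callback.

  #include <errno.h>
  #include <stdio.h>

  #define ETHTOOL_GRXRINGS 0x0000002d

  struct dev_sketch { unsigned int num_rx_queues; };
  struct rxnfc_sketch { unsigned int cmd; unsigned long long data; };

  /* before: a switch in .get_rxnfc just to report the ring count */
  static int get_rxnfc_old(struct dev_sketch *dev, struct rxnfc_sketch *info)
  {
      switch (info->cmd) {
      case ETHTOOL_GRXRINGS:
          info->data = dev->num_rx_queues;
          return 0;
      default:
          return -EOPNOTSUPP;
      }
  }

  /* after: the dedicated .get_rx_ring_count callback */
  static unsigned int get_rx_ring_count_new(struct dev_sketch *dev)
  {
      return dev->num_rx_queues;
  }

  int main(void)
  {
      struct dev_sketch dev = { .num_rx_queues = 8 };
      struct rxnfc_sketch info = { .cmd = ETHTOOL_GRXRINGS, .data = 0 };

      get_rxnfc_old(&dev, &info);
      printf("old: %llu, new: %u\n", info.data, get_rx_ring_count_new(&dev));
      return 0;
  }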

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251121-marvell-v1-2-8338f3e55a4c@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:45:02 -08:00
Breno Leitao
737e14c5dc net: mvneta: convert to use .get_rx_ring_count
Convert the mvneta driver to use the new .get_rx_ring_count ethtool
operation instead of implementing .get_rxnfc solely for handling
ETHTOOL_GRXRINGS command. This simplifies the code by removing the
switch statement and replacing it with a direct return of the queue
count.

The new callback provides the same functionality in a more direct way,
following the ongoing ethtool API modernization.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251121-marvell-v1-1-8338f3e55a4c@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:45:02 -08:00
Breno Leitao
a8ff4842da net: hyperv: convert to use .get_rx_ring_count
Convert the hyperv netvsc driver to use the new .get_rx_ring_count
ethtool operation instead of implementing .get_rxnfc solely for handling
ETHTOOL_GRXRINGS command. This simplifies the code by replacing the
switch statement with a direct return of the queue count.

The new callback provides the same functionality in a more direct way,
following the ongoing ethtool API modernization.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251121-hyperv_gxrings-v1-1-31293104953b@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:42:30 -08:00
Eric Dumazet
ec1e48e97f net: optimize eth_type_trans() vs CONFIG_STACKPROTECTOR_STRONG=y
Some platforms exhibit very high costs with CONFIG_STACKPROTECTOR_STRONG=y
when a function needs to pass the address of a local variable to external
functions.

eth_type_trans() (and its callers) is showing this anomaly on AMD EPYC 7B12
platforms (and maybe others).

We could :

1) inline eth_type_trans()

   This would help if its callers also have the same issue, and the canary cost
   would be paid by the callers already.

   This is a bit cumbersome because netdev_uses_dsa() is pulling
   whole <net/dsa.h> definitions.

2) Compile net/ethernet/eth.c with -fno-stack-protector

   This would weaken security.

3) Hack eth_type_trans() to temporarily use skb->dev as a placeholder
   if skb_header_pointer() needs to pull 2 bytes not present in skb->head.

This patch implements 3), and brings a 5% improvement on TX/RX intensive
workload (tcp_rr 10,000 flows) on AMD EPYC 7B12.
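
A userspace sketch of trick 3), with stub types and hypothetical names
(illustrative only, not the kernel code): instead of a local buffer whose
address escapes to another function, and which would therefore require a
stack canary, an already-available structure field that is about to be
overwritten anyway is reused as the temporary destination.

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  struct skb_sketch {
      void *dev;               /* about to be reassigned by the caller anyway */
      unsigned char data[64];
      unsigned int len;
  };

  /* copies into 'buffer' when the bytes are not directly accessible */
  static void *header_pointer(struct skb_sketch *skb, int off, int len,
                              void *buffer)
  {
      memcpy(buffer, skb->data + off, (size_t)len);
      return buffer;
  }

  static unsigned int peek_proto(struct skb_sketch *skb)
  {
      /* no local address escapes: reuse skb->dev as scratch space */
      uint16_t *p = header_pointer(skb, 0, sizeof(*p), &skb->dev);

      return *p;
  }

  int main(void)
  {
      struct skb_sketch skb = { .data = { 0x08, 0x00 }, .len = 64 };

      printf("0x%04x\n", peek_proto(&skb));
      return 0;
  }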

Removing CONFIG_STACKPROTECTOR_STRONG on this platform can improve
performance by 25%.
This means the eth_type_trans() issue is not an isolated artifact.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251121061725.206675-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:27:31 -08:00
Jakub Kicinski
e254c212cd selftests: af_unix: don't use SKIP for expected failures
netdev CI reserves SKIP in selftests for cases which can't be executed
due to setup issues, like missing or old commands. Tests which are
expected to fail must use XFAIL.

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251123021601.158709-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:07:51 -08:00
Andre Carvalho
00f3b32518 selftests: netconsole: ensure required log level is set on netcons_basic
This commit ensures that the required log level is set at the start of
the test iteration.

Part of the cleanup performed at the end of each test iteration resets
the log level (do_cleanup in lib_netcons.sh) to the values defined at the
time the test script started. This may cause further test iterations to fail
if the default values are not sufficient.

Signed-off-by: Andre Carvalho <asantostc@gmail.com>
Reviewed-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251121-netcons-basic-loglevel-v1-1-577f8586159c@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 18:52:20 -08:00
Jakub Kicinski
d350d28350 Merge branch 'selftests-hw-net-toeplitz-read-config-from-the-nic-directly'
Jakub Kicinski says:

====================
selftests: hw-net: toeplitz: read config from the NIC directly

First patch here tries to auto-disable building the iouring sample.
Our CI will still run the iouring test(s), of course, but it looks
like the liburing updates aren't very quick in distros and having
to hack around it when developing unrelated tests is a bit annoying.

Remaining 4 patches iron out running the Toeplitz hash test against
real NICs. I tested mlx5, bnxt and fbnic, they all pass now.
I switched to using YNL directly in the C code, can't see a reason
to get the info in Python and pass it to C via argv. The old code
likely did this because it predates YNL.
====================

Link: https://patch.msgid.link/20251121040259.3647749-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 18:51:44 -08:00
Jakub Kicinski
5aadc15584 selftests: hw-net: toeplitz: give the test up to 4 seconds
Increase the receiver timeout. When running between machines
in different geographic regions the test needs more than
a second to SSH across and send the frames.

The bkg() command that runs the receiver defaults to 5 sec timeout,
so using 4 sec sounds like a reasonable value for the receiver itself.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251121040259.3647749-6-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 18:51:41 -08:00
Jakub Kicinski
c0105ffc50 selftests: hw-net: toeplitz: read indirection table from the device
Replace the simple modulo math with the real indirection table
read from the device. This makes the tests pass for mlx5 and
bnxt NICs.
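
A small sketch of the change in expectation (illustrative values only):
the queue a flow should land on is no longer hash modulo the queue count
but a lookup of the RSS indirection table read from the device.

  #include <stdio.h>

  int main(void)
  {
      unsigned int hash = 0xdeadbeef;  /* Toeplitz hash of the flow */
      unsigned int nqueues = 8;
      unsigned int indir[128];         /* as reported by the NIC */

      /* example population; real tables need not follow i % nqueues */
      for (unsigned int i = 0; i < 128; i++)
          indir[i] = (i / 16) % nqueues;

      printf("naive: %u, from table: %u\n",
             hash % nqueues, indir[hash % 128]);
      return 0;
  }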

Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251121040259.3647749-5-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 18:51:41 -08:00
Jakub Kicinski
aa91dbf3ed selftests: hw-net: toeplitz: read the RSS key directly from C
Now that we have YNL support for RSS, accessing the RSS info from
C is very easy. Instead of passing the RSS key from Python, read it
directly in the C code.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251121040259.3647749-4-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 18:51:40 -08:00
Jakub Kicinski
27c512af19 selftests: hw-net: toeplitz: make sure NICs have pure Toeplitz configured
Make sure that the NIC under test is configured for pure Toeplitz
hashing, and no input key transform (no symmetric hashing).

Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251121040259.3647749-3-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 18:51:40 -08:00
Jakub Kicinski
f81171fecd selftests: hw-net: auto-disable building the iouring C code
Looks like liburing is not updated by distros very aggressively,
presumably because a lot of packages depend on it. I just updated
to Fedora 43 and it's still on liburing 2.9. The test is 9 months old;
at this stage I think this warrants handling the build failure
more gracefully.

Detect if liburing is recent enough and, if not, print a warning
and exclude the C prog from the build. The Python test will just
fail since the binary won't exist, but this removes the major
annoyance of having to update liburing from sources when
developing other tests.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251121040259.3647749-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 18:51:40 -08:00
Dan Carpenter
ef0b78b5b6 i40e: delete a stray tab
This return statement is indented one tab too far.  Delete a tab.

Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Link: https://patch.msgid.link/aSBqjtA8oF25G1OG@stanley.mountain
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 18:51:06 -08:00
Filipe Manana
9e0e6577b3 btrfs: remove unnecessary inode key in btrfs_log_all_parents()
We are setting up an inode key to look up the parent directory inode but
all we need is the inode's objectid. The use of the key was necessary in
the past but since commit 0202e83fda ("btrfs: simplify iget helpers") we
only need the objectid.

So remove the key variable from the stack and instead use a simple u64
for the inode's objectid.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:53:33 +01:00
Filipe Manana
1c3e03b340 btrfs: remove redundant zero/NULL initializations in btrfs_alloc_root()
We have allocated the root with kzalloc() so all the memory is already
zero initialized, therefore it's redundant to assign 0 and NULL to several
of the root members. Remove all of them except the atomic initializations
since atomic_t is an opaque type and it's not a good practice to assume
its internals.

This slightly reduces the binary size.
With gcc 14.2.0-19 from Debian on x86_64, before this change:

  $ size fs/btrfs/btrfs.ko
     text	   data	    bss	    dec	    hex	filename
  1939404	 162963	  15592	2117959	 205147	fs/btrfs/btrfs.ko

After this change:

  $ size fs/btrfs/btrfs.ko
     text	   data	    bss	    dec	    hex	filename
  1939212	 162963	  15592	2117767	 205087	fs/btrfs/btrfs.ko

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:53:33 +01:00
David Sterba
10934c131f btrfs: remaining BTRFS_PATH_AUTO_FREE conversions
Do the remaining btrfs_path conversion to the auto cleaning, this seems
to be the last one. Most of the conversions are trivial, only adding the
declaration and removing the freeing, or changing the goto patterns to
return.

There are some functions with many changes, like __btrfs_free_extent(),
btrfs_remove_from_free_space_tree() or btrfs_add_to_free_space_tree()
but it still follows the same pattern.

Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:53:33 +01:00
Filipe Manana
5c9cac55b7 btrfs: send: do not allocate memory for xattr data when checking it exists
When checking if xattrs were deleted we don't care about their data, but
we are allocating memory for the data and copying it, which only wastes
time and can result in an unnecessary error in case the allocation fails.
So stop allocating memory and copying data by making find_xattr() and
__find_xattr() skip those steps if the given data buffer is NULL.
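
A self-contained sketch of the pattern with stub data (not the send code
itself; names are invented): callers that only want to know whether the
attribute exists pass a NULL buffer, and the copy is skipped.

  #include <stdio.h>
  #include <string.h>

  struct xattr { const char *name; const char *value; };

  static const struct xattr table[] = {
      { "user.a", "hello" },
      { "user.b", "world" },
  };

  static int find_xattr(const char *name, char *data, size_t data_len)
  {
      for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
          if (strcmp(table[i].name, name) != 0)
              continue;
          if (data)   /* only copy when the caller wants the value */
              snprintf(data, data_len, "%s", table[i].value);
          return 0;
      }
      return -1;
  }

  int main(void)
  {
      printf("exists: %d\n", find_xattr("user.a", NULL, 0) == 0);
      return 0;
  }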

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:53:33 +01:00
Filipe Manana
7c3acdb998 btrfs: send: add unlikely to all unexpected overflow checks
There are several checks for unexpected overflows of buffers and path
lengths that make us fail the send operation with an error if, for some
highly unexpected reason, they happen. So add the unlikely tag to those
checks to hint the compiler to generate better code, while also making
it more explicit in the source that they are highly unexpected.

With gcc 14.2.0-19 from Debian on x86_64, I also got a small reduction
in the text size of the btrfs module.

Before:

  $ size fs/btrfs/btrfs.ko
     text	   data	    bss	    dec	    hex	filename
  1936917	 162723	  15592	2115232	 2046a0	fs/btrfs/btrfs.ko

After:

  $ size fs/btrfs/btrfs.ko
     text	   data	    bss	    dec	    hex	filename
  1936789	 162723	  15592	2115104	 204620	fs/btrfs/btrfs.ko
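
A hedged userspace sketch of the annotation (the overflow check and names
below are made up; the kernel's unlikely() similarly wraps
__builtin_expect): the compiler is told the overflow branch is cold, so
the hot path can be laid out first.

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>

  #define unlikely(x) __builtin_expect(!!(x), 0)

  static int append_path(char *buf, size_t cap, size_t *len, const char *name)
  {
      size_t add = strlen(name);

      if (unlikely(*len + add + 1 > cap))   /* overflow is not expected */
          return -ENAMETOOLONG;
      memcpy(buf + *len, name, add + 1);
      *len += add;
      return 0;
  }

  int main(void)
  {
      char buf[16];
      size_t len = 0;

      printf("%d\n", append_path(buf, sizeof(buf), &len, "a/rather/long/name"));
      return 0;
  }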

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:53:33 +01:00
Filipe Manana
139e3167d8 btrfs: reduce arguments to btrfs_del_inode_ref_in_log()
Instead of passing a root and the objectid of the parent directory, just
pass the directory inode, since from it we can extract both the root and
the objectid, reducing the number of arguments by one. It also makes the
function more consistent with other log tree functions in the sense that
we pass the inode and not only its objectid.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:53:33 +01:00
Filipe Manana
1361f7d8da btrfs: remove root argument from btrfs_del_dir_entries_in_log()
There's no need to pass the root as we can extract it from the directory
inode, so remove it.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:53:32 +01:00
Filipe Manana
9c78fe4a85 btrfs: use test_and_set_bit() in btrfs_delayed_delete_inode_ref()
Instead of testing and setting the BTRFS_DELAYED_NODE_DEL_IREF bit in the
delayed node's flags, use test_and_set_bit() which makes the code shorter
without compromising readability and getting rid of the label and goto.
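
A userspace sketch of the pattern (a GCC atomic builtin stands in for the
kernel's test_and_set_bit(); names are made up): the separate test, goto
and set collapse into a single atomic call whose return value says
whether the bit was already set.

  #include <stdio.h>

  #define DEL_IREF_BIT 0UL

  static unsigned long flags;

  /* returns 1 if we set the bit (and should do the work), 0 otherwise */
  static int mark_del_iref(void)
  {
      unsigned long mask = 1UL << DEL_IREF_BIT;

      if (__atomic_fetch_or(&flags, mask, __ATOMIC_SEQ_CST) & mask)
          return 0;   /* someone already queued the deletion */
      return 1;
  }

  int main(void)
  {
      int first = mark_del_iref();
      int second = mark_del_iref();

      printf("%d %d\n", first, second);
      return 0;
  }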

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Daniel Vacek <neelx@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:53:32 +01:00
Josef Bacik
70085399b1 btrfs: don't search back for dir inode item in INO_LOOKUP_USER
We don't need to search back to the inode item; the directory inode
number is in key.offset, so simply use that. If we can't find the
directory we'll get an ENOENT at the iget().

Note: The patch was taken from v5 of fscrypt patchset
(https://lore.kernel.org/linux-btrfs/cover.1706116485.git.josef@toxicpanda.com/)
which was handled over time by various people: Omar Sandoval, Sweet Tea
Dorminy, Josef Bacik.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Daniel Vacek <neelx@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add note ]
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:53:27 +01:00
Josef Bacik
0185c2292c btrfs: don't rewrite ret from inode_permission
In our user-safe ino resolve ioctl we'll just turn any ret from
inode_permission() into -EACCES. This is redundant, and could potentially
be wrong if we had an ENOMEM in the security layer or some other such
error, so simply return the actual return value.

Note: The patch was taken from v5 of fscrypt patchset
(https://lore.kernel.org/linux-btrfs/cover.1706116485.git.josef@toxicpanda.com/)
which was handled over time by various people: Omar Sandoval, Sweet Tea
Dorminy, Josef Bacik.

Fixes: 23d0b79dfa ("btrfs: Add unprivileged version of ino_lookup ioctl")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Daniel Vacek <neelx@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add note ]
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:52:24 +01:00
Josef Bacik
bd45e9e3f6 btrfs: add orig_logical to btrfs_bio for encryption
When checksumming the encrypted bio on writes we need to know which
logical address this checksum is for.  At the point where we get the
encrypted bio the bi_sector is the physical location on the target disk,
so we need to save the original logical offset in the btrfs_bio.  Then
we can use this when checksumming the bio instead of the
bio->iter.bi_sector.

Note: The patch was taken from v5 of fscrypt patchset
(https://lore.kernel.org/linux-btrfs/cover.1706116485.git.josef@toxicpanda.com/)
which was handled over time by various people: Omar Sandoval, Sweet Tea
Dorminy, Josef Bacik.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Daniel Vacek <neelx@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add note ]
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:52:23 +01:00
Sweet Tea Dorminy
45d99129b6 btrfs: disable verity on encrypted inodes
Right now extent encryption can only encrypt filenames in directories
and data in blocks on disk, so for now disable verity usage together
with encryption on btrfs.

Combining fscrypt with fsverity should be possible and can be
implemented in the future.

Note: The patch was taken from v5 of fscrypt patchset
(https://lore.kernel.org/linux-btrfs/cover.1706116485.git.josef@toxicpanda.com/)
which was handled over time by various people: Omar Sandoval, Sweet Tea
Dorminy, Josef Bacik.

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Daniel Vacek <neelx@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add note ]
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:50:56 +01:00
Omar Sandoval
f968340053 btrfs: disable various operations on encrypted inodes
Initially, only normal data extents will be encrypted. This change
restricts various other operations:

- allow reflinking only if both inodes have the same encryption status
- disable inline data on encrypted inodes

Note: The patch was taken from v5 of fscrypt patchset
(https://lore.kernel.org/linux-btrfs/cover.1706116485.git.josef@toxicpanda.com/)
which was handled over time by various people: Omar Sandoval, Sweet Tea
Dorminy, Josef Bacik.

Signed-off-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Daniel Vacek <neelx@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add note ]
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:50:56 +01:00
Sun YangKai
4357dd76f5 btrfs: remove redundant level reset in btrfs_del_items()
When btrfs_del_items() empties a leaf, it deletes the leaf unless it's
the root node. For the root leaf case, the code used to reset its level
to 0 via btrfs_set_header_level(). This is redundant as leaf nodes
always have level == 0.

Remove the unnecessary level assignment and invert the conditional to
handle only the non-root leaf deletion. The root leaf is correctly left
as-is.

Signed-off-by: Sun YangKai <sunk67188@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:50:56 +01:00
Sun YangKai
139f75a3b1 btrfs: simplify leaf traversal after path release in btrfs_next_old_leaf()
After releasing the path in btrfs_next_old_leaf(), we need to re-check
the leaf because a balance operation may have added items or removed the
last item. The original code handled this with two separate conditional
blocks, the second marked with a lengthy comment explaining a "missed
case".

Merge these two blocks into a single logical structure that handles both
scenarios more clearly.

Also update the comment to be more concise and accurate, incorporating the
explanation directly into the main block rather than a separate annotation.

Signed-off-by: Sun YangKai <sunk67188@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:50:56 +01:00
Sun YangKai
3afa17bf24 btrfs: optimize balance_level() path reference handling
Instead of incrementing refcount on 'left' node when it's referenced by
path, simply transfer ownership to path and set left to NULL. This
eliminates:

- Unnecessary refcount increment/decrement operations
- Redundant conditional checks for left node cleanup

The path now consistently owns the left node reference when used.

Signed-off-by: Sun YangKai <sunk67188@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:50:56 +01:00
Sun YangKai
31b37b7667 btrfs: factor out root promotion logic into promote_child_to_root()
The balance_level() function is overly long and contains a cold code path
that handles promoting a child node to root when the root has only one item.
This code has distinct logic that is clearer and more maintainable when
isolated in its own function.

Signed-off-by: Sun YangKai <sunk67188@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:50:56 +01:00
Qu Wenruo
1a332a6d70 btrfs: raid56: remove the "_step" infix
The following functions are introduced as a middle step for bs > ps
support:

- rbio_stripe_step_paddr()
- rbio_pstripe_step_paddr()
- rbio_qstripe_step_paddr()
- sector_step_paddr_in_rbio()

The "_step" infix was used because there was already an existing
function without the infix, with a different parameter list.

Now that the existing functions have been cleaned up, there is no need
to keep the "_step" infix, so remove it completely.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:50:19 +01:00
Qu Wenruo
8870dbeedc btrfs: raid56: enable bs > ps support
The support code for bs > ps is complete, enable it and update
assertions.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:48:52 +01:00
Saket Kumar Bhaskar
590699d858 selftests/bpf: Fix htab_update/reenter_update selftest failure
Since commit 31158ad02d ("rqspinlock: Add deadlock detection
and recovery") the update path on re-entrancy reports deadlock
via -EDEADLK instead of the previous -EBUSY.

Also, the way reentrancy was exercised (via fentry/lookup_elem_raw)
was fragile because lookup_elem_raw may be inlined (in which case
find_kernel_btf_id() returns -ESRCH).

To fix this, fentry is attached to bpf_obj_free_fields() instead of
lookup_elem_raw(), and:

- The htab map is made to use a BTF-described struct val with a
  struct bpf_timer so that check_and_free_fields() reliably calls
  bpf_obj_free_fields() on element replacement.

- The selftest is updated to do two updates to the same key (insert +
  replace) in prog_test.

- The selftest is updated to align the expected errno with the
  kernel's current behavior.

Signed-off-by: Saket Kumar Bhaskar <skb99@linux.ibm.com>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Link: https://lore.kernel.org/r/20251117060752.129648-1-skb99@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 16:48:28 -08:00
Qu Wenruo
89ca1a403e btrfs: raid56: prepare finish_parity_scrub() to support bs > ps cases
The function finish_parity_scrub() assumes each fs block can be mapped
by one page, blocking bs > ps support for raid56.

Prepare it for bs > ps cases by:

- Introduce a helper, verify_one_parity_step()
  Since the P/Q generation is always done in a vertical stripe, we have
  to handle the range step by step.

- Only clear the rbio->dbitmap if all steps of an fs block match

- Remove rbio_stripe_paddr() and sector_paddr_in_rbio() helpers
  Now we either use the paddrs version for checksum, or the step version
  for P/Q generation/recovery.

- Make alloc_rbio_essential_pages() handle bs > ps cases
  Since for bs > ps cases, one fs block needs multiple pages, the
  existing simple check against rbio->stripe_pages[] is not enough.

  Extract a dedicated helper, alloc_rbio_sector_pages(), for the
  existing alloc_rbio_essential_pages(), which is still based on sector
  number.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:48:02 +01:00
Qu Wenruo
ba88278c69 btrfs: raid56: prepare rbio_bio_add_io_paddr() to support bs > ps cases
The function rbio_bio_add_io_paddr() assumes each fs block can be
mapped by one page, blocking bs > ps support for raid56.

Prepare it for bs > ps cases by:

- Introduce a helper bio_add_paddrs()
  Previously we only need to add a single page to a bio for a fs block,
  but now we need to add multiple pages, this means we can fail halfway.

  In that case we need to properly revert the bio (only for its size
  though) for halfway failed cases.

- Rename rbio_add_io_paddr() to rbio_add_io_paddrs()
  And change the @paddr parameter to @paddrs[].

- Change all callers to use the updated rbio_add_io_paddrs()
  For the @paddrs pointer used for the new function, it can be grabbed
  using sector_paddrs_in_rbio() and rbio_stripe_paddrs() helpers.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:47:54 +01:00
Qu Wenruo
53474a2ae1 btrfs: raid56: prepare steal_rbio() to support bs > ps cases
The function steal_rbio() assumes each fs block can be mapped by
one page, blocking bs > ps support for raid56.

Prepare it for bs > ps cases by:

- Introduce two helpers to calculate the sector number
  Previously we assumed one page would contain at least one fs block,
  thus could use something like "sectors_per_page = PAGE_SIZE / sectorsize;",
  but with bs > ps support that number will be 0.

  Instead introduce two helpers:

  * page_nr_to_sector_nr()
    Returns the sector number of the first sector covered by the page.

  * page_nr_to_num_sectors()
    Returns how many sectors are covered by the page.

  And use the returned values for bitmap operations instead of the
  open-coded "PAGE_SIZE / sectorsize".
  Those helpers also have extra ASSERT()s to catch weird numbers.

- Use above helpers
  The involved functions are:
  * steal_rbio_page()
  * is_data_stripe_page()
  * full_page_sectors_uptodate()

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:47:41 +01:00
Qu Wenruo
05ddf35a5d btrfs: raid56: prepare set_bio_pages_uptodate() to support bs > ps cases
The function set_bio_pages_uptodate() assumes each fs block can be
mapped by one page, blocking bs > ps support for raid56.

Prepare it for bs > ps cases by:

- Update find_stripe_sector_nr() to check only the first step paddr
  We don't need to check each paddr, as the bios are still aligned to fs
  block size, thus checking the first step is enough.

- Use step size to iterate the bio
  This means we only need to find the sector number for the first step
  of each fs block, and skip the remaining part.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:47:31 +01:00
Qu Wenruo
64e7b8c7c5 btrfs: raid56: prepare verify_bio_data_sectors() to support bs > ps cases
The function verify_bio_data_sectors() assumes each fs block can be
mapped by one page, blocking bs > ps support for raid56.

Prepare it for bs > ps cases by:

- Make get_bio_sector_nr() consider bs > ps cases
  The function is utilized to calculate the sector number of a device
  bio submitted by btrfs raid56 layer.

- Assemble a local paddrs[] for checksum calculation

- Open code btrfs_check_block_csum()
  btrfs_check_block_csum() only supports fs blocks backed by large
  folios.

  But for raid56 we can have fs blocks backed by multiple non-contiguous
  pages, e.g. direct IO, encoded read/write/send.

  So instead of using btrfs_check_block_csum(), open code it to use
  btrfs_calculate_block_csum_pages().

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:47:20 +01:00
Qu Wenruo
e0eadfcc95 btrfs: raid56: prepare verify_one_sector() to support bs > ps cases
The function verify_one_sector() assumes each fs block can be mapped
by one page, blocking bs > ps support for raid56.

Prepare it for bs > ps cases by:

- Introduce helpers to get a paddrs pointer
  Thankfully all the higher layer bio should still be aligned to fs
  block size, thus a fs block should still be fully covered by the bio.

  Introduce sector_paddrs_in_rbio() and rbio_stripe_paddrs(), which will
  return a paddrs pointer inside btrfs_raid_bio::bio_paddrs[] or
  stripe_paddrs[].

  The pointer can be directly passed to
  btrfs_calculate_block_csum_pages() to verify the checksum.

- Open code btrfs_check_block_csum()
  btrfs_check_block_csum() only supports fs blocks backed by large
  folios.

  But for raid56 we can have fs blocks backed by multiple non-contiguous
  pages, e.g. direct IO, encoded read/write/send.

  So instead of using btrfs_check_block_csum(), open code it to use
  btrfs_calculate_block_csum_pages().

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:47:16 +01:00
Qu Wenruo
9ba67fd616 btrfs: raid56: prepare recover_vertical() to support bs > ps cases
Currently recover_vertical() assumes that every fs block can be mapped
by one page, which is blocking bs > ps support for raid56.

Prepare recover_vertical() to support bs > ps cases by:

- Introduce recover_vertical_step() helper
  Which will recover a full step (min(PAGE_SIZE, sectorsize)).

  Now recover_vertical() will do the error check for the specified
  sector, do the recover step by step, then do the sector verification.

- Fix a spelling error of get_rbio_vertical_errors()
  The old name has a typo: "veritical".

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:47:11 +01:00
Qu Wenruo
826325b6d0 btrfs: raid56: prepare generate_pq_vertical() for bs > ps cases
Unlike btrfs_calculate_block_csum_pages(), we cannot handle multiple
pages at the same time for P/Q generation.

So here we introduce a new @step_nr, and various helpers to grab the
sub-block page from the rbio, and generate the P/Q stripe page by page.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:47:05 +01:00
Qu Wenruo
91cd1b5865 btrfs: raid56: introduce a new parameter to locate a sector
Since we cannot ensure that all bios from the higher layer are backed by
large folios (e.g. direct IO, encoded read/write/send), we need the
ability to locate a sub-block (aka a page) inside a full stripe.

So the existing @stripe_nr + @sector_nr combination is not enough to
locate such a page for bs > ps cases.

Introduce a new parameter, @step_nr, to locate the page of a larger fs
block.  The naming follows the conventions used elsewhere inside btrfs,
where one step is min(sectorsize, PAGE_SIZE).

It's still a preparation, only touching the following aspects:

- btrfs_dump_rbio()
  To show the new @sector_nsteps member.

- btrfs_raid_bio::sector_nsteps
  Recording how many steps there are inside a fs block.

- Enlarge btrfs_raid_bio::*_paddrs[] size
  To take @sector_nsteps into consideration.

- index_one_bio()
- index_stripe_sectors()
- memcpy_from_bio_to_stripe()
- cache_rbio_pages()
- need_read_stripe_sectors()
  Those functions are iterating *_paddrs[], which needs to take
  sector_nsteps into consideration.

- Rename rbio_stripe_sector_index() to rbio_sector_index()
  The "stripe" part is not that helpful.

  And an extra ASSERT() before returning the result.

- Add a new rbio_paddr_index() helper
  This will take the extra @step_nr into consideration.

- The comments of btrfs_raid_bio

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:46:58 +01:00
Qu Wenruo
9042dc0002 btrfs: raid56: add an overview for the btrfs_raid_bio structure
The structure needs to track both the pages from higher layer bio and
internal pages, thus it can be a little complex to grasp.

Add an overview of the structure, especially how we track different
pages from higher layer bios and internal ones, to save some time for
future developers.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-25 01:46:29 +01:00
Viacheslav Dubeyko
6f84ceb985 hfsplus: introduce KUnit tests for HFS+ string operations
This patch implements a KUnit-based set of unit tests for HFS+ string
operations. It checks the functionality of hfsplus_strcasecmp(),
hfsplus_strcmp(), hfsplus_uni2asc(), hfsplus_asc2uni(),
hfsplus_hash_dentry(), and hfsplus_compare_dentry().

./tools/testing/kunit/kunit.py run --kunitconfig ./fs/hfsplus/.kunitconfig
[14:38:05] Configuring KUnit Kernel ...
[14:38:05] Building KUnit Kernel ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
Building with:
$ make all compile_commands.json scripts_gdb ARCH=um O=.kunit --jobs=22
[14:38:09] Starting KUnit Kernel (1/1)...
[14:38:09] ============================================================
Running tests with:
$ .kunit/linux kunit.enable=1 mem=1G console=tty kunit_shutdown=halt
[14:38:09] ============== hfsplus_unicode (27 subtests) ===============
[14:38:09] [PASSED] hfsplus_strcasecmp_test
[14:38:09] [PASSED] hfsplus_strcmp_test
[14:38:09] [PASSED] hfsplus_unicode_edge_cases_test
[14:38:09] [PASSED] hfsplus_unicode_boundary_test
[14:38:09] [PASSED] hfsplus_uni2asc_basic_test
[14:38:09] [PASSED] hfsplus_uni2asc_special_chars_test
[14:38:09] [PASSED] hfsplus_uni2asc_buffer_test
[14:38:09] [PASSED] hfsplus_uni2asc_corrupted_test
[14:38:09] [PASSED] hfsplus_uni2asc_edge_cases_test
[14:38:09] [PASSED] hfsplus_asc2uni_basic_test
[14:38:09] [PASSED] hfsplus_asc2uni_special_chars_test
[14:38:09] [PASSED] hfsplus_asc2uni_buffer_limits_test
[14:38:09] [PASSED] hfsplus_asc2uni_edge_cases_test
[14:38:09] [PASSED] hfsplus_asc2uni_decompose_test
[14:38:09] [PASSED] hfsplus_hash_dentry_basic_test
[14:38:09] [PASSED] hfsplus_hash_dentry_casefold_test
[14:38:09] [PASSED] hfsplus_hash_dentry_special_chars_test
[14:38:09] [PASSED] hfsplus_hash_dentry_decompose_test
[14:38:09] [PASSED] hfsplus_hash_dentry_consistency_test
[14:38:09] [PASSED] hfsplus_hash_dentry_edge_cases_test
[14:38:09] [PASSED] hfsplus_compare_dentry_basic_test
[14:38:09] [PASSED] hfsplus_compare_dentry_casefold_test
[14:38:09] [PASSED] hfsplus_compare_dentry_special_chars_test
[14:38:09] [PASSED] hfsplus_compare_dentry_length_test
[14:38:09] [PASSED] hfsplus_compare_dentry_decompose_test
[14:38:09] [PASSED] hfsplus_compare_dentry_edge_cases_test
[14:38:09] [PASSED] hfsplus_compare_dentry_combined_flags_test
[14:38:09] ================= [PASSED] hfsplus_unicode =================
[14:38:09] ============================================================
[14:38:09] Testing complete. Ran 27 tests: passed: 27
[14:38:09] Elapsed time: 3.875s total, 0.001s configuring, 3.707s building, 0.115s running

v2
Rework memory management model.

Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
cc: Yangtao Li <frank.li@vivo.com>
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
2025-11-24 16:12:51 -08:00
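
For orientation, a minimal KUnit suite skeleton of the kind such tests are
built on; the test body, case names and suite name below are illustrative
and differ from the actual hfsplus tests listed above:

  #include <kunit/test.h>
  #include <linux/string.h>

  /* Illustrative test case only, not one of the hfsplus tests. */
  static void example_strcmp_test(struct kunit *test)
  {
          KUNIT_EXPECT_EQ(test, 0, strcmp("abc", "abc"));
          KUNIT_EXPECT_NE(test, 0, strcmp("abc", "abd"));
  }

  static struct kunit_case example_cases[] = {
          KUNIT_CASE(example_strcmp_test),
          {}
  };

  static struct kunit_suite example_suite = {
          .name = "example",
          .test_cases = example_cases,
  };
  kunit_test_suites(&example_suite);
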
Qu Wenruo
54df8b80cc btrfs: scrub: always update btrfs_scrub_progress::last_physical
[BUG]
When a scrub fails immediately without any byte scrubbed, the returned
btrfs_scrub_progress::last_physical will always be 0, even if there is a
non-zero @start passed into btrfs_scrub_dev() for resume cases.

This will reset the progress and make later scrub resume start from the
beginning.

[CAUSE]
The function btrfs_scrub_dev() accepts a @progress parameter to copy its
updated progress to the caller, but there are cases where we either don't
touch progress::last_physical at all or copy 0 into last_physical:

- last_physical not updated at all
  If some error happened before scrubbing any super block or chunk, we
  will not copy the progress, leaving the @last_physical untouched.

  E.g. failing to allocate @sctx, scrubbing a missing device, or a scrub
  already running, and so on.

  All those cases won't touch @progress at all, leaving last_physical
  untouched and thus 0 in most cases.

- Error out before scrubbing any bytes
  In those cases we allocated @sctx, and sctx->stat.last_physical is all
  zero (initialized by kvzalloc()).
  Unfortunately some critical error happened during
  scrub_enumerate_chunks() or scrub_supers() before any stripe was really
  scrubbed.

  In that case although we will copy sctx->stat back to @progress, since
  no byte is really scrubbed, last_physical will be overwritten to 0.

[FIX]
Make sure the parameter @progress always has its @last_physical member
updated to @start parameter inside btrfs_scrub_dev().

At the very beginning of the function, set @progress->last_physical to
@start, so that even if we error out without doing progress copying,
last_physical is still at @start.

Then after @sctx is allocated, set sctx->stat.last_physical to @start as
well.  This makes sure that even if no byte got scrubbed, @last_physical
is not left as zero at the progress copying stage.

This should resolve the resume progress reset problem.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:26 +01:00
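
A plain-C sketch of the initialization pattern described in the fix above,
with illustrative names rather than the real btrfs structures: the
caller-visible progress is seeded with the resume point up front so every
early-error return still reports it.

  struct progress { unsigned long long last_physical; };

  int scrub_dev_sketch(struct progress *progress, unsigned long long start)
  {
          /* Report @start even if we error out before scrubbing anything. */
          if (progress)
                  progress->last_physical = start;

          /* ... allocate the scrub context (also seeded with @start),
           * scrub super blocks and chunks, then copy the real progress,
           * including the final last_physical, back to @progress ... */
          return 0;
  }
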
Filipe Manana
7a832b870b btrfs: place all boolean fields together in struct find_free_extent_ctl
Move the 'retry_uncached' and 'hint' fields close to the other boolean
fields so that we remove a hole from the structure and reduce its size
from 136 bytes down to 128 bytes. Currently this structure is only
allocated on the stack of btrfs_reserve_extent().

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:26 +01:00
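
A generic, standalone illustration of why grouping boolean fields removes
padding holes (this is not the actual find_free_extent_ctl layout); on a
typical LP64 ABI the first layout is 32 bytes and the second 24:

  #include <stdbool.h>
  #include <stdio.h>

  struct scattered {
          bool a;         /* 1 byte + 7 bytes of padding before 'p' */
          void *p;
          bool b;         /* 1 byte + 7 bytes of tail/alignment padding */
          void *q;
  };

  struct grouped {
          bool a;         /* the two bools share a single padded slot */
          bool b;
          void *p;
          void *q;
  };

  int main(void)
  {
          /* Typically prints 32 and 24 on an LP64 ABI. */
          printf("%zu %zu\n", sizeof(struct scattered), sizeof(struct grouped));
          return 0;
  }
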
Filipe Manana
e21756fc4a btrfs: use booleans for delalloc arguments and struct find_free_extent_ctl
The struct find_free_extent_ctl uses an int for the 'delalloc' field but
it's always used as a boolean, and its value is passed to several
functions to signal if we are dealing with delalloc. The same goes for
the 'is_data' argument of btrfs_reserve_extent(). So change the type
from int to bool and move the field definition in the find_free_extent_ctl
structure so that it's close to the other bool fields, reducing the size
of the structure from 144 down to 136 bytes (at the moment it's only
declared on the stack of btrfs_reserve_extent(), never allocated
otherwise).

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:26 +01:00
Filipe Manana
d7fe41044b btrfs: use bool type for btrfs_path members used as booleans
Many fields of struct btrfs_path are used as booleans but their type is
an unsigned int (of 1 bit width to save space). Change the type to bool,
keeping the :1 suffix so that they combine with the previous u8 fields in
order to save space. This makes the code clearer by using explicit
true/false, more in line with the preferred style, while preserving the
size of the structure.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:25 +01:00
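
A small sketch of the bool bitfield form described above; the member names
are modeled on btrfs_path but this is not the actual definition:

  #include <stdbool.h>

  struct path_flags_sketch {
          /* before: unsigned int search_commit_root:1; etc. */
          bool search_commit_root:1;
          bool skip_locking:1;
          bool keep_locks:1;
  };

  static void example(struct path_flags_sketch *p)
  {
          p->skip_locking = true;         /* explicit boolean intent */
          if (p->search_commit_root)
                  p->keep_locks = false;
  }
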
Filipe Manana
c2b2504ece btrfs: update check_skip variable after unlocking current node
There's no need to update the local variable 'check_skip' to false inside
the critical section delimited by the lock of the current node, so do it
after unlocking the node.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:25 +01:00
Filipe Manana
5d8222a50a btrfs: abort transaction on item count overflow in __push_leaf_left()
If we try to push an item count from the right leaf that is greater than
the number of items in the leaf, we just emit a warning. This should
never happen but if it does we get an underflow in the new number of
items in the right leaf and chaos follows from it. So replace the warning
with proper error handling, by aborting the transaction and returning
-EUCLEAN, and proper logging by using btrfs_crit() instead of WARN(),
which gives us proper formatting and information about the filesystem.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:25 +01:00
Filipe Manana
027358a090 btrfs: always use right leaf variable in __push_leaf_left()
The 'right' variable points to path->nodes[0] and path->nodes[0] is never
changed, but some places use 'right' while others refer to path->nodes[0].
Update all sites to use 'right': not only is it shorter, it's also easier
to reason about since it means the right leaf and avoids any confusion
with the sibling left leaf.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:25 +01:00
Filipe Manana
29bb40ed56 btrfs: remove duplicated leaf dirty status clearing in __push_leaf_right()
We have already called btrfs_clear_buffer_dirty() against the left leaf in
the code above:

  btrfs_set_header_nritems(left, left_nritems);

  if (left_nritems)
       btrfs_mark_buffer_dirty(trans, left);
  else
       btrfs_clear_buffer_dirty(trans, left);

So remove the second check for a 0 number of items in the left leaf and
the second call to btrfs_clear_buffer_dirty() against the left leaf.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:25 +01:00
Filipe Manana
7447263d7d btrfs: always use left leaf variable in __push_leaf_right()
The 'left' variable points to path->nodes[0] and path->nodes[0] is never
changed, but some places use 'left' while others refer to path->nodes[0].
Update all sites to use 'left': not only is it shorter, it's also easier
to reason about since it means the left leaf and avoids any confusion
with the sibling right leaf.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:25 +01:00
Filipe Manana
fad159f69e btrfs: add unlikely to critical error in btrfs_extend_item()
It's not expected to get a data size greater than the leaf's free space,
which would lead to a leaf dump and BUG(), so tag the if statement's
expression as unlikely, hinting the compiler to potentially generate
better code.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:25 +01:00
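
For reference, a standalone sketch of the unlikely() annotation (the
kernel macro wraps __builtin_expect the same way); the check itself is
illustrative and not the btrfs_extend_item() code:

  #include <stdio.h>

  #define unlikely(x) __builtin_expect(!!(x), 0)

  static int extend_check_sketch(int free_space, int data_size)
  {
          if (unlikely(free_space < data_size)) {
                  /* cold path, hinted so the compiler can move it out of line */
                  fprintf(stderr, "not enough free space\n");
                  return -1;
          }
          return 0;
  }

  int main(void)
  {
          return extend_check_sketch(100, 10) ? 1 : 0;
  }
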
Filipe Manana
86d3dc812f btrfs: remove pointless return value update in btrfs_del_items()
The call to btrfs_del_leaf() can only return an error (negative value) or
zero (success). If we didn't get an error then 'ret' is zero, so it's
pointless to set it to zero again.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:24 +01:00
Filipe Manana
e7dd1182fc btrfs: fix leaf leak in an error path in btrfs_del_items()
If the call to btrfs_del_leaf() fails we return without decrementing the
extra ref we took on the leaf, therefore leaking it. Fix this by ensuring
we drop the ref count before returning the error.

Fixes: 751a27615d ("btrfs: do not BUG_ON() on tree mod log failures at btrfs_del_ptr()")
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:24 +01:00
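
A generic plain-C sketch of the leak pattern and its fix, with
illustrative names rather than the btrfs helpers: every exit path taken
after grabbing the extra reference must drop it, including the error
return.

  struct buf { int refs; };

  static void buf_get(struct buf *b) { b->refs++; }
  static void buf_put(struct buf *b) { b->refs--; }

  static int try_delete(struct buf *leaf) { (void)leaf; return -5; /* simulated failure */ }

  static int del_items_sketch(struct buf *leaf)
  {
          int ret;

          buf_get(leaf);          /* extra reference, like the leaf ref above */
          ret = try_delete(leaf);
          buf_put(leaf);          /* drop it on success and on error alike */
          return ret;
  }
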
Zhen Ni
280dd7c106 btrfs: fix incomplete parameter rename in btrfs_decompress()
Commit 2c25716dcc ("btrfs: zlib: fix and simplify the inline extent
decompression") renamed the 'start_byte' parameter to 'dest_pgoff' in
btrfs_decompress(). The remaining 'start_byte' references are
inconsistent with the actual implementation and may confuse developers.

Ensure consistency between the function declaration and the
implementation.

Signed-off-by: Zhen Ni <zhen.ni@easystack.cn>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:24 +01:00
David Sterba
1c094e6cce btrfs: make a few more ASSERTs verbose
We have support for an optional string to be printed in ASSERT() (added
in 19468a623a ("btrfs: enhance ASSERT() to take optional format
string")), but it's not yet used everywhere it could be, so add it to a
few more files.

Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:24 +01:00
Qu Wenruo
ec20799064 btrfs: enable encoded read/write/send for bs > ps cases
Since the read verification and read repair now both support bs > ps
without large folios, we can enable encoded read/write/send.

Now we can relax the alignment in assert_bbio_alignment() to
min(blocksize, PAGE_SIZE).
But also add the extra blocksize based alignment check for the logical
and length of the bbio.

There is a pitfall in btrfs_add_compress_bio_folios(), which relies on
the folios passed in meeting the minimal folio order.
But now we can pass in regular page sized folios, so update it to check
each folio's size instead of using the minimal folio size.

This even allows btrfs_add_compress_bio_folios() to handle folio arrays
with mixed sizes, though thankfully we don't yet need to handle such a
crazy situation.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:24 +01:00
Qu Wenruo
052fd7a5ca btrfs: make read verification handle bs > ps cases without large folios
The current read verification also relies on large folios to support
bs > ps cases, but that introduces quite some limits.

To enhance read-repair to support bs > ps without large folios:

- Make btrfs_data_csum_ok() accept an array of paddrs
  Which can pass the paddrs[] direct into
  btrfs_calculate_block_csum_pages().

- Make repair_one_sector() accept an array of paddrs
  So that it can submit a repair bio backed by regular pages, not only
  large folios.
  This requires us to allocate more slots at bio allocation time though.

  Also since the caller may have only partially advanced the saved_iter
  for bs > ps cases, we cannot directly trust the logical bytenr from
  saved_iter (it can be unaligned), thus a manual round down of the
  logical bytenr is necessary.

- Make btrfs_check_read_bio() build an array of paddrs
  The tricky part is that we can only call btrfs_data_csum_ok() after
  all involved pages are assembled.

  This means at the call time of btrfs_check_read_bio(), our offset
  inside the bio is already at the end of the fs block.
  Thus we must re-calculate @bio_offset for btrfs_data_csum_ok() and
  repair_one_sector().

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:24 +01:00
Qu Wenruo
2574e90110 btrfs: make btrfs_repair_io_failure() handle bs > ps cases without large folios
Currently btrfs_repair_io_failure() only accepts a single @paddr
parameter, and for bs > ps cases it's required that @paddr is backed by
a large folio.

That assumption has quite some limitations, preventing us from utilizing
true zero-copy direct-io and encoded read/writes.

To address the problem, enhance btrfs_repair_io_failure() by:

- Accept an array of paddrs, up to 64K / PAGE_SIZE entries
  This kind of acts like a bio_vec, but with very limited entries, as the
  function is only utilized to repair one fs data block, or a tree block.

  Both have an upper size limit (BTRFS_MAX_BLOCK_SIZE, i.e. 64K), so we
  don't need the full bio_vec thing to handle it.

- Allocate a bio with multiple slots
  Previously even for bs > ps cases, we only passed in a contiguous
  physical address range, thus a single slot will be enough.

  But not anymore, so we have to allocate a bio structure, other than
  using the on-stack one.

- Use on-stack memory for the @paddrs array
  It's at most 16 entries (4K page size, 64K block size), taking up at
  most 128 bytes.
  I think the on-stack cost is still acceptable.

- Add one extra check to make sure the repair bio is exactly one block

- Utilize btrfs_repair_io_failure() to submit a single bio for metadata
  This should improve the read-repair performance for metadata, as now
  we submit a node-sized bio and then wait, instead of submitting each
  block of the metadata and waiting for each submitted block.

- Add one extra parameter indicating the step
  This is due to the fact that metadata step can be as large as
  nodesize, instead of sectorsize.
  So we need a way to distinguish metadata and data repair.

- Reduce the width of the @length parameter of btrfs_repair_io_failure()
  Since we only call btrfs_repair_io_failure() on a single data or
  metadata block, u64 is overkill.
  Use u32 instead and add an extra ASSERT() to make sure the length
  never exceeds BTRFS_MAX_BLOCK_SIZE.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:23 +01:00
Qu Wenruo
62bcbdca0e btrfs: make btrfs_csum_one_bio() handle bs > ps without large folios
For bs > ps cases, all fs blocks passed into btrfs_csum_one_bio() are
ensured to be backed by large folios.  But that requirement excludes
features like direct IO and encoded writes.

To support bs > ps without large folios, enhance btrfs_csum_one_bio()
by:

- Split btrfs_calculate_block_csum() into two versions
  * btrfs_calculate_block_csum_folio()
    For call sites where a fs block is always backed by a large folio.

    This will do extra checks on the folio size, build a paddrs[] array,
    and pass it into the newer btrfs_calculate_block_csum_pages()
    helper.

    For now btrfs_check_block_csum() is still using this version.

  * btrfs_calculate_block_csum_pages()
    For call sites that may hit a fs block backed by noncontiguous pages.
    The pages are represented by paddrs[] array, which includes the
    offset inside the page.

    This function will do the proper sub-block handling.

- Make btrfs_csum_one_bio() use btrfs_calculate_block_csum_pages()
  This means we will need to build a local paddrs[] array, and after
  filling a fs block, do the checksum calculation.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:23 +01:00
Filipe Manana
fe1e50031f btrfs: move struct reserve_ticket definition to space-info.c
It's not used anywhere outside space-info.c so move it from space-info.h
into space-info.c.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:23 +01:00
David Sterba
4decf577fb btrfs: move and rename CSUM_FMT definition
Move the CSUM_FMT* definitions to fs.h, where BTRFS_KEY_FMT is, and add
the prefix for consistency.

Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:23 +01:00
Sun YangKai
a320476ca8 btrfs: tests: do trivial BTRFS_PATH_AUTO_FREE conversions
Trivial pattern for the auto freeing where there are no operations
between btrfs_free_path() and the function returns.

Signed-off-by: Sun YangKai <sunk67188@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:23 +01:00
Qu Wenruo
5387bd9581 btrfs: raid56: remove sector_ptr structure
Since the sector_ptr structure now only contains a single paddr, there
is no need to use that structure.

Instead use a phys_addr_t array for the bio and stripe pointers.

This means several helpers also need to accept a paddr instead of
a sector_ptr pointer.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:23 +01:00
Qu Wenruo
1810350b04 btrfs: raid56: move sector_ptr::uptodate into a dedicated bitmap
The uptodate boolean member can be extracted into a bitmap, which will
save us some space (1 bit per sector instead of a whole byte).

Furthermore we do not need to record the uptodate bitmap for bio
sectors: if bio_sectors[].paddr is valid, it means there is a bio and
the sector will be uptodate.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:22 +01:00
Qu Wenruo
17d552ab9b btrfs: raid56: remove sector_ptr::has_paddr member
We can use paddr -1 as an indicator for an unset/uninitialized paddr.

We cannot use a 0 paddr: unlike virtual address 0, which is never mapped
and thus will always trigger a page fault, physical address 0 may be a
valid page.

So here we follow swiotlb and use (paddr)-1 as a special indicator for
an invalid/unset physical address.

Even if the PFN may still be valid, our usage of the physical address
should always be aligned to the fs block size (or page size for bs > ps
cases), thus such a -1 paddr should never be a valid one.

With this special -1 paddr, we can get rid of the has_paddr member and
save 1 byte in the sector_ptr structure.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:22 +01:00
Baolin Liu
9b3743a676 btrfs: simplify list initialization in btrfs_compr_pool_scan()
In btrfs_compr_pool_scan(), use LIST_HEAD() to declare and initialize
the 'remove' list_head in one step instead of using INIT_LIST_HEAD()
separately.

Signed-off-by: Baolin Liu <liubaolin@kylinos.cn>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:22 +01:00
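
The two forms referred to above, shown side by side as a sketch; both
macros come from <linux/list.h>, while the surrounding functions are
illustrative only:

  #include <linux/list.h>

  static void before_sketch(void)
  {
          struct list_head remove;
          INIT_LIST_HEAD(&remove);        /* two steps: declare, then initialize */
  }

  static void after_sketch(void)
  {
          LIST_HEAD(remove);              /* declared and initialized in one step */
  }
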
Qu Wenruo
07166122b5 btrfs: scrub: factor out parity scrub code into a helper
The function scrub_raid56_parity_stripe() handles the parity stripe
with the following steps:

- Scrub each data stripe
  And make sure everything is fine in each data stripe

- Cache the data stripe into the raid bio

- Use the cached raid bio to scrub the target parity stripe

Extract the last two steps into a new helper,
scrub_raid56_cached_parity(), as a cleanup and make the error handling
more straightforward.

With the following minor cleanups:

- Use an on-stack bio structure
  The bio is always empty thus we do not need any bio vector nor the
  block device.  So there is no need to allocate a bio, the on-stack
  one is more than enough.

- Remove the unnecessary btrfs_put_bioc() call if btrfs_map_block()
  failed
  If btrfs_map_block() fails, @bioc_ret will not be touched, thus
  there is no need to call btrfs_put_bioc() in this case.

- Use a proper out: tag to do the cleanup
  Now the error cleanup is much shorter and simpler, just
  btrfs_bio_counter_dec() and bio_uninit().

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:22 +01:00
Qu Wenruo
d435c51365 btrfs: make sure extent and csum paths are always released in scrub_raid56_parity_stripe()
Unlike queue_scrub_stripe() which uses the global sctx->extent_path and
sctx->csum_path which are always released at the end of scrub_stripe(),
scrub_raid56_parity_stripe() uses local extent_path and csum_path, as
that function is going to handle the full stripe, whose bytenr may be
smaller than the bytenr in the global sctx paths.

However the cleanup of the local extent/csum paths only happens after
we have successfully submitted an rbio.

There are several error routes where we don't release those two paths:

- scrub_find_fill_first_stripe() errors out at the csum tree search
  In that case extent_path is still valid, and that function itself will
  not release the extent_path passed in.
  And we return directly without releasing either path.

- The full stripe is empty
- Some blocks failed to be recovered
- btrfs_map_block() failed
- raid56_parity_alloc_scrub_rbio() failed
  The function returns directly without releasing either path.

Fix it by moving the btrfs_release_path() calls under the out: label.

This is just a hotfix; in the long run we will switch to scope-based
auto freeing for both local paths.

Fixes: 1dc4888e72 ("btrfs: scrub: avoid unnecessary extent tree search preparing stripes")
Fixes: 3c771c1944 ("btrfs: scrub: avoid unnecessary csum tree search preparing stripes")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:22 +01:00
Qu Wenruo
cfc7fe2b0f btrfs: use kvcalloc for btrfs_bio::csum allocation
[BUG]
There is a report that memory allocation failed for btrfs_bio::csum
during a large read:

  b2sum: page allocation failure: order:4, mode:0x40c40(GFP_NOFS|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0
  CPU: 0 UID: 0 PID: 416120 Comm: b2sum Tainted: G        W           6.17.0 #1 NONE
  Tainted: [W]=WARN
  Hardware name: Raspberry Pi 4 Model B Rev 1.5 (DT)
  Call trace:
   show_stack+0x18/0x30 (C)
   dump_stack_lvl+0x5c/0x7c
   dump_stack+0x18/0x24
   warn_alloc+0xec/0x184
   __alloc_pages_slowpath.constprop.0+0x21c/0x730
   __alloc_frozen_pages_noprof+0x230/0x260
   ___kmalloc_large_node+0xd4/0xf0
   __kmalloc_noprof+0x1c8/0x260
   btrfs_lookup_bio_sums+0x214/0x278
   btrfs_submit_chunk+0xf0/0x3c0
   btrfs_submit_bbio+0x2c/0x4c
   submit_one_bio+0x50/0xac
   submit_extent_folio+0x13c/0x340
   btrfs_do_readpage+0x4b0/0x7a0
   btrfs_readahead+0x184/0x254
   read_pages+0x58/0x260
   page_cache_ra_unbounded+0x170/0x24c
   page_cache_ra_order+0x360/0x3bc
   page_cache_async_ra+0x1a4/0x1d4
   filemap_readahead.isra.0+0x44/0x74
   filemap_get_pages+0x2b4/0x3b4
   filemap_read+0xc4/0x3bc
   btrfs_file_read_iter+0x70/0x7c
   vfs_read+0x1ec/0x2c0
   ksys_read+0x4c/0xe0
   __arm64_sys_read+0x18/0x24
   el0_svc_common.constprop.0+0x5c/0x130
   do_el0_svc+0x1c/0x30
   el0_svc+0x30/0xa0
   el0t_64_sync_handler+0xa0/0xe4
   el0t_64_sync+0x198/0x19c

[CAUSE]
Btrfs needs to allocate memory for btrfs_bio::csum for large reads, so
that we can later verify the contents of the read.

However nowadays a read bio can easily go beyond BIO_MAX_VECS *
PAGE_SIZE (which is 1M for 4K page size), due to multi-page bvecs:
one bvec can cover more than one page, as long as the pages are
physically adjacent.

This will become more common when the large folio support is moved out
of experimental features.

In the above case, a read larger than 4MiB with SHA256 checksums (32
bytes for each 4K block) is able to trigger an order-4 allocation.

Order 4 is larger than PAGE_ALLOC_COSTLY_ORDER (3), thus without
extra flags such an allocation will not retry.

And if the system has a very small amount of memory (e.g. an RPi4 with a
low memory spec) or is a VM with small vRAM, or the memory is heavily
fragmented, such an allocation will fail and cause the above warning.

[FIX]
Although btrfs handles the memory allocation failure correctly, we
do not really need physically contiguous memory just to store our
checksums.

In fact btrfs_csum_one_bio() is already using kvzalloc() to reduce the
memory pressure.

So follow suit and use kvcalloc() for btrfs_bio::csum.

Reported-by: Calvin Owens <calvin@wbinvd.org>
Link: https://lore.kernel.org/linux-btrfs/20251105180054.511528-1-calvin@wbinvd.org/
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:22 +01:00
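
To make the order-4 figure concrete, a small standalone calculation; the
read size below is just an example consistent with the report above (4K
blocks, 32-byte SHA256 checksums):

  #include <stdio.h>

  int main(void)
  {
          unsigned long read_bytes = 8UL << 20;  /* e.g. an 8 MiB read */
          unsigned long block = 4096;            /* 4K fs blocks */
          unsigned long csum  = 32;              /* SHA-256 checksum per block */
          unsigned long need  = read_bytes / block * csum;

          /* 8 MiB / 4 KiB = 2048 blocks * 32 B = 64 KiB, i.e. 16 physically
           * contiguous pages (order 4) if allocated with plain kmalloc(). */
          printf("csum buffer: %lu KiB\n", need >> 10);
          return 0;
  }
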
Gladyshev Ilya
1dac8db80c btrfs: don't generate any code from ASSERT() in release builds
The current definition of ASSERT(cond) as (void)(cond) is redundant,
since these checks have no side effects and don't affect code logic.

However, some checks contain READ_ONCE() or other compiler-unfriendly
constructs. For example, ASSERT(list_empty) in btrfs_add_dealloc_inode()
was compiled to a redundant mov instruction due to this issue.

Define ASSERT as BUILD_BUG_ON_INVALID for !CONFIG_BTRFS_ASSERT builds,
which uses the sizeof(cond) trick.  Also mark full_page_sectors_uptodate()
as __maybe_unused to suppress the "unneeded declaration" warning (it's
needed at compile time).

Signed-off-by: Gladyshev Ilya <foxido@foxido.dev>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:22 +01:00
Qu Wenruo
dd57c78aec btrfs: introduce btrfs_bio::async_csum
[ENHANCEMENT]
Btrfs currently calculates data checksums then submits the bio.

But after commit 968f19c5b1 ("btrfs: always fallback to buffered write
if the inode requires checksum"), any write with data checksum will
fall back to buffered IO, meaning the content will not change during
writeback.

This means we're safe to calculate the data checksum and submit the bio
in parallel, and only need the following new behavior:

- Wait for the csum generation to finish before calling btrfs_bio::end_io()
  Otherwise this can lead to a use-after-free for the csum generation
  worker.

- Save the current bi_iter for csum_one_bio()
  The submission part can advance btrfs_bio::bio.bi_iter; if it's not
  saved, csum_one_bio() may get an empty bi_iter and not generate any
  checksum.

  Unfortunately this means we have to increase the size of btrfs_bio by
  16 bytes, but this is still acceptable.

As usual, such new feature is hidden behind the experimental flag.

[THEORETICAL ANALYSIS]
Consider the following theoretical hardware performance figures, which
should be more or less close to modern mainstream hardware:

	Memory bandwidth:	50GiB/s
	CRC32C bandwidth:	45GiB/s
	SSD bandwidth:		8GiB/s

Then write bandwidth with data checksum before the patch is:

	1 / ( 1 / 50 + 1 / 45 + 1 / 8) = 5.98 GiB/s

After the patch, the bandwidth is:

	1 / ( 1 / 50 + max(1 / 45, 1 / 8)) = 6.90 GiB/s

The difference is 15.32% improvement.

[REAL WORLD BENCHMARK]
I'm using a Zen5 (HX 370) as the host, the VM has 4GiB memory, 10 vCPUs, the
storage is backed by a PCIe gen3 x4 NVMe.

The test is a direct IO write, with 1MiB block size, write 7GiB data
into a btrfs mount with data checksum. Thus the direct write will
fallback to buffered one:

Vanilla Datasum:	1619.97 MiB/s
Patched Datasum:	1792.26 MiB/s
Diff			+10.6 %

In my case the bottleneck is the storage, thus the improvement does not
reach the theoretical one, but it is still an observable improvement.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:21 +01:00
Qu Wenruo
39bc80216a btrfs: relax btrfs_inode::ordered_tree_lock IRQ locking context
We used the IRQ version of the spinlock for ordered_tree_lock, as
btrfs_finish_ordered_extent() can be called from end_bbio_data_write(),
which used to run in IRQ context.

However since we're moving all the btrfs_bio::end_io() calls into task
context, there is no more need to support IRQ context, thus we can relax
to regular spin_lock()/spin_unlock() for btrfs_inode::ordered_tree_lock.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:21 +01:00
Qu Wenruo
4bbdce8417 btrfs: remove btrfs_fs_info::compressed_write_workers
The reason why end_bbio_compressed_write() queues a work item onto the
compressed_write_workers wq is the end_compressed_writeback() call,
which grabs all the involved folios and clears their writeback flags,
and may sleep.
However now we always run btrfs_bio::end_io() in task context, there is
no need to queue the work anymore.

Just remove btrfs_fs_info::compressed_write_workers and
compressed_bio::write_end_work.

There is a comment about the works queued onto
compressed_write_workers; change it to mention flushing the endio wq
instead, which is responsible for handling all data endio functions.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:42:19 +01:00
Qu Wenruo
4591c3ef75 btrfs: make sure all btrfs_bio::end_io are called in task context
[BACKGROUND]
Btrfs has a lot of different bi_end_io functions, to handle different
raid profiles. But they introduced a lot of different contexts for
btrfs_bio::end_io() calls:

- Simple read bios
  Run in task context, backed by either endio_meta_workers or
  endio_workers.

- Simple write bios
  Run in IRQ context.

- RAID56 write or rebuild bios
  Run in task context, backed by rmw_workers.

- Mirrored write bios
  Run in irq context.

This is inconsistent, and contributes to the number of workqueues used
in btrfs.

[ENHANCEMENT]
Make all the above bios call their btrfs_bio::end_io() in task context,
backed by either endio_meta_workers for metadata, or endio_workers for
data.

For simple write bios, merge the handling into simple_end_io_work().

For mirrored write bios, it will be a little more complex, since either
the original or the cloned bio can run the final btrfs_bio::end_io().

Here we make sure the cloned bios are using btrfs_bioset, to reuse the
end_io_work, and run both original and cloned work inside the workqueue.

Add extra ASSERT()s to make sure btrfs_bio_end_io() is running in task
context.

This not only unifies the context for btrfs_bio::end_io() functions, but
also opens a new door for further btrfs_bio::end_io() related cleanups.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:40:21 +01:00
Qu Wenruo
81cea6cd70 btrfs: remove btrfs_bio::fs_info by extracting it from btrfs_bio::inode
Currently there is only one caller which doesn't populate
btrfs_bio::inode, and that's scrub.

The idea is that scrub doesn't want any automatic csum verification nor
read-repair, as everything will be handled by scrub itself.

However that behavior is really no different from the metadata inode,
thus we can reuse the btree_inode as btrfs_bio::inode for scrub.

The only exception is in btrfs_submit_chunk(), where we set
rst_search_commit_root to true if a bbio is from scrub or the data reloc
inode.
This means we still need a way to distinguish scrub from metadata, but
that can be done by a new flag inside btrfs_bio.

Now btrfs_bio::inode is a mandatory parameter, we can extract fs_info
from that inode thus can remove btrfs_bio::fs_info to save 8 bytes from
btrfs_bio structure.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:40:16 +01:00
Qu Wenruo
c5667f9c8e btrfs: headers cleanup to remove unnecessary local includes
[BUG]
When I tried to remove btrfs_bio::fs_info and use btrfs_bio::inode to
grab the fs_info, the header "btrfs_inode.h" is needed to access the
full btrfs_inode structure.

Then btrfs will fail to compile.

[CAUSE]
There is a recursive including chain:

  "bio.h" -> "btrfs_inode.h" -> "extent_map.h" -> "compression.h" ->
  "bio.h"

That recursive including is causing problems for btrfs.

[ENHANCEMENT]
To reduce the risk of recursive including:

- Remove unnecessary local includes from btrfs headers
  Either the included header is pulled in by other headers, or is
  completely unnecessary.

- Remove btrfs local includes if the header only requires a pointer
  In that case let the implementing C file pull the required header.

  This is especially important for headers like "btrfs_inode.h" which
  pulls in a lot of other btrfs headers, thus it's a mine field of
  recursive including.

- Remove unnecessary temporary structure definitions
  Either because we have already included the header defining the
  structure, or because they are completely unused.

Now including "btrfs_inode.h" inside "bio.h" is completely fine,
although "btrfs_inode.h" still includes "extent_map.h", but that header
only includes "fs.h", no more chain back to "bio.h".

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:34:52 +01:00
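
A generic sketch of the "header only requires a pointer" case mentioned
above; the file names are illustrative, only the forward-declaration idea
is the point:

  /* some_header.h: only a pointer is passed around, so forward-declare
   * instead of pulling in the full header. */
  struct btrfs_inode;                     /* no #include "btrfs_inode.h" here */
  void handle_inode(struct btrfs_inode *inode);

  /* some_impl.c: the implementation dereferences the type, so it includes
   * the header defining struct btrfs_inode itself. */
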
Qu Wenruo
afc04c8b1b btrfs: replace BTRFS_MAX_BIO_SECTORS with BIO_MAX_VECS
It's impossible to have a btrfs bio with more than BIO_MAX_VECS vectors
anyway, and there is only one location using that macro, so just replace
it with BIO_MAX_VECS. Both have the same value.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:34:52 +01:00
Andy Shevchenko
c913649c1b btrfs: replace const_ilog2() with ilog2()
const_ilog2() was a workaround for a sparse issue that has never
appeared in the C functions. Replace it with ilog2().

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:34:52 +01:00
Johannes Thumshirn
51070655e7 btrfs: zoned: show statistics for zoned filesystems
Provide statistics for zoned filesystems. These statistics include the
number of active block-groups, how many of them are reclaimable or unused,
whether the filesystem needs to be reclaimed, the currently assigned
relocation and treelog block-groups if they're present, and a list of
active zones.

Example:
  active block-groups: 4
    reclaimable: 0
    unused: 2
    need reclaim: false
  data relocation block-group: 4294967296
  active zones:
    start: 1610612736, wp: 344064 used: 16384, reserved: 0, unusable: 327680
    start: 1879048192, wp: 34963456 used: 131072, reserved: 0, unusable: 34832384
    start: 4026531840, wp: 0 used: 0, reserved: 0, unusable: 0
    start: 4294967296, wp: 0 used: 0, reserved: 0, unusable: 0

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:34:52 +01:00
Miquel Sabaté Solà
252877a870 btrfs: add ASSERTs on prealloc in qgroup functions
The prealloc variable in these functions is always initialized to
NULL. Whenever we allocate memory for it, if the allocation fails then
NULL is preserved, otherwise we delegate ownership of the pointer to
add_qgroup_rb() and set it to NULL right after.

Since in any case the pointer ends up being NULL at the end of its
usage, we can safely remove the kfree() calls for it, while adding an
ASSERT() as an extra check.

Signed-off-by: Miquel Sabaté Solà <mssola@mssola.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:34:51 +01:00
Miquel Sabaté Solà
7ab5d01d58 btrfs: apply the AUTO_K(V)FREE macros throughout the code
Apply the AUTO_KFREE and AUTO_KVFREE macros wherever it makes
sense. Since these macros are expected to improve code readability, they
have been avoided in places where the lifetime of objects wasn't easy to
follow and a cleanup attribute would've made things worse, or when the
cleanup section of a function involved many other things and thus there
was no readability gain anyway. This change has also not been applied to
extremely short functions where readability was clearly not an issue.

Signed-off-by: Miquel Sabaté Solà <mssola@mssola.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:34:51 +01:00
Miquel Sabaté Solà
d00cbce0a7 btrfs: define the AUTO_KFREE/AUTO_KVFREE helper macros
These are two simple macros which ensure that a pointer is initialized
to NULL and with the proper cleanup attribute for it.

Signed-off-by: Miquel Sabaté Solà <mssola@mssola.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:34:51 +01:00
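
A minimal userspace illustration of the scope-based cleanup pattern such
macros typically wrap; the actual AUTO_KFREE/AUTO_KVFREE definitions in
btrfs may differ, this just uses the GCC/Clang cleanup attribute directly:

  #include <stdlib.h>

  static void auto_free(void *p)
  {
          free(*(void **)p);              /* p points at the annotated pointer */
  }
  #define AUTO_FREE __attribute__((cleanup(auto_free)))

  int demo(void)
  {
          AUTO_FREE char *buf = malloc(64);   /* freed on every return path */

          if (!buf)
                  return -1;
          /* ... use buf ... */
          return 0;
  }
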
Miquel Sabaté Solà
285c3ab28e btrfs: declare free_ipath() via DEFINE_FREE()
The free_ipath() function was being used as a cleanup function
everywhere. Declare it via DEFINE_FREE() so we can use this function
with the __free() helper.

The name has also been adjusted so it's closer to the type's name.

Signed-off-by: Miquel Sabaté Solà <mssola@mssola.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:34:51 +01:00
Qu Wenruo
937f99c736 btrfs: scrub: cancel the run if there is a pending signal
Unlike relocation, scrub never checks for pending signals, and even
relocation only explicitly checks for a fatal signal (SIGKILL), not for
regular ones.

Thankfully relocation can still be interrupted by regular signals
through its use of wait_on_bit(), which is called with TASK_INTERRUPTIBLE.

Do the same for scrub/dev-replace, so that regular signals can also
cancel the scrub/replace run, and more importantly to handle v2 cgroup
freezing, which is based on the signal handling code inside the kernel;
the freezing() function will not return true for v2 cgroup freezing.

This will address the problem that systemd slice freezing will timeout
on long running scrub/dev-replace.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:34:32 +01:00
Qu Wenruo
c7b478504b btrfs: scrub: cancel the run if the process or fs is being frozen
It's a known bug that btrfs scrub/dev-replace can prevent the system
from suspending.

There are at least two factors involved:

- Holding super_block::s_writers for the whole scrub/dev-replace duration
  We hold that percpu rw semaphore through mnt_want_write_file() for the
  whole scrub/dev-replace duration.

  That will prevent the fs from being frozen, which can be initiated by
  either the user (e.g. fsfreeze) or power management suspend/hibernate.

- Stuck in the kernel space for a long time
  During suspend all user processes (and some kernel threads) will
  be frozen.
  But if a user space process has entered the kernel (scrub ioctl) and
  does not return for a long time, it will make process freezing time out.

  Unfortunately scrub/dev-replace is a long running ioctl, and it will
  prevent the btrfs process from returning to user space, thus making PM
  suspend/hibernate time out.

Address them in one go:

- Introduce a new helper should_cancel_scrub()
  Which includes the existing cancel request and new fs/process freezing
  checks.

  Here we have to check both fs and process freezing for PM
  suspend/hibernate.

  PM can be configured to freeze filesystems before processes.
  (The current default is not to freeze filesystems, but the plan is to
  make freezing filesystems the new default.)

  Checking only fs freezing will fail PM without fs freezing, as the
  process freezing will time out.

  Checking only process freezing will fail PM with fs freezing since the
  fs freezing happens before process freezing.

  And the return value will indicate the reason, -ECANCELED for
  explicitly canceled runs, and -EINTR for fs freeze or PM reasons.

- Cancel the run if should_cancel_scrub() is true
  Unfortunately canceling is the only feasible solution here; pausing is
  not possible as we would still stay in kernel space and thus still
  prevent the process from being frozen.

This will cause a user impacting behavior change:

  Dev-replace can be interrupted by PM, and there is no way to resume
  other than starting from the beginning again.

This means dev-replace may fail on newer kernels, and end users will
need extra steps like using systemd-inhibit to prevent
suspend/hibernate, to get back the old uninterrupted behavior.

This behavior change will need extra documentation updates and
communication with projects involving scrub/dev-replace including
btrfs-progs.
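
A rough sketch of the new helper described above (field names and the
exact freeze checks are approximations):

  static bool should_cancel_scrub(const struct scrub_ctx *sctx)
  {
          /* Explicit cancellation requested through the cancel ioctl. */
          if (atomic_read(&sctx->cancel_req))
                  return true;
          /* Process freezing (PM suspend/hibernate without fs freeze). */
          if (freezing(current))
                  return true;
          /* Filesystem freeze (fsfreeze or PM freezing filesystems). */
          if (sctx->fs_info->sb->s_writers.frozen != SB_UNFROZEN)
                  return true;
          return false;
  }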

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Link: https://lore.kernel.org/linux-btrfs/d93b2a2d-6ad9-4c49-809f-11d769a6f30a@app.fastmail.com/
Reported-by: Chris Murphy <lists@colorremedies.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:24:52 +01:00
Qu Wenruo
02a7e90797 btrfs: scrub: add cancel/pause/removed bg checks for raid56 parity stripes
For raid56, data and parity stripes are handled differently.

For data stripes they are handled just like regular RAID1/RAID10 stripes,
going through the regular scrub_simple_mirror().

But for parity stripes we have to read out all involved data stripes and
do any needed verification and repair, then scrub the parity stripe.

This process will take a much longer time than a regular stripe, but
unlike scrub_simple_mirror(), we do not check if we should cancel/pause
or whether the block group is already removed.

Align the behavior of scrub_raid56_parity_stripe() with
scrub_simple_mirror() by adding:

- Cancel check
- Pause check
- Removed block group check

Since those checks are the same as in scrub_simple_mirror(), also
update the comments of scrub_simple_mirror() by:

- Remove too obvious comments
  We do not need extra comments on what we're checking, it's really too
  obvious.

- Remove a stale comment about pausing
  Now that scrub always queues all involved stripes and submits them
  in one go, there is no longer a submission step during pausing.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:21:38 +01:00
Filipe Manana
38e03b820e btrfs: annotate as unlikely fs aborted checks in space flushing code
It's not expected to have the fs in an aborted state, so surround the
abort checks with unlikely to make it clear it's unexpected and to
hint the compiler to generate better code.

Also at maybe_fail_all_tickets() untangle all the repeated checks for an
abort into a single if-then-else. This makes things more readable
and makes the compiler generate less code. On x86_64 with gcc 14.2.0-19
from Debian I got the following object size differences.

Before this change:

  $ size fs/btrfs/btrfs.ko
     text	   data	    bss	    dec	    hex	filename
  2021606	 179704	  25088	2226398	 21f8de	fs/btrfs/btrfs.ko

After this change:

  $ size fs/btrfs/btrfs.ko
     text	   data	    bss	    dec	    hex	filename
  2021458	 179704	  25088	2226250	 21f84a	fs/btrfs/btrfs.ko

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:20:29 +01:00
Filipe Manana
f912f0af13 btrfs: avoid space_info locking when checking if tickets are served
When checking if a ticket was served, we take the space_info's spinlock.
If the ticket was served (its ->bytes is 0) or had an error (its ->error
is not 0) then we just unlock the space_info and return.

This however causes contention on the space_info's spinlock, which is
heavily used (space reservation, space flushing, allocating and
deallocating an extent from a block group (btrfs_update_block_group()),
etc).

Instead of using the space_info's spinlock to check if a ticket was
served, use a per ticket spinlock which isn't used by anyone other than
the task that created the ticket (stack allocated) and the task that
serves the ticket (a reclaim task or any task deallocating space that
ends up at btrfs_try_granting_tickets()).

After applying this patch and all previous patches from the same patchset
(many attempt to reduce space_info critical sections), lockstat showed
some improvements for a fs_mark test regarding the space_info's spinlock
'lock'. The lockstat results:

Before patchset:

  con-bounces:     13733858
  contentions:     15902322
  waittime-total:  264902529.72
  acq-bounces:     28161791
  acquisitions:    38679282

After patchset:

  con-bounces:     12032220
  contentions:     13598034
  waittime-total:  221806127.28
  acq-bounces:     24717947
  acquisitions:    34103281
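
The idea, as a simplified sketch (field names approximate):

  struct reserve_ticket {
          u64 bytes;
          int error;
          spinlock_t lock;  /* new: protects bytes/error of this ticket only */
          /* ... */
  };

  /* Check whether the ticket was served, without space_info->lock: */
  spin_lock(&ticket->lock);
  served = (ticket->bytes == 0 || ticket->error != 0);
  spin_unlock(&ticket->lock);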

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:18:21 +01:00
Filipe Manana
50a51b5378 btrfs: move ticket wakeup and finalization to remove_ticket()
Instead of repeating the wakeup and setup of the ->bytes or ->error field,
move those steps to remove_ticket() to avoid duplication. This is also
needed for the next patch in the series, so that we avoid duplicating more
logic.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:18:17 +01:00
Filipe Manana
cdf8a566ee btrfs: add data_race() in btrfs_account_ro_block_groups_free_space()
Surround the intentional empty list check with the data_race() annotation
so that tools like KCSAN don't report a data race. The race is intentional
as it's harmless and we want to avoid lock contention of the space_info
since its lock is heavily used (space reservation, space flushing, extent
allocation and deallocation, etc).
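
The change is essentially the following (the checked list name is an
approximation):

  /* Intentional unlocked check, harmless if it races with list updates. */
  if (data_race(list_empty(&sinfo->ro_bgs)))
          return 0;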

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:16:27 +01:00
Filipe Manana
8b6e1f5dce btrfs: remove pointless label and goto from unpin_extent_range()
There's no need to have an 'out' label and jump there in case we can
not find a block group. We can simply return directly since there are no
resources to release, removing the need for the label and the 'ret'
variable.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:16:23 +01:00
Filipe Manana
36574363b7 btrfs: reduce block group critical section in unpin_extent_range()
There's no need to update the bytes_pinned, bytes_readonly and
max_extent_size fields of the space_info while inside the critical section
delimited by the block group's lock. So move that out of the block group's
critical section, but still inside the space_info's critical section.
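
The resulting locking shape, as a sketch (not the exact code):

  spin_lock(&space_info->lock);
  spin_lock(&bg->lock);
  /* update only the block group's own counters here */
  spin_unlock(&bg->lock);
  /* update space_info->bytes_pinned, ->bytes_readonly, ->max_extent_size
   * here, still under space_info->lock but outside the block group lock */
  spin_unlock(&space_info->lock);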

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:16:20 +01:00
Filipe Manana
4cb0abc1cf btrfs: change 'reserved' argument from pin_down_extent() to bool
It's used as a boolean, so convert it from int type to bool type.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:16:14 +01:00
Filipe Manana
8dcb8e4b11 btrfs: remove 'reserved' argument from btrfs_pin_extent()
All callers pass a value of 1 (true) to it, so remove it.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:16:10 +01:00
Filipe Manana
ec8022cd26 btrfs: use local variable for space_info in pin_down_extent()
Instead of dereferencing the block group multiple times to access its
space_info, use a local variable to shorten the code horizontally and
make it easier to read. Also, while at it, rename the block group
argument from 'cache' to 'bg', as the cache name is confusing and comes
from the old days when the block group structure was named
'btrfs_block_group_cache'.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:15:58 +01:00
Filipe Manana
585416766d btrfs: reduce block group critical section in pin_down_extent()
There's no need to update the bytes_reserved and bytes_may_use fields of
the space_info while holding the block group's spinlock. We are only
making the critical section longer than necessary. So move the space_info
updates outside of the block group's critical section.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:15:13 +01:00
Filipe Manana
c0d0b13d27 btrfs: reduce block group critical section in do_trimming()
There's no need to update the bytes_reserved and bytes_readonly fields of
the space_info while holding the block group's spinlock. We are only
making the critical section longer than necessary. So move the space_info
updates outside of the block group's critical section.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:15:09 +01:00
Filipe Manana
a270cb420c btrfs: reduce block group critical section in btrfs_add_reserved_bytes()
We are doing some things inside the block group's critical section that
are relevant only to the space_info: updating the space_info counters
bytes_reserved and bytes_may_use as well as trying to grant tickets
(calling btrfs_try_granting_tickets()), and the latter can take some
time. So move all those updates to outside the block group's critical
section and still inside the space_info's critical section. Like this
we keep the block group's critical section only for block group updates
and can help reduce contention on a block group's lock.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:14:40 +01:00
Filipe Manana
8b6fa164ab btrfs: reduce block group critical section in btrfs_free_reserved_bytes()
There's no need to update the space_info fields (bytes_reserved,
max_extent_size, bytes_readonly, bytes_zone_unusable) while holding the
block group's spinlock. So move those updates to happen after we unlock
the block group (and while holding the space_info locked of course), so
that all we do under the block group's critical section is to update the
block group itself.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:14:06 +01:00
Filipe Manana
f7a32dd2a6 btrfs: reduce space_info critical section in btrfs_chunk_alloc()
There's no need to update local variables while holding the space_info's
spinlock, since the update isn't using anything from the space_info. So
move these updates outside the critical section to shorten it.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:13:25 +01:00
Filipe Manana
b70c32f10a btrfs: remove double underscore prefix from __reserve_bytes()
The use of a double underscore prefix is discouraged and we have no
justification at all for it since there's no reserve_bytes() counterpart.
So remove the prefix.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:13:00 +01:00
Filipe Manana
189db25105 btrfs: process ticket outside global reserve critical section
In steal_from_global_rsv() there's no need to process the ticket inside
the critical section of the global reserve. Move the ticket processing to
happen after the critical section. This helps reduce contention on the
global reserve's spinlock.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:12:37 +01:00
Filipe Manana
5ca7725ddf btrfs: assign booleans to global reserve's full field
We have a couple places that are assigning 0 and 1 to the full field of
the global reserve. This is harmless since 0 is converted to false and
1 is converted to true, but for better readability, replace these with true
and false since the field is of type bool.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:12:07 +01:00
Filipe Manana
f18a203a1b btrfs: assert space_info is locked in steal_from_global_rsv()
The caller is supposed to have locked the space_info, so assert that.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:11:40 +01:00
Filipe Manana
afbc047ab0 btrfs: avoid unnecessary reclaim calculation in priority_reclaim_metadata_space()
If the given ticket was already served (its ->bytes is 0), then we wasted
time calculating the metadata reclaim size. So calculate it only after we
checked the ticket was not yet served.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:10:18 +01:00
Filipe Manana
4ddb077378 btrfs: shorten critical section in btrfs_preempt_reclaim_metadata_space()
We are doing a lot of small calculations and assignments while holding the
space_info's spinlock, which is a heavily used lock for space reservation
and flushing. There's no point in holding the lock for so long when all we
want is to call need_preemptive_reclaim() and get a consistent value for a
couple of counters from the space_info. Instead, grab the counters into
local variables, release the lock and then use the local variables.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:09:46 +01:00
Filipe Manana
8ab2b8bdbe btrfs: increment loop count outside critical section during metadata reclaim
In btrfs_preempt_reclaim_metadata_space() there's no need to increment the
local variable that tracks the number of iterations of the while loop
while inside the critical section delimited by the space_info's spinlock.
That spinlock is heavily used by space reservation and flushing code, so
it's desirable to have its critical sections as short as possible.
So move the loop count increment outside the critical section.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:09:14 +01:00
Filipe Manana
49f204be22 btrfs: bail out earlier from need_preemptive_reclaim() if we have tickets
Instead of doing some calculations and then returning false if it turns
out we have queued tickets, check first if we have tickets and return
false immediately, without wasting time on those computations.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:08:46 +01:00
Filipe Manana
6f4779faa0 btrfs: inline btrfs_space_info_used()
The function is simple enough to be inlined and in fact doing so even
reduces the object code. On x86_64 with gcc 14.2.0-19 from Debian the
results were the following:

  Before this change

    $ size fs/btrfs/btrfs.ko
      text	   data	    bss	    dec	    hex	filename
    1919410	 161703	  15592	2096705	 1ffe41	fs/btrfs/btrfs.ko

  After this change

    $ size fs/btrfs/btrfs.ko
      text	   data	    bss	    dec	    hex	filename
    1918991	 161675	  15592	2096258	 1ffc82	fs/btrfs/btrfs.ko

Also remove the ASSERT() that checks the space_info argument is not NULL,
as it's odd to be there since it can never be NULL and in case that ever
happens during development, a stack trace from a NULL pointer dereference
will be obvious. It was originally added when btrfs_space_info_used() was
introduced in commit 4136135b08 ("Btrfs: use helper to get used bytes
of space_info").

Also add a lockdep assertion to check the space_info's lock is being held
by the calling task.
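
The inlined helper then looks roughly like this sketch (the exact
signature may differ):

  static inline u64 btrfs_space_info_used(const struct btrfs_space_info *s_info,
                                          bool may_use_included)
  {
          lockdep_assert_held(&s_info->lock);

          return s_info->bytes_used + s_info->bytes_reserved +
                 s_info->bytes_pinned + s_info->bytes_readonly +
                 s_info->bytes_zone_unusable +
                 (may_use_included ? s_info->bytes_may_use : 0);
  }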

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:08:14 +01:00
Filipe Manana
0ce6300fec btrfs: avoid used space computation when reserving space
In __reserve_bytes() we have 3 repeated calls to btrfs_space_info_used(),
one early on as soon as we take the space_info's spinlock, another one when
we call btrfs_can_overcommit(), which calls btrfs_space_info_used() again,
and a final one when we are reserving for a flush emergency.

During all these calls we are holding the space_info's spinlock, which is
heavily used by the space reservation and flushing code, so it's desirable
to make the critical sections as short as possible.

So make this more efficient by:

1) Instead of calling btrfs_can_overcommit() call the new variant
   can_overcommit() which takes the space_info's used space as an argument
   and pass the value we already computed and have in the 'used' variable;

2) Instead of calling btrfs_space_info_used() with its second argument as
   false when we are doing a flush emergency, decrement the space_info's
   bytes_may_use counter from the 'used' variable, as the difference
   between passing true or false as the second argument to
   btrfs_space_info_used() is whether or not to include the space_info's
   bytes_may_use counter in the computation.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:07:36 +01:00
Filipe Manana
a5f8f64aa3 btrfs: avoid used space computation when trying to grant tickets
In btrfs_try_granting_tickets(), we call btrfs_can_overcommit() and that
calls btrfs_space_info_used(). But we already keep track, in the 'used'
local variable, of the used space in the space_info, so we are just
repeating the same computation and doing an extra function call while we
are holding the space_info's spinlock, which is heavily used by the space
reservation and flushing code.

So add a local variant of btrfs_can_overcommit() that takes in the used
space as an argument and therefore does not call btrfs_space_info_used(),
and use it in btrfs_try_granting_tickets().
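
A simplified sketch of the local variant (the real signature and checks
may differ slightly):

  static bool can_overcommit(struct btrfs_space_info *space_info, u64 bytes,
                             u64 used, enum btrfs_reserve_flush_enum flush)
  {
          u64 avail;

          /* Data space is never overcommitted. */
          if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
                  return false;

          avail = calc_available_free_space(space_info, flush);
          return used + bytes < space_info->total_bytes + avail;
  }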

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:07:26 +01:00
Filipe Manana
563ef2befb btrfs: make btrfs_can_overcommit() return bool instead of int
It's a boolean function, so switch its return type to bool.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:06:26 +01:00
Filipe Manana
60532c2136 btrfs: avoid recomputing used space in btrfs_try_granting_tickets()
In every iteration of the loop we call btrfs_space_info_used() which sums
a bunch of fields from a space_info object. This implies doing a function
call besides the sum, and we are holding the space_info's spinlock while
we do this, so we want to keep the critical section as short as possible
since that spinlock is used in all the code for space reservation and
flushing (therefore it's heavily used).

So call btrfs_space_info_used() only once, before entering the loop,
and then update the used value as we remove tickets.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:05:06 +01:00
Filipe Manana
063171a4f0 btrfs: return real error when failing tickets in maybe_fail_all_tickets()
In case we had a transaction abort we set a ticket's error to -EIO, but we
have the real error that caused the transaction to be aborted returned by
the macro BTRFS_FS_ERROR(). So use that real error instead of -EIO.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:04:30 +01:00
Qu Wenruo
988f693a46 btrfs: subpage: simplify the PAGECACHE_TAG_TOWRITE handling
In function btrfs_subpage_set_writeback() we need to keep the
PAGECACHE_TAG_TOWRITE tag if the folio is still dirty.

This is a needed quirk to support async extents, as a subpage range can
suddenly go to writeback without touching other subpage ranges in
the same folio.

However we can simplify the handling by replacing the open-coded tag
clearing with passing the @keep_write flag depending on whether the folio
is dirty.

Since we're holding the subpage lock already, no one is able to change
the dirty/writeback flag, thus it's safe to check the folio dirty before
calling __folio_start_writeback().
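
The simplification is essentially (sketch):

  bool keep_write = folio_test_dirty(folio);

  /* Let __folio_start_writeback() decide whether to keep the TOWRITE tag,
   * instead of clearing it by hand afterwards. */
  __folio_start_writeback(folio, keep_write);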

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:03:38 +01:00
Filipe Manana
ca428e9b49 btrfs: remove pointless data_end assignment in btrfs_extent_item()
There's no point in setting 'data_end' to 'old_data' as we don't use it
afterwards. So remove the redundant assignment which was never needed
and added when the function was first added in commit 6567e837df
("Btrfs: early work to file_write in big extents").

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:03:02 +01:00
Filipe Manana
af1e800c02 btrfs: use the key format macros when printing keys
Change all locations that print a key to use the new macros to print
them in order to ensure a consistent style and avoid repetitive code.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:03:02 +01:00
Filipe Manana
95de4b097e btrfs: add macros to facilitate printing of keys
There are a lot of places where we need to print a key, and it's tiresome
to type the format specifier, typically "(%llu %u %llu)", as well as
passing 3 arguments to a printk family function (key->objectid, key->type,
key->offset).

So add a couple macros for this just like we have for csum values in
btrfs_inode.h (CSUM_FMT and CSUM_FMT_VALUE).

This also ensures that we consistently print a key in the same format,
always as "(%llu %llu %llu)", which is the most common format we use, but
we have a few variations such as "[%llu %llu %llu]" for no good reason.

This patch introduces the macros while the next one makes use of them.
This is to ease backports of future patches, since then we can backport
this patch which is simple and short and then backport those future
patches, as the next patch in the series that makes use of these new
macros is quite large and may have some dependencies.
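
The macros could look roughly like the following, in the CSUM_FMT /
CSUM_FMT_VALUE style (the names here are placeholders, not necessarily
the ones introduced by the patch):

  #define BTRFS_KEY_FMT            "(%llu %llu %llu)"
  #define BTRFS_KEY_FMT_VALUE(key) (key)->objectid, (u64)(key)->type, \
                                   (key)->offset

  /* usage: */
  btrfs_err(fs_info, "bad key " BTRFS_KEY_FMT, BTRFS_KEY_FMT_VALUE(&key));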

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:03:02 +01:00
Xuanqiang Luo
225e747ea5 btrfs: remove redundant refcount check in btrfs_put_transaction()
Eric Dumazet removed the redundant refcount check for sk_refcnt, and I
noticed a similar issue in btrfs_put_transaction().
refcount_dec_and_test() already checks for a zero refcount and
complains, making the preceding WARN_ON redundant. This is a leftover
from the atomic_t times.
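
That is, the change amounts to removing the first line of this pattern
(a sketch of btrfs_put_transaction(), not the exact code):

  WARN_ON(refcount_read(&transaction->use_count) == 0);  /* redundant */
  if (refcount_dec_and_test(&transaction->use_count))
          kfree(transaction);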

Signed-off-by: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:03:02 +01:00
Filipe Manana
a232ff90d1 btrfs: remove fs_info argument from btrfs_zoned_activate_one_bg()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:02:57 +01:00
Filipe Manana
771af6ff72 btrfs: remove fs_info argument from btrfs_sysfs_add_space_info_type()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 22:02:30 +01:00
Sun YangKai
7fc35cc559 btrfs: more trivial BTRFS_PATH_AUTO_FREE conversions
Convert more of the trivial pattern for the auto freeing of btrfs_path
with goto -> return conversions where applicable.

Signed-off-by: Sun YangKai <sunk67188@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:11 +01:00
Filipe Manana
a1359d06d7 btrfs: remove fs_info argument from btrfs_reserve_metadata_bytes()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:11 +01:00
Filipe Manana
30b87a2319 btrfs: remove fs_info argument from __reserve_bytes()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:11 +01:00
Filipe Manana
09d0f28531 btrfs: fix parameter documentation for btrfs_reserve_data_bytes()
We don't have a fs_info argument anymore since commit 5d39fda880
("btrfs: pass btrfs_space_info to btrfs_reserve_data_bytes()"), it
was replaced by a space_info argument. So update the documentation.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:11 +01:00
Filipe Manana
5495cbe920 btrfs: remove fs_info argument from maybe_clamp_preempt()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:11 +01:00
Filipe Manana
e182eca6ed btrfs: remove fs_info argument from handle_reserve_ticket()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:11 +01:00
Filipe Manana
ddeac2a12b btrfs: remove fs_info argument from steal_from_global_rsv()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:10 +01:00
Filipe Manana
d77b22de56 btrfs: remove fs_info argument from need_preemptive_reclaim()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:10 +01:00
Filipe Manana
4199eb2761 btrfs: remove fs_info argument from btrfs_calc_reclaim_metadata_size()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:10 +01:00
Filipe Manana
3ee1246536 btrfs: remove fs_info argument from shrink_delalloc() and flush_space()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:10 +01:00
Filipe Manana
e96059c9d7 btrfs: remove fs_info argument from btrfs_dump_space_info()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:10 +01:00
Filipe Manana
78a77f4da4 btrfs: remove fs_info argument from btrfs_can_overcommit()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:10 +01:00
Filipe Manana
302b4b69c4 btrfs: remove fs_info argument from calc_available_free_space()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:09 +01:00
Filipe Manana
1b809e3055 btrfs: remove fs_info argument from maybe_fail_all_tickets()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:09 +01:00
Filipe Manana
cf3ae29caf btrfs: remove fs_info argument from priority_reclaim_metadata_space()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:09 +01:00
Filipe Manana
f63b36686b btrfs: remove fs_info argument from priority_reclaim_data_space()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:09 +01:00
Filipe Manana
e3df6408b1 btrfs: remove fs_info argument from btrfs_try_granting_tickets()
We don't need it since we can grab fs_info from the given space_info.
So remove the fs_info argument.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:09 +01:00
Filipe Manana
f1ae05b8ea btrfs: avoid repeated computations in btrfs_mark_ordered_io_finished()
We're computing a few values several times:

1) The current ordered extent's end offset inside the while loop, we have
   computed it and stored it in the 'entry_end' variable but then we
   compute it again later as the first argument to the min() macro;

2) The end file offset, open coded 3 times;

3) The current length (stored in variable 'len') computed 2 times, once
   inside an assertion and again when assigning to the 'len' variable.

So use existing variables and add new ones to prevent repeating these
expressions and reduce the source code.

We were also subtracting one from the result of the min() macro call and
then adding 1 back in the next line, making both operations pointless.
So just remove the decrement and increment by 1.

This also reduces very slightly the object code.

Before:

  $ size fs/btrfs/btrfs.ko
     text	   data	    bss	    dec	    hex	filename
  1916576	 161679	  15592	2093847	 1ff317	fs/btrfs/btrfs.ko

After:

  $ size fs/btrfs/btrfs.ko
     text	   data	    bss	    dec	    hex	filename
  1916556	 161679	  15592	2093827	 1ff303	fs/btrfs/btrfs.ko

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:09 +01:00
Filipe Manana
3b7c0c20b7 btrfs: avoid multiple i_size rounding in btrfs_truncate()
We have the inode locked so no one can concurrently change its i_size and
neither do we change it ourselves, so there's no point in repeatedly
rounding it in the while loop and setting it up in the control structure.
That only causes confusion when reading the code.

So move all the i_size setup and rounding out of the loop and assert the
inode is locked.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:09 +01:00
Filipe Manana
b917a94a4c btrfs: consistently round up or down i_size in btrfs_truncate()
We're using different ways to round down the i_size by sector size, one
with a bitwise and with a negated mask and another with ALIGN_DOWN(), and
using ALIGN() to round up.

Replace these uses with the round_down() and round_up() macros, which
have names that make the direction of the rounding clear (unlike the
ALIGN() macro), getting rid of the bitwise AND, the negated mask and the
local variable for the mask.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:08 +01:00
Filipe Manana
28fe58ce6a btrfs: add unlikely to unexpected error case in extent_writepages()
We don't expect to hit errors and log the error message, so add the
unlikely annotation to make it clear and to hint the compiler that it may
reorganize code to be more efficient.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:08 +01:00
Filipe Manana
74ca34f79e btrfs: split assertion into two in extent_writepage_io()
If the assertion fails we don't get to know which of the two expressions
failed and neither the values used in each expression.

So split the assertion into two, each for a single expression, so that
if either is triggered we see a line number reported in a stack trace that
points to which expression failed. Also make the assertions use the
verbose mode to print the values involved in the computations.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:08 +01:00
Filipe Manana
46a2390859 btrfs: use variable for end offset in extent_writepage_io()
Instead of repeating the expression "start + len" multiple times, store it
in a variable and use it where needed.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:08 +01:00
Filipe Manana
18de34daa7 btrfs: truncate ordered extent when skipping writeback past i_size
While running test case btrfs/192 from fstests with support for large
folios (needs CONFIG_BTRFS_EXPERIMENTAL=y) I ended up getting very sporadic
btrfs check failures reporting that csum items were missing. Looking into
the issue it turned out that btrfs check searches for csum items of a file
extent item with a range that spans beyond the i_size of a file and we
don't have any, because the kernel's writeback code skips submitting bios
for ranges beyond eof. It's not expected however to find a file extent item
that crosses the rounded up (by the sector size) i_size value, but there is
a short time window where we can end up with a transaction commit leaving
this small inconsistency between the i_size and the last file extent item.

Example btrfs check output when this happens:

  $ btrfs check /dev/sdc
  Opening filesystem to check...
  Checking filesystem on /dev/sdc
  UUID: 69642c61-5efb-4367-aa31-cdfd4067f713
  [1/8] checking log skipped (none written)
  [2/8] checking root items
  [3/8] checking extents
  [4/8] checking free space tree
  [5/8] checking fs roots
  root 5 inode 332 errors 1000, some csum missing
  ERROR: errors found in fs roots
  (...)

Looking at a tree dump of the fs tree (root 5) for inode 332 we have:

   $ btrfs inspect-internal dump-tree -t 5 /dev/sdc
   (...)
        item 28 key (332 INODE_ITEM 0) itemoff 2006 itemsize 160
                generation 17 transid 19 size 610969 nbytes 86016
                block group 0 mode 100666 links 1 uid 0 gid 0 rdev 0
                sequence 11 flags 0x0(none)
                atime 1759851068.391327881 (2025-10-07 16:31:08)
                ctime 1759851068.410098267 (2025-10-07 16:31:08)
                mtime 1759851068.410098267 (2025-10-07 16:31:08)
                otime 1759851068.391327881 (2025-10-07 16:31:08)
        item 29 key (332 INODE_REF 340) itemoff 1993 itemsize 13
                index 2 namelen 3 name: f1f
        item 30 key (332 EXTENT_DATA 589824) itemoff 1940 itemsize 53
                generation 19 type 1 (regular)
                extent data disk byte 21745664 nr 65536
                extent data offset 0 nr 65536 ram 65536
                extent compression 0 (none)
   (...)

We can see that the file extent item for file offset 589824 has a length of
64K and its number of bytes is 64K. Looking at the inode item we see that
its i_size is 610969 bytes which falls within the range of that file extent
item [589824, 655360[.

Looking into the csum tree:

  $ btrfs inspect-internal dump-tree /dev/sdc
  (...)
        item 15 key (EXTENT_CSUM EXTENT_CSUM 21565440) itemoff 991 itemsize 200
                range start 21565440 end 21770240 length 204800
           item 16 key (EXTENT_CSUM EXTENT_CSUM 1104576512) itemoff 983 itemsize 8
                range start 1104576512 end 1104584704 length 8192
  (..)

We see that the csum item number 15 covers the first 24K of the file extent
item - it ends at offset 21770240 and the extent's disk_bytenr is 21745664,
so we have:

   21770240 - 21745664 = 24K

We see that the next csum item (number 16) is completely outside the range,
so the remaining 40K of the extent doesn't have csum items in the tree.

If we round up the i_size to the sector size, we get:

   round_up(610969, 4096) = 614400

If we subtract from that the file offset for the extent item we get:

   614400 - 589824 = 24K

So the missing 40K corresponds to the end of the file extent item's range
minus the rounded up i_size:

   655360 - 614400 = 40K

Normally we don't expect a file extent item to span over the rounded up
i_size of an inode, since when truncating, doing hole punching and other
operations that trim a file extent item, the number of bytes is adjusted.

There is however a short time window where the kernel can end up,
temporarily, persisting an inode with an i_size that falls in the middle of
the last file extent item while the file extent item was not yet trimmed (its
number of bytes reduced so that it doesn't cross i_size rounded up by the
sector size).

The steps (in the kernel) that lead to such a scenario are the following:

 1) We have inode I as an empty file, no allocated extents, i_size is 0;

 2) A buffered write is done for file range [589824, 655360[ (length of
    64K) and the i_size is updated to 655360. Note that we got a single
    large folio for the range (64K);

 3) A truncate operation starts that reduces the inode's i_size down to
    610969 bytes. The truncate sets the inode's new i_size at
    btrfs_setsize() by calling truncate_setsize() and before calling
    btrfs_truncate();

 4) At btrfs_truncate() we trigger writeback for the range starting at
    610304 (which is the new i_size rounded down to the sector size) and
    ending at (u64)-1;

 5) During the writeback, at extent_write_cache_pages(), we get from the
    call to filemap_get_folios_tag(), the 64K folio that starts at file
    offset 589824 since it contains the start offset of the writeback
    range (610304);

 6) At writepage_delalloc() we find the whole range of the folio is dirty
    and therefore we run delalloc for that 64K range ([589824, 655360[),
    reserving a 64K extent, creating an ordered extent, etc;

 7) At extent_writepage_io() we submit IO only for subrange [589824, 614400[
    because the inode's i_size is 610969 bytes (rounded up by sector size
    is 614400). There, in the while loop we intentionally skip IO beyond
    i_size to avoid any unnecessary work and just call
    btrfs_mark_ordered_io_finished() for the range [614400, 655360[ (which
    has a 40K length);

 8) Once the IO finishes we finish the ordered extent by ending up at
    btrfs_finish_one_ordered(), join transaction N, insert a file extent
    item in the inode's subvolume tree for file offset 589824 with a number
    of bytes of 64K, and update the inode's delayed inode item or directly
    the inode item with a call to btrfs_update_inode_fallback(), which
    results in storing the new i_size of 610969 bytes;

 9) Transaction N is committed either by the transaction kthread or some
    other task committed it (in response to a sync or fsync for example).

    At this point we have inode I persisted with an i_size of 610969 bytes
    and file extent item that starts at file offset 589824 and has a number
    of bytes of 64K, ending at an offset of 655360 which is beyond the
    i_size rounded up to the sector size (614400).

    --> So after a crash or power failure here, the btrfs check program
        reports that error about missing checksum items for this inode, as
        it tries to look up checksums covering the whole range of the
        extent;

10) Only after transaction N is committed does the call to
    btrfs_start_transaction() at btrfs_truncate() start a new transaction,
    N + 1, instead of joining transaction N. And it's with transaction N + 1 that
    it calls btrfs_truncate_inode_items() which updates the file extent
    item at file offset 589824 to reduce its number of bytes from 64K down
    to 24K, so that the file extent item's range ends at the i_size
    rounded up to the sector size (614400 bytes).

Fix this by truncating the ordered extent at extent_writepage_io() when we
skip writeback because the current offset in the folio is beyond i_size.
This ensures we don't ever persist a file extent item with a number of
bytes beyond the rounded up (by sector size) value of the i_size.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:59:08 +01:00
Qu Wenruo
803e115657 btrfs: implement remove_bdev and shutdown super operation callbacks
For the ->remove_bdev() callback, btrfs will:

- Mark the target device as missing

- Go degraded if the fs can afford it

- Return an error otherwise
  Thus falling back to the shutdown callback

For the ->shutdown callback, btrfs will:

- Set the SHUTDOWN flag
  Which will reject all new incoming operations, and make all writeback
  fail.

  The behavior is the same as the NOLOGFLUSH behavior.

To support the lookup from bdev to a btrfs_device,
btrfs_dev_lookup_args is enhanced to have a new @devt member.
If set, we should be able to use that @devt member to uniquely locate a
btrfs device.

I know the shutdown can be a little overkill: if one has RAID1
metadata and RAID0 data, one can still read data with a 50%
chance of getting good data.

But a filesystem returning -EIO half of the time is not really
considered usable.
Furthermore, it can be just as bad as the only device going missing on a
single-device btrfs.

So here we err on the side of caution when handling a missing device.

And the remove_bdev callback will be hidden behind experimental features
for now, for the following reasons:

- There are not enough btrfs specific bdev removal test cases
  The existing test cases are all removing the only device, thus they
  only exercise the ->shutdown() behavior.

- The expected behavior is not yet determined
  Although the current auto-degrade behavior is no worse than the old
  behavior, it may not always be what the end users want.

  Until there is a concrete interface, it's better to hide the new feature
  from end users.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Tested-by: Anand Jain <asj@kernel.org>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:58:40 +01:00
Qu Wenruo
6b1ac78dd0 btrfs: implement shutdown ioctl
The shutdown ioctl should follow the XFS one, which uses magic number 'X'
and ioctl number 125, with a uint32 as flags.

For now btrfs doesn't distinguish the DEFAULT and LOGFLUSH flags (just like
f2fs); both will freeze the fs first (which implies committing the current
transaction), set the SHUTDOWN flag and finally thaw the fs.

For the NOLOGFLUSH flag, the freeze/thaw part is skipped and thus the
current transaction is aborted.

The new shutdown ioctl is hidden behind experimental features for more
testing.
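
Based on the description above, the ioctl definition presumably mirrors
the XFS one (the btrfs name here is a placeholder):

  /* XFS: #define XFS_IOC_GOINGDOWN  _IOR('X', 125, __uint32_t) */
  #define BTRFS_IOC_SHUTDOWN         _IOR('X', 125, __u32)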

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Tested-by: Anand Jain <asj@kernel.org>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:56:17 +01:00
Qu Wenruo
9b2839451d btrfs: introduce a new shutdown state
A new fs state EMERGENCY_SHUTDOWN is introduced, which is btrfs'
equivalent of XFS_IOC_GOINGDOWN or EXT4_IOC_SHUTDOWN. After entering the
emergency shutdown state, all operations will return errors (-EIO), and
the fs cannot be brought back to a normal state until unmount.

The new state will reject the following file operations:

- read_iter()
- write_iter()
- mmap()
- open()
- remap_file_range()
- uring_cmd()
- splice_read()
  This requires a small wrapper to do the extra shutdown check, then call
  the regular filemap_splice_read() function
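
  A minimal sketch of such a wrapper (the shutdown check helper name is a
  placeholder):

    static ssize_t btrfs_splice_read(struct file *in, loff_t *ppos,
                                     struct pipe_inode_info *pipe,
                                     size_t len, unsigned int flags)
    {
            struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(in)->i_sb);

            if (btrfs_is_shutdown(fs_info))  /* placeholder for the new check */
                    return -EIO;
            return filemap_splice_read(in, ppos, pipe, len, flags);
    }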

This should reject most of the file operations on a shutdown btrfs.

And for the existing dirty folios, extra shutdown checks are introduced
to the following functions:

- run_delalloc_nocow()
- run_delalloc_compressed()
- cow_file_range()

So that dirty ranges will still be properly cleaned without being
submitted.

Finally the shutdown state will also set the fs error, so that no new
transaction will be committed, protecting the metadata from any possible
further corruption.

And when the fs enters shutdown mode for the first time, a critical-level
kernel message will show up to indicate the incident.

That message will be important for end users as rejected delalloc ranges
will output error messages; hopefully that shutdown message and the fact
that all fs operations are returning errors will prevent end users from
getting too confused about the delalloc error messages.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anand Jain <asj@kernel.org>
Tested-by: Anand Jain <asj@kernel.org>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:45:03 +01:00
Filipe Manana
892794c025 btrfs: use end_pos variable where needed in btrfs_dirty_folio()
We have a couple places doing the computation "pos + write_bytes" when we
already have it in the local variable "end_pos". Change them to use the
variable instead and make the source code smaller. Also make the variable
const since it's not supposed to change.

This also has a very slight reduction in the module size.

Before:

   $ size fs/btrfs/btrfs.ko
      text	   data	    bss	    dec	    hex	filename
   1915990	 161647	  15592	2093229	 1ff0ad	fs/btrfs/btrfs.ko

After:

   $ size fs/btrfs/btrfs.ko
      text	   data	    bss	    dec	    hex	filename
   1915974	 161647	  15592	2093213	 1ff09d	fs/btrfs/btrfs.ko

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:40:05 +01:00
Boris Burkov
38e818718c btrfs: fix racy bitfield write in btrfs_clear_space_info_full()
From the memory-barriers.txt document regarding memory barrier ordering
guarantees:

 (*) These guarantees do not apply to bitfields, because compilers often
     generate code to modify these using non-atomic read-modify-write
     sequences.  Do not attempt to use bitfields to synchronize parallel
     algorithms.

 (*) Even in cases where bitfields are protected by locks, all fields
     in a given bitfield must be protected by one lock.  If two fields
     in a given bitfield are protected by different locks, the compiler's
     non-atomic read-modify-write sequences can cause an update to one
     field to corrupt the value of an adjacent field.

btrfs_space_info has a bitfield sharing an underlying word consisting of
the fields full, chunk_alloc, and flush:

struct btrfs_space_info {
        struct btrfs_fs_info *     fs_info;              /*     0     8 */
        struct btrfs_space_info *  parent;               /*     8     8 */
        ...
        int                        clamp;                /*   172     4 */
        unsigned int               full:1;               /*   176: 0  4 */
        unsigned int               chunk_alloc:1;        /*   176: 1  4 */
        unsigned int               flush:1;              /*   176: 2  4 */
        ...

Therefore, to be safe from parallel read-modify-writes losing a write to
one of the bitfield members protected by a lock, all writes to all the
bitfields must use the lock. They almost universally do, except for
btrfs_clear_space_info_full() which iterates over the space_infos and
writes out found->full = 0 without a lock.

Imagine that we have one thread completing a transaction in which we
finished deleting a block_group and are thus calling
btrfs_clear_space_info_full() while simultaneously the data reclaim
ticket infrastructure is running do_async_reclaim_data_space():

          T1                                             T2
btrfs_commit_transaction
  btrfs_clear_space_info_full
  data_sinfo->full = 0
  READ: full:0, chunk_alloc:0, flush:1
                                              do_async_reclaim_data_space(data_sinfo)
                                              spin_lock(&space_info->lock);
                                              if(list_empty(tickets))
                                                space_info->flush = 0;
                                                READ: full: 0, chunk_alloc:0, flush:1
                                                MOD/WRITE: full: 0, chunk_alloc:0, flush:0
                                                spin_unlock(&space_info->lock);
                                                return;
  MOD/WRITE: full:0, chunk_alloc:0, flush:1

and now data_sinfo->flush is 1 but the reclaim worker has exited. This
breaks the invariant that flush is 0 iff there is no work queued or
running. Once this invariant is violated, future allocations that go
into __reserve_bytes() will add tickets to space_info->tickets but will
see space_info->flush is set to 1 and not queue the work. After this,
they will block forever on the resulting ticket, as it is now impossible
to kick the worker again.

I also confirmed by looking at the assembly of the affected kernel that
it is doing RMW operations. For example, to set the flush (3rd) bit to 0,
the assembly is:
  andb    $0xfb,0x60(%rbx)
and similarly for setting the full (1st) bit to 0:
  andb    $0xfe,-0x20(%rax)

So I think this is really a bug on practical systems.  I have observed
a number of systems in this exact state, but am currently unable to
reproduce it.

Rather than leaving this footgun lying around for the future, take
advantage of the fact that there is room in the struct anyway, and that
it is already quite large and simply change the three bitfield members to
bools. This avoids writes to space_info->full having any effect on
writes to space_info->flush, regardless of locking.
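
A rough illustration of the layout change (the struct names here are made up,
the real struct is btrfs_space_info):

```
/* Before: the three flags share one word, so an unlocked RMW of "full"
 * can clobber a locked, concurrent update of "flush". */
struct space_info_before {
	unsigned int full:1;
	unsigned int chunk_alloc:1;
	unsigned int flush:1;
};

/* After: each flag has its own byte, so a store to "full" can no longer
 * affect "flush", with or without the lock. */
struct space_info_after {
	bool full;
	bool chunk_alloc;
	bool flush;
};
```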

Fixes: 957780eb27 ("Btrfs: introduce ticketed enospc infrastructure")
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:38:49 +01:00
Rajeev Tapadia
745483ea98 btrfs: fix comment in alloc_bitmap() and drop stale TODO
All callers of alloc_bitmap() hold a transaction handle, so GFP_NOFS is
needed to avoid deadlocks on recursion. Update the comment and drop the
stale TODO.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Rajeev Tapadia <rtapadia730@gmail.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:37:37 +01:00
Miquel Sabaté Solà
725e462988 btrfs: fix double free of qgroup record after failure to add delayed ref head
In the previous code it was possible to run into a double kfree()
scenario when calling add_delayed_ref_head(). This could happen if the
record was reported to already exist by the
btrfs_qgroup_trace_extent_nolock() call, but an error then occurred
later in add_delayed_ref_head(). In this case, since
add_delayed_ref_head() returned an error, the caller went on to free the
record. Since add_delayed_ref_head() couldn't set this kfree'd pointer
to NULL, kfree() would then act on a non-NULL 'record' object that was
pointing to memory already freed by the callee.

The problem comes from the fact that the responsibility to kfree the
object lies with both the caller and the callee at the same time. Hence, the
fix is to shift the ownership of the 'qrecord' object out of
add_delayed_ref_head(). That is, we will never attempt to kfree()
the given object inside this function, and will expect the caller to
act on the 'qrecord' object on its own. The only exception, where the
'qrecord' object must not be kfree'd, is if it was inserted into the
tracing logic, for which we already have the 'qrecord_inserted_ret'
boolean. Hence, the caller has to kfree the object
only if add_delayed_ref_head() reports that it did not insert it into the
tracing logic.

As a side-effect of the above, we must guarantee that
'qrecord_inserted_ret' is properly initialized at the start of the
function, not at the end, and then set when an actual insert
happens. This way we avoid 'qrecord_inserted_ret' having an invalid
value on an early exit.

The documentation of add_delayed_ref_head() has also been updated
to reflect the exact ownership of the 'qrecord' object.
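
A minimal sketch of the resulting ownership rule, with simplified and
hypothetical types and signatures (the real add_delayed_ref_head() takes many
more arguments):

```
#include <linux/slab.h>

struct qrecord { int dummy; };

/* The callee never frees @record; it only reports via @inserted_ret
 * whether the record was handed over to the qgroup tracing logic. */
static int add_head(struct qrecord *record, bool *inserted_ret)
{
	*inserted_ret = false;	/* valid even on an early error exit */
	/* work that may fail would go here */
	*inserted_ret = true;	/* set only once the record is really inserted */
	return 0;
}

static int caller(struct qrecord *record)
{
	bool inserted;
	int ret = add_head(record, &inserted);

	if (!inserted)
		kfree(record);	/* caller owns the record unless it was inserted */
	return ret;
}
```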

Fixes: 6ef8fbce01 ("btrfs: fix missing error handling when adding delayed ref with qgroups enabled")
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Miquel Sabaté Solà <mssola@mssola.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:37:36 +01:00
David Sterba
2215e6b403 btrfs: subpage: rename macro variables to avoid shadowing
When compiling with -Wshadow there are warnings in the subpage helper
macros that are used in functions like btrfs_subpage_dump_bitmap() or
btrfs_subpage_clear_and_test_dirty() that also use 'bfs' (for struct
btrfs_folio_state) or blocks_per_folio.

Add '__' to the macro variables and unify naming in all subpage macros.
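
A generic illustration of the rename (not the actual btrfs subpage macros):

```
/* Before: the macro-local "tmp" shadows a caller variable of the same
 * name and triggers -Wshadow at every expansion site. */
#define SWAP_INTS(a, b)			\
	do {				\
		int tmp = (a);		\
		(a) = (b);		\
		(b) = tmp;		\
	} while (0)

/* After: a '__' prefix on the macro locals avoids the collision. */
#define SWAP_INTS_FIXED(a, b)		\
	do {				\
		int __tmp = (a);	\
		(a) = (b);		\
		(b) = __tmp;		\
	} while (0)
```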

Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:37:36 +01:00
Mehdi Ben Hadj Khelifa
2346b966c6 btrfs: refactor allocation size calculation in alloc_btrfs_io_context()
Use struct_size() to replace the open-coded calculation and remove the
comment, as the use of the helper is self-explanatory.
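
A generic illustration of the struct_size() pattern (the struct here is made
up, not the real btrfs_io_context):

```
#include <linux/overflow.h>
#include <linux/slab.h>

struct io_ctx {
	int nr;
	struct page *pages[];	/* flexible array member */
};

static struct io_ctx *alloc_io_ctx(int nr)
{
	/* Replaces the open-coded sizeof(*ctx) + nr * sizeof(ctx->pages[0])
	 * and adds overflow checking for free. */
	struct io_ctx *ctx = kzalloc(struct_size(ctx, pages, nr), GFP_KERNEL);

	if (ctx)
		ctx->nr = nr;
	return ctx;
}
```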

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Mehdi Ben Hadj Khelifa <mehdi.benhadjkhelifa@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:37:36 +01:00
David Sterba
aebe2bb0b8 btrfs: fix trivial -Wshadow warnings
When compiling with -Wshadow (also in 'make W=2' build) there are
several reports of shadowed variables that seem to be harmless:

- btrfs_do_encoded_write() - we can reuse 'ordered', there's no previous
			     value that would need to be preserved

- scrub_write_endio() - we need a standalone 'i' for bio iteration

- scrub_stripe() - duplicate ret2 for errors that must not overwrite 'ret'

- btrfs_subpage_set_writeback() - 'flags' is used for another irqsave lock
                                  but is not overwritten when reused for xarray
				  due to scoping, but for clarity let's rename it

- process_dir_items_leaf() - duplicate 'ret', used only for immediate checks

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:37:36 +01:00
David Sterba
9594783e4b btrfs: print-tree: use string format for key names
There's a warning when -Wformat=2 is used:

fs/btrfs/print-tree.c: In function ‘key_type_string’:
fs/btrfs/print-tree.c:424:17: warning: format not a string literal and no format arguments [-Wformat-nonliteral]
  424 |                 scnprintf(buf, buf_size, key_to_str[key->type]);

We're printing fixed strings from a table so there's no actual problem, but
let's fix the warning so we can enable it in fs/btrfs/.
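
A sketch of one way to silence the warning, following the names in the warning
text above (not necessarily the exact patch):

```
	/* Route the table entry through a constant "%s" format so the
	 * format string is a literal. */
	scnprintf(buf, buf_size, "%s", key_to_str[key->type]);
```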

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:37:36 +01:00
Qu Wenruo
4e700ac62a btrfs: remove unnecessary NULL fs_info check from find_lock_delalloc_range()
[STATIC CHECK REPORT]
Smatch is reporting that find_lock_delalloc_range() used to do a null
pointer check before accessing fs_info, but now we're accessing it for
sectorsize unconditionally.

[FALSE ALERT]
This is a false alert: the existing null pointer check was introduced in
commit f7b12a62f0 ("btrfs: replace BTRFS_MAX_EXTENT_SIZE with
fs_info->max_extent_size"), but way before that, commit 7c0260ee09
("btrfs: tests, require fs_info for root") already forced every
btrfs_root to have a correct fs_info pointer.

So there is no way that btrfs_root::fs_info is NULL.

[FIX]
Just remove the unnecessary NULL pointer check.

Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Fixes: f7b12a62f0 ("btrfs: replace BTRFS_MAX_EXTENT_SIZE with fs_info->max_extent_size")
Closes: https://lore.kernel.org/r/202509250925.4L4JQTtn-lkp@intel.com/
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:37:36 +01:00
Filipe Manana
69e293d28a btrfs: use single return value variable in btrfs_relocate_block_group()
We are using 'ret' and 'err' variables to track return values and errors,
which is a pattern that is error prone; we have had quite a few bugs due to
this pattern in the past.

Simplify this and use a single variable, named 'ret', to track errors and
the return value.

Also rename the variable 'rw' to 'bg_is_ro', which is a more meaningful name,
and change its type from int to bool.
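
A generic illustration of the single-variable pattern (the helpers here are
stand-ins, not the real relocation code):

```
#include <linux/errno.h>

static int step_one(void) { return 0; }
static int step_two(void) { return -EINVAL; }

/* One "ret" carries both intermediate errors and the final return value,
 * so it is impossible to return the wrong variable. */
static int do_work(void)
{
	int ret;

	ret = step_one();
	if (ret < 0)
		goto out;

	ret = step_two();
out:
	return ret;
}
```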

Reviewed-by: Boris Burkov <boris@bur.io>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 21:37:30 +01:00
Alexei Starovoitov
4617b3069a Merge branch 'ease-bpf-signing-build-requirements'
Alan Maguire says:

====================
Ease BPF signing build requirements

This series makes it easier to build bpftool and selftests with
signing support, removing the reliance on openssl >= v3 (now supporting
openssl v1) to build bpftool and no longer requiring the latest xxd to
build the verification cert header in selftests.

Changes since v1 [1]:

- Updated patch 2 to add symlink test_progs_verification_cert to .gitignore,
  EXTRA_CLEANFILES (AI review bot)
- Added acks to patch 1 (Song, Quentin)

[1] https://lore.kernel.org/bpf/20251114222249.30122-1-alan.maguire@oracle.com/
====================

Link: https://patch.msgid.link/20251120084754.640405-1-alan.maguire@oracle.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 10:00:16 -08:00
Alan Maguire
ad93ba0267 selftests/bpf: Allow selftests to build with older xxd
Currently selftests require xxd with the "-n <name>" option
which allows the user to specify a name not derived from
the input object path.  Instead of relying on this newer
feature, older xxd can be used if we link our desired name
("test_progs_verification_cert") to the input object.

Many distros ship xxd in vim-common package and do not have
the latest xxd with -n support.

Fixes: b720903e2b ("selftests/bpf: Enable signature verification for some lskel tests")
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Link: https://lore.kernel.org/r/20251120084754.640405-3-alan.maguire@oracle.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 10:00:16 -08:00
Alan Maguire
90ae54b4c7 bpftool: Allow bpftool to build with openssl < 3
ERR_get_error_all()[1] is an openssl v3 API, so to make the code
compatible with openssl v1 use ERR_get_error_line_data()
instead.  Since openssl is already a build requirement for
the kernel (minimum requirement openssl 1.0.0), this will
allow bpftool to compile where openssl v3 is not available.
Signing-related BPF selftests pass with openssl v1.

[1] https://docs.openssl.org/3.4/man3/ERR_get_error/

Fixes: 40863f4d6e ("bpftool: Add support for signing BPF programs")
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Song Liu <song@kernel.org>
Acked-by: Quentin Monnet <qmo@kernel.org>
Link: https://lore.kernel.org/r/20251120084754.640405-2-alan.maguire@oracle.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 10:00:16 -08:00
Christoph Hellwig
dcfa98bb5f xfs: move some code out of xfs_iget_recycle
Having a function drop locks, reacquire them and release them again
seems to confuse the clang lock analysis even more than it confuses
humans.  Keep humans and machines sane by moving a chunk of
code into the caller to simplify the lock tracking.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-24 18:53:10 +01:00
Alexei Starovoitov
acf8726466 Merge branch 'bpf-trampoline-support-jmp-mode'
Menglong Dong says:

====================
bpf trampoline support "jmp" mode

For now, the bpf trampoline is called with the "call" instruction. However,
this breaks the RSB and introduces extra overhead on the x86_64 arch.

For example, if we hook the function "foo" with fexit, the call and return
logic will look like this:
  call foo -> call trampoline -> call foo-body ->
  return foo-body -> return foo

As we can see above, there are 3 calls but only 2 returns, which breaks the
RSB balance. We can fake a "return" here, but it's not the best choice,
as it will still cause one RSB miss:
  call foo -> call trampoline -> call foo-body ->
  return foo-body -> return dummy -> return foo

The "return dummy" doesn't pair the "call trampoline", which can also
cause the RSB miss.

Therefore, we introduce the "jmp" mode for bpf trampoline, as advised by
Alexei in [1]. And the logic will become this:
  call foo -> jmp trampoline -> call foo-body ->
  return foo-body -> return foo

As we can see above, the RSB is totally balanced after this series.

In this series, we introduce the FTRACE_OPS_FL_JMP for ftrace to make it
use the "jmp" instruction instead of "call".

And we also make some adjustments to bpf_arch_text_poke() to allow us to
specify the old and new poke_type.

For the BPF_TRAMP_F_SHARE_IPMODIFY case, we will fall back to the "call"
mode, as it needs to get the function address from the stack, which is not
supported in "jmp" mode.

Before this series, we have the following performance with the bpf
benchmark:

  $ cd tools/testing/selftests/bpf
  $ ./benchs/run_bench_trigger.sh
  usermode-count :  890.171 ± 1.522M/s
  kernel-count   :  409.184 ± 0.330M/s
  syscall-count  :   26.792 ± 0.010M/s
  fentry         :  171.242 ± 0.322M/s
  fexit          :   80.544 ± 0.045M/s
  fmodret        :   78.301 ± 0.065M/s
  rawtp          :  192.906 ± 0.900M/s
  tp             :   81.883 ± 0.209M/s
  kprobe         :   52.029 ± 0.113M/s
  kprobe-multi   :   62.237 ± 0.060M/s
  kprobe-multi-all:    4.761 ± 0.014M/s
  kretprobe      :   23.779 ± 0.046M/s
  kretprobe-multi:   29.134 ± 0.012M/s
  kretprobe-multi-all:    3.822 ± 0.003M/s

And after this series, we have the following performance:

  usermode-count :  890.443 ± 0.307M/s
  kernel-count   :  416.139 ± 0.055M/s
  syscall-count  :   31.037 ± 0.813M/s
  fentry         :  169.549 ± 0.519M/s
  fexit          :  136.540 ± 0.518M/s
  fmodret        :  159.248 ± 0.188M/s
  rawtp          :  194.475 ± 0.144M/s
  tp             :   84.505 ± 0.041M/s
  kprobe         :   59.951 ± 0.071M/s
  kprobe-multi   :   63.153 ± 0.177M/s
  kprobe-multi-all:    4.699 ± 0.012M/s
  kretprobe      :   23.740 ± 0.015M/s
  kretprobe-multi:   29.301 ± 0.022M/s
  kretprobe-multi-all:    3.869 ± 0.005M/s

As we can see above, the performance of fexit increases from 80.544M/s to
136.540M/s, and "fmodret" increases from 78.301M/s to 159.248M/s.

Link: https://lore.kernel.org/bpf/20251117034906.32036-1-dongml2@chinatelecom.cn/
Changes since v2:
* reject if the addr is already "jmp" in register_ftrace_direct() and
  __modify_ftrace_direct() in the 1st patch.
* fix compile error in powerpc in the 5th patch.
* changes in the 6th patch:
  - fix the compile error by wrapping the write to tr->fops->flags with
    CONFIG_DYNAMIC_FTRACE_WITH_JMP
  - reset BPF_TRAMP_F_SKIP_FRAME when the second try of modify_fentry in
    bpf_trampoline_update()

Link: https://lore.kernel.org/bpf/20251114092450.172024-1-dongml2@chinatelecom.cn/
Changes since v1:
* change the bool parameter that we add to save_args() to "u32 flags"
* rename bpf_trampoline_need_jmp() to bpf_trampoline_use_jmp()
* add new function parameter to bpf_arch_text_poke instead of introduce
  bpf_arch_text_poke_type()
* rename bpf_text_poke to bpf_trampoline_update_fentry
* remove the BPF_TRAMP_F_JMPED and check the current mode with the origin
  flags instead.

Link: https://lore.kernel.org/bpf/CAADnVQLX54sVi1oaHrkSiLqjJaJdm3TQjoVrgU-LZimK6iDcSA@mail.gmail.com/[1]
====================

Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20251118123639.688444-1-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 09:47:11 -08:00
Menglong Dong
402e44b31e bpf: implement "jmp" mode for trampoline
Implement the "jmp" mode for the bpf trampoline. For the ftrace_managed
case, we need only to set the FTRACE_OPS_FL_JMP on the tr->fops if "jmp"
is needed.

For the bpf poke case, we will check the original poke type with the
"origin_flags", and the current poke type with "tr->flags". The function
bpf_trampoline_update_fentry() is introduced to do the job.

The "jmp" mode will only be enabled with CONFIG_DYNAMIC_FTRACE_WITH_JMP
enabled and BPF_TRAMP_F_SHARE_IPMODIFY is not set. With
BPF_TRAMP_F_SHARE_IPMODIFY, we need to get the origin call ip from the
stack, so we can't use the "jmp" mode.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20251118123639.688444-7-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 09:47:04 -08:00
Menglong Dong
ae4a3160d1 bpf: specify the old and new poke_type for bpf_arch_text_poke
In the original logic, bpf_arch_text_poke() assumes that the old and new
instructions have the same opcode. However, they can have different opcodes
if we want to replace a "call" insn with a "jmp" insn.

Therefore, add the new function parameter "old_t" along with "new_t",
which are used to indicate the old and new poke types. Meanwhile, adjust
the implementation of bpf_arch_text_poke() for all the archs.

"BPF_MOD_NOP" is added to make the code more readable. In
bpf_arch_text_poke(), we still check whether the new and old addresses are
NULL to determine if a nop insn should be used, which I think is safer.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20251118123639.688444-6-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 09:47:03 -08:00
Menglong Dong
373f2f44c3 bpf,x86: adjust the "jmp" mode for bpf trampoline
In the origin call case, if BPF_TRAMP_F_SKIP_FRAME is not set, it means
that the trampoline is not entered with a "call" but with a "jmp".

Introduce the function bpf_trampoline_use_jmp() to check if the trampoline
is in "jmp" mode.

Make some adjustments to the "jmp" mode for x86_64. The main adjustment
is for the stack parameter passing case, as the stack
alignment logic changes in "jmp" mode without the "rip" on the stack.
What's more, the location of the parameters on the stack also changes.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20251118123639.688444-5-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 09:47:03 -08:00
Menglong Dong
47c9214dcb bpf: fix the usage of BPF_TRAMP_F_SKIP_FRAME
Some places calculate the origin_call by checking if
BPF_TRAMP_F_SKIP_FRAME is set. However, they should use
BPF_TRAMP_F_ORIG_STACK for this purpose. Just fix them.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/r/20251118123639.688444-4-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 09:47:03 -08:00
Menglong Dong
0c3772a8db x86/ftrace: Implement DYNAMIC_FTRACE_WITH_JMP
Implement the DYNAMIC_FTRACE_WITH_JMP for x86_64. In ftrace_call_replace,
we will use JMP32_INSN_OPCODE instead of CALL_INSN_OPCODE if the address
should use "jmp".

Meanwhile, adjust the direct call in the ftrace_regs_caller. The RSB is
balanced in the "jmp" mode. Take the function "foo" for example:

 original_caller:
 call foo -> foo:
         call fentry -> fentry:
                 [do ftrace callbacks ]
                 move tramp_addr to stack
                 RET -> tramp_addr
                         tramp_addr:
                         [..]
                         call foo_body -> foo_body:
                                 [..]
                                 RET -> back to tramp_addr
                         [..]
                         RET -> back to original_caller

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20251118123639.688444-3-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 09:46:37 -08:00
Menglong Dong
25e4e3565d ftrace: Introduce FTRACE_OPS_FL_JMP
For now, the "nop" will be replaced with a "call" instruction when a
function is hooked by the ftrace. However, sometimes the "call" can break
the RSB and introduce extra overhead. Therefore, introduce the flag
FTRACE_OPS_FL_JMP, which indicate that the ftrace_ops should be called
with a "jmp" instead of "call". For now, it is only used by the direct
call case.

When a direct ftrace_ops is marked with FTRACE_OPS_FL_JMP, the last bit of
the ops->direct_call will be set to 1. Therefore, we can tell if we should
use "jmp" for the callback in ftrace_call_replace().

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20251118123639.688444-2-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 09:46:24 -08:00
Anton Protopopov
fad804002e bpf: cleanup aux->used_maps after jit
In commit b4ce5923e7 ("bpf, x86: add new map type: instructions array")
env->used_map was copied to func[i]->aux->used_maps before jitting.
Clear these fields out after jitting so that pointers to freed memory
(env->used_maps is freed later) are not kept in a live data structure.

The reason why the copies were initially added is explained in
https://lore.kernel.org/bpf/20251105090410.1250500-1-a.s.protopopov@gmail.com

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Fixes: b4ce5923e7 ("bpf, x86: add new map type: instructions array")
Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20251124151515.2543403-1-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-24 09:39:55 -08:00
Boris Burkov
828ec765f7 btrfs: ignore ENOMEM from alloc_bitmap()
btrfs_convert_free_space_to_bitmaps() and
btrfs_convert_free_space_to_extents() both allocate a bitmap struct
with:

        bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
        bitmap = alloc_bitmap(bitmap_size);
        if (!bitmap) {
                ret = -ENOMEM;
                btrfs_abort_transaction(trans);
                return ret;
        }

This conversion is done based on a heuristic and the check triggers each
time we call update_free_space_extent_count() on a block group (each
time we add/remove an extent or modify a bitmap). Furthermore, nothing
relies on maintaining some invariant of bitmap density, it's just an
optimization for space usage. Therefore, it is safe to simply ignore
any memory allocation errors that occur, rather than aborting the
transaction and leaving the fs read only.
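
A sketch of the resulting behaviour (simplified; the actual change may differ
in detail):

```
	bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
	bitmap = alloc_bitmap(bitmap_size);
	if (!bitmap)
		return 0;	/* skip the optional conversion instead of aborting */
```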

Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-24 18:29:28 +01:00
Zilin Guan
53d1548612 mt76: mt7615: Fix memory leak in mt7615_mcu_wtbl_sta_add()
In mt7615_mcu_wtbl_sta_add(), an skb sskb is allocated. If the
subsequent call to mt76_connac_mcu_alloc_wtbl_req() fails, the function
returns an error without freeing sskb, leading to a memory leak.

Fix this by calling dev_kfree_skb() on sskb in the error handling path
to ensure it is properly released.

Fixes: 99c457d902 ("mt76: mt7615: move mt7615_mcu_set_bmc to mt7615_mcu_ops")
Signed-off-by: Zilin Guan <zilin@seu.edu.cn>
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251113062415.103611-1-zilin@seu.edu.cn
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 15:01:54 +01:00
Quan Zhou
066f417be5 wifi: mt76: mt792x: fix wifi init fail by setting MCU_RUNNING after CLC load
Set the MT76_STATE_MCU_RUNNING bit only after mt7921_load_clc()
has successfully completed. Previously, the MCU_RUNNING state
was set before loading the CLC, which could cause a conflict between
the chip mcu_init retry and the mac_reset flow, resulting in chip init
failure and an abnormal chip status. By setting the state after the CLC
load, firmware initialization becomes robust and the init failure is resolved.

Signed-off-by: Quan Zhou <quan.zhou@mediatek.com>
Reviewed-by: druth@chromium.org
Link: https://patch.msgid.link/19ec8e4465142e774f17801025accd0ae2214092.1763465933.git.quan.zhou@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 15:01:30 +01:00
Mario Limonciello (AMD)
f804a5895e wifi: mt76: Strip whitespace from build date
On systems I have with mt7925 cards I've been noticing a blank line in my
kernel logs.  IE:
```
[   17.294105] mt7925e 0000:c3:00.0: HW/SW Version: 0x8a108a10, Build Time: 20250721232852a

[   17.314233] r8169 0000:c4:00.0 enp196s0f0: Link is Down
```

This is because the build_date from the header has a newline character
as does the dev_info() print.  As the firmware isn't guaranteed to always
have a newline but the print is, copy the firmware build date to a
temporary variable and strip any whitespace from it before showing it in
the logs.
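
A sketch of the approach, with hypothetical field and buffer names (strim()
comes from <linux/string.h>):

```
	char date[sizeof(hdr->build_date) + 1];

	/* Copy the fixed-size, possibly newline-terminated firmware string
	 * into a NUL-terminated scratch buffer and trim it before printing. */
	memcpy(date, hdr->build_date, sizeof(hdr->build_date));
	date[sizeof(hdr->build_date)] = '\0';
	dev_info(dev, "Build Time: %s\n", strim(date));
```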

Signed-off-by: Mario Limonciello (AMD) <superm1@kernel.org>
Link: https://patch.msgid.link/20251120155829.3494747-1-superm1@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:13 +01:00
Lorenzo Bianconi
7545551631 wifi: mt76: mt7996: Add missing locking in mt7996_mac_sta_rc_work()
Grab the mt76 mutex when running mt7996_mac_sta_rc_work() since it is
required by the mt7996_mcu_add_rate_ctrl routine.

Fixes: 28d519d0d4 ("wifi: mt76: Move RCU section in mt7996_mcu_add_rate_ctrl_fixed()")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251118-mt7996-rc-work-missing-mtx-v1-1-0739c493a6cb@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:13 +01:00
Lorenzo Bianconi
2a432a6d00 wifi: mt76: mt7996: skip ieee80211_iter_keys() on scanning link remove
The mt7996_vif_link_remove routine is executed by mt76_scan_complete()
without holding the wiphy mutex, triggering the following lockdep warning.

 WARNING: CPU: 0 PID: 72 at net/mac80211/key.c:1029 ieee80211_iter_keys+0xe4/0x1a0 [mac80211]
 CPU: 0 UID: 0 PID: 72 Comm: kworker/u32:2 Tainted: G S                  6.18.0-rc5+ #27 PREEMPT(full)
 Tainted: [S]=CPU_OUT_OF_SPEC
 Hardware name: Default string Default string/SKYBAY, BIOS 5.12 02/15/2023
 Workqueue: phy3 mt76_scan_work [mt76]
 RIP: 0010:ieee80211_iter_keys+0xe4/0x1a0 [mac80211]
 Code: 4c 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 8b 47 48 be ff ff ff ff 48 8d 78 68 e8 b4 eb 1e e1 85 c0 0f 85 49
ff ff ff 4c 8b ab 90 1a 00 00 48 8d 83 90
 RSP: 0018:ffffc900002f7cb0 EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffff888127e00ee0 RCX: 0000000000000000
 RDX: 0000000000000000 RSI: ffff888127e00788 RDI: ffff88811132b5c8
 RBP: ffffffffa0ddf400 R08: 0000000000000001 R09: 000000009dcc1dac
 R10: 0000000000000001 R11: ffff88811132b5a0 R12: ffffc900002f7d00
 R13: ffff8882581e6a80 R14: ffff888127e0afc8 R15: ffff888158832038
 FS:  0000000000000000(0000) GS:ffff8884da486000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000030a0fd90 CR3: 0000000002c52004 CR4: 00000000003706f0
 Call Trace:
  <TASK>
  ? lock_acquire+0xc2/0x2c0
  mt7996_vif_link_remove+0x64/0x2b0 [mt7996e]
  mt76_put_vif_phy_link+0x41/0x50 [mt76]
  mt76_scan_complete+0x77/0x100 [mt76]
  mt76_scan_work+0x2eb/0x3f0 [mt76]
  ? process_one_work+0x1e5/0x6d0
  process_one_work+0x221/0x6d0
  worker_thread+0x19a/0x340
  ? rescuer_thread+0x450/0x450
  kthread+0x108/0x220
  ? kthreads_online_cpu+0x110/0x110
  ret_from_fork+0x1c6/0x220
  ? kthreads_online_cpu+0x110/0x110
  ret_from_fork_asm+0x11/0x20
  </TASK>
 irq event stamp: 45471
 hardirqs last  enabled at (45477): [<ffffffff813d446e>] __up_console_sem+0x5e/0x70
 hardirqs last disabled at (45482): [<ffffffff813d4453>] __up_console_sem+0x43/0x70
 softirqs last  enabled at (44500): [<ffffffff81f2ae0c>] napi_pp_put_page+0xac/0xd0
 softirqs last disabled at (44498): [<ffffffff81fa32a0>] page_pool_put_unrefed_netmem+0x290/0x3d0
 ---[ end trace 0000000000000000 ]---

Fix the issue by skipping ieee80211_iter_keys() for scanning links in
the mt7996_vif_link_remove routine since we have not uploaded any hw keys
for these links.

Fixes: 04414d7bba ("wifi: mt76: mt7996: delete vif keys when requested")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Tested-by: Ben Greear <greearb@candelatech.com>
Link: https://patch.msgid.link/20251115-mt7996-key-iter-link-remove-fix-v1-1-4f3f4e1eaa78@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:13 +01:00
Lorenzo Bianconi
4fe823b9ee wifi: mt76: mt7996: skip deflink accounting for offchannel links
Do not take into account offchannel links for deflink accounting.

Fixes: a3316d2fc6 ("wifi: mt76: mt7996: set vif default link_id adding/removing vif links")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Tested-by: Ben Greear <greearb@candelatech.com>
Link: https://patch.msgid.link/20251114-mt76-fix-missing-mtx-v1-4-259ebf11f654@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:13 +01:00
Lorenzo Bianconi
6aaaaeacf1 wifi: mt76: Move mt76_abort_scan out of mt76_reset_device()
Move the mt76_abort_scan routine out of mt76_reset_device() in order to
avoid a possible deadlock, since the mt76_reset_device routine runs
with the mt76 mutex held and mt76_abort_scan_complete() can grab the mt76
mutex in some cases.

Fixes: b36d556102 ("wifi: mt76: abort scan/roc on hw restart")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Tested-by: Ben Greear <greearb@candelatech.com>
Link: https://patch.msgid.link/20251114-mt76-fix-missing-mtx-v1-3-259ebf11f654@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:13 +01:00
Lorenzo Bianconi
a84b172cca wifi: mt76: mt7996: move mt7996_update_beacons under mt76 mutex
Move mt7996_update_beacons routine inside mt76 mutex critical section
in mt7996_mac_reset_work() in order to run mt7996_vif_conf_link() in
mt7996_mcu_add_beacon routine.

Fixes: f30906c55a ("wifi: mt76: mt7996: disable beacons when going offchannel")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Tested-by: Ben Greear <greearb@candelatech.com>
Link: https://patch.msgid.link/20251114-mt76-fix-missing-mtx-v1-2-259ebf11f654@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:12 +01:00
Lorenzo Bianconi
5a4bcba26e wifi: mt76: mt7996: grab mt76 mutex in mt7996_mac_sta_event()
Grab mt76 mutex in mt7996_mac_sta_event routine in order to rely on
mt76_dereference() utility macro.

Fixes: ecd72f9695 ("wifi: mt76: mt7996: Support MLO in mt7996_mac_sta_event()")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Tested-by: Ben Greear <greearb@candelatech.com>
Link: https://patch.msgid.link/20251114-mt76-fix-missing-mtx-v1-1-259ebf11f654@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:12 +01:00
Michael Lo
2ccbea08dd wifi: mt76: mt7925: ensure the 6GHz A-MPDU density cap from the hardware.
Set the 6GHz HE A-MPDU density from the hardware capability instead of a
hardcoded value, ensuring accurate capability reporting.

Signed-off-by: Michael Lo <michael.lo@mediatek.com>
Signed-off-by: Ming Yen Hsieh <mingyen.hsieh@mediatek.com>
Link: https://patch.msgid.link/20251106092151.1061648-1-mingyen.hsieh@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:12 +01:00
Shayne Chen
a4031fec9d wifi: mt76: mt7996: fix EMI rings for RRO
The RRO EMI rings only need to be allocated when WED is not active.
This patch fixes a command timeout issue for the configuration with WED
off and RRO on.

Fixes: 3a29164425 ("wifi: mt76: mt7996: Add SW path for HW-RRO v3.1")
Co-developed-by: Rex Lu <rex.lu@mediatek.com>
Signed-off-by: Rex Lu <rex.lu@mediatek.com>
Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251106064203.1000505-12-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:12 +01:00
Shayne Chen
f1e9f369ae wifi: mt76: mt7996: fix using wrong phy to start in mt7996_mac_restart()
Pass the correct mt7996_phy to mt7996_run().

Fixes: 0a5df0ec47 ("wifi: mt76: mt7996: remove redundant per-phy mac80211 calls during restart")
Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251106064203.1000505-11-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:12 +01:00
Shayne Chen
e11be918d9 wifi: mt76: mt7996: fix MLO set key and group key issues
This patch fixes the following key issues:
- Pass correct link BSS to mt7996_mcu_add_key(), and use HW beacon
  protection mode for mt7990 chipset
- Do not do group key deletion for GTK and IGTK due to FW design, the
  delete key command will delete all group keys of a link BSS
- For deleting BIGTK, FW adds a new flow, but the "sec->add" field
  should be filled with "SET_KEY". Note that if BIGTK is not deleted, it
  will cause beacon decryption issue when switching from an AP interface
  to a station interface

Fixes: 0c45d52276 ("wifi: mt76: mt7996: fix setting beacon protection keys")
Co-developed-by: Allen Ye <allen.ye@mediatek.com>
Signed-off-by: Allen Ye <allen.ye@mediatek.com>
Co-developed-by: Peter Chiu <chui-hao.chiu@mediatek.com>
Signed-off-by: Peter Chiu <chui-hao.chiu@mediatek.com>
Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Link: https://patch.msgid.link/20251106064203.1000505-10-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:12 +01:00
Shayne Chen
4fb3b4e7d1 wifi: mt76: mt7996: fix MLD group index assignment
Fix extender mode and MBSS issues caused by incorrect assignment of the
MLD group and remap indices.

Fixes: ed01c310ec ("wifi: mt76: mt7996: Fix mt7996_mcu_bss_mld_tlv routine")
Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251106064203.1000505-9-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:12 +01:00
Shayne Chen
85cd5534a3 wifi: mt76: mt7996: use correct link_id when filling TXD and TXP
Obtain the correct link ID and, if needed, switch to the corresponding
wcid before populating the TX descriptor and TX payload.

Rules for link id:
- For QoS data of MLD peers (excluding EAPOL), select the primary or
  secondary wcid based on whether the TID is odd or even to meet FW/HW
  requirements
- For other packets, use IEEE80211_TX_CTRL_MLO_LINK if specified
  (such as multicast and broadcast packets)

Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251106064203.1000505-8-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:12 +01:00
Shayne Chen
7eaea3a8ba wifi: mt76: mt7996: set link_valid field when initializing wcid
This ensures the upper layer uses the correct link ID during packet
processing.

Fixes: dd82a9e02c ("wifi: mt76: mt7996: Rely on mt7996_sta_link in sta_add/sta_remove callbacks")
Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251106064203.1000505-7-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:12 +01:00
Shayne Chen
e077071e7a wifi: mt76: mt7996: fix teardown command for an MLD peer
For an MLD peer, we only need to call the teardown command when removing
the last link, and there's no need to call mt7996_mcu_add_sta() for the
earlier links.

Fixes: c1d6dd5d03 ("wifi: mt76: mt7996: Add mt7996_mcu_teardown_mld_sta rouine")
Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251106064203.1000505-6-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:12 +01:00
Shayne Chen
bb705a6067 wifi: mt76: mt7996: fix several fields in mt7996_mcu_bss_basic_tlv()
Fix several fields in mt7996_mcu_bss_basic_tlv() that were not obtained
from the correct link. Without this patch, the MLD station interface
does not function properly.

Fixes: 34a41bfbcb ("wifi: mt76: mt7996: prepare mt7996_mcu_add_dev/bss_info for MLO support")
Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251106064203.1000505-5-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:12 +01:00
Shayne Chen
feb06d4556 wifi: mt76: mt7996: support fixed rate for link station
Introduce mt7996_link_sta_add_debugfs() to extend fixed rate support for
MLO link station.

Co-developed-by: Howard Hsu <howard-yh.hsu@mediatek.com>
Signed-off-by: Howard Hsu <howard-yh.hsu@mediatek.com>
Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Link: https://patch.msgid.link/20251106064203.1000505-4-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:11 +01:00
Howard Hsu
5d86765828 wifi: mt76: mt7996: fix implicit beamforming support for mt7992
Fix the ibf_timeout field for the mt7996, mt7992 and mt7990 chipsets. For
the mt7992, this value shall be set to 0xff, while for the others it shall
be set to 0x18.

Fixes: ad4c9a8a98 ("wifi: mt76: mt7996: add implicit beamforming support for mt7992")
Signed-off-by: Howard Hsu <howard-yh.hsu@mediatek.com>
Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Link: https://patch.msgid.link/20251106064203.1000505-3-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:11 +01:00
Peter Chiu
dab5b20254 wifi: mt76: mt7996: no need to wait ACK event for SDO command
For the SDO unified command, the driver does not need to wait for an ACK
event, so do not set MCU_CMD_ACK in the command header.

Signed-off-by: Peter Chiu <chui-hao.chiu@mediatek.com>
Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Link: https://patch.msgid.link/20251106064203.1000505-2-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:11 +01:00
StanleyYP Wang
361b59b6be wifi: mt76: mt7996: fix max nss value when getting rx chainmask
Since wiphy->available_antennas_tx now accumulates the chainmask of all
the radios of a wiphy, use phy->orig_antenna_mask to get the original
max nss for comparison.

Fixes: 69d54ce749 ("wifi: mt76: mt7996: switch to single multi-radio wiphy")
Signed-off-by: StanleyYP Wang <StanleyYP.Wang@mediatek.com>
Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Link: https://patch.msgid.link/20251106064203.1000505-1-shayne.chen@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:11 +01:00
Jack Kao
2a035ae206 wifi: mt76: mt7925: cqm rssi low/high event notify
The implementation amounts to setting the driver flag
IEEE80211_VIF_SUPPORTS_CQM_RSSI and then providing
mechanisms for continuously updating enough information
to be able to send notifications to userspace when the
RSSI drops below a certain threshold.

Signed-off-by: Jack Kao <jack.kao@mediatek.com>
Signed-off-by: Ming Yen Hsieh <mingyen.hsieh@mediatek.com>
Link: https://patch.msgid.link/20251001012506.2168037-1-mingyen.hsieh@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:59:11 +01:00
Marco Crivellari
ee518f914c wifi: mt76: replace use of system_wq with system_percpu_wq
Currently, if a user enqueues a work item using schedule_delayed_work(), the
wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work(), which uses system_wq, and queue_work(), which again makes
use of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the API.
For more details see the Link tag below.

This continues the effort to refactor workqueue APIs, which began with
the introduction of new workqueues and a new alloc_workqueue flag in:

commit 128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

Switch to using system_percpu_wq because system_wq is going away as part of
a workqueue restructuring.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Link: https://patch.msgid.link/20250922093243.140946-2-marco.crivellari@suse.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:58:52 +01:00
Fedor Pchelkin
5ef6de7bad wifi: mt76: adjust BSS conf pointer handling
Passing a BSS conf pointer to mt76_connac2_mac_tx_rate_val() currently has
two inconsistencies.

When called from mt76_connac2_mac_write_txwi(), the BSS conf is obtained by
dereferencing a vif pointer.  A NULL vif isn't accounted for, even though the
function itself allows it to be NULL and tries to handle that case in
earlier checks.  This looks like a cosmetic change since the drivers
calling the function (namely, mt7915 and mt7921) set the WANT_MONITOR_VIF
flag, so judging by the info->control.vif initialization inside the
ieee80211_tx_*() routines it can't actually come in as NULL here.

The same holds for the BSS conf pointer handling inside
mt76_connac2_mac_tx_rate_val().  It is dereferenced before being checked
for NULL.  The function is supposed to handle that case, so reorder the
check and the dereference of the pointer.  Again, this looks like a cosmetic
issue only.

Found by Linux Verification Center (linuxtesting.org) with SVACE static
analysis tool.

Co-developed-by: Matvey Kovalev <matvey.kovalev@ispras.ru>
Signed-off-by: Matvey Kovalev <matvey.kovalev@ispras.ru>
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Link: https://patch.msgid.link/20251027111843.38975-2-pchelkin@ispras.ru
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:55 +01:00
Fedor Pchelkin
cdb2941a51 Revert "wifi: mt76: mt792x: improve monitor interface handling"
This reverts commit 55e95ce469.

mt792x drivers don't seem to support multi-radio devices yet.  At least
they don't mess with `struct wiphy_radio` at the moment.

Packet capturing on monitor interface doesn't work after the blamed patch:

  tcpdump -i wls6mon -n -vvv

Revert the NO_VIRTUAL_MONITOR feature for now to resolve the issue.

Found by Linux Verification Center (linuxtesting.org).

Fixes: 55e95ce469 ("wifi: mt76: mt792x: improve monitor interface handling")
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Link: https://patch.msgid.link/20251027111843.38975-1-pchelkin@ispras.ru
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:55 +01:00
Caleb James DeLisle
2df00805f7 wifi: mt76: mmio_*_copy fix byte order and alignment
Update functions which copy to and from MMIO to load bytes as Little
Endian, and also support unaligned buffers.

PCI devices almost universally use Little Endian ordering for MMIO
registers, mt76 is no exception. PCI hardware that is designed to work
with Big Endian CPUs often (but not always) "helps" by transparently
byte-swapping MMIO reads and writes on the wire. If this is enabled
then it cannot be turned off for a single write. On hardware which does
not support this, writel() does the swap in software. When we are
transferring arbitrary bytes to MMIO space, we need them to arrive in
the same order they were in memory, so when the hardware swaps them
this is a problem. Rather than care about how our PCI host controller
works, we instead load bytes as Little Endian - so on a Big Endian
machine this will reverse them, then we use writel() which will put
them back in the right order again. This way we do not make it our
business whether the swapping is done in software or hardware.

Furthermore, inspection of the code shows that these functions are
often called with stack-allocated u8 arrays which have no alignment
guarantees so we now use (get|put)_unaligned_le32().
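
A minimal sketch of the copy direction described above (names are illustrative,
not the real mt76 helpers), assuming a length that is a multiple of 4:

```
#include <linux/io.h>
#include <linux/unaligned.h>

static void toy_mmio_copy(void __iomem *dst, const void *src, int len)
{
	int i;

	/* Load bytes from the (possibly unaligned) buffer as little endian;
	 * writel() then undoes any byte swap the platform applies, so the
	 * device always sees the bytes in memory order. */
	for (i = 0; i < len; i += 4)
		writel(get_unaligned_le32(src + i), dst + i);
}
```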

Fixes this issue:
mt76x2e 0000:02:00.0: ROM patch build: 20141115060606a
mt76x2e 0000:02:00.0: Firmware Version: 0.0.00
mt76x2e 0000:02:00.0: Build: 1
mt76x2e 0000:02:00.0: Build Time: 201607111443____
mt76x2e 0000:02:00.0: Firmware failed to start
mt76x2e 0000:02:00.0: probe with driver mt76x2e failed with error -145

Tested on:
SmartFiber XP8421-B (Big Endian MIPS 34Kc)
  - MT7612 -> 5g / ap / psk2
  - MT7603 -> 2g / sta / psk2
  - MT7603 -> 2g / ap / psk2
TpLink Archer v1200v-v2 (Big Endian MIPS 34Kc)
  - MT7613 -> 5g / ap / psk2
  - MT7603 -> 2g / sta / psk2

Signed-off-by: Caleb James DeLisle <cjd@cjdns.fr>
Link: https://patch.msgid.link/20251029184143.3991388-1-cjd@cjdns.fr
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:55 +01:00
Ming Yen Hsieh
992c304112 wifi: mt76: mt7925: disable auto regd changes after user set
Add regd_user flag to block automatic regulatory domain updates
if set by user.

Signed-off-by: Ming Yen Hsieh <mingyen.hsieh@mediatek.com>
Link: https://patch.msgid.link/20251031090352.1400079-7-mingyen.hsieh@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:55 +01:00
Ming Yen Hsieh
3bc62aa448 wifi: mt76: mt7925: add auto regdomain switch support
Implement 802.11d-based automatic regulatory domain switching to
dynamically determine the regulatory domain at runtime.

Signed-off-by: Ming Yen Hsieh <mingyen.hsieh@mediatek.com>
Link: https://patch.msgid.link/20251031090352.1400079-6-mingyen.hsieh@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:55 +01:00
Ming Yen Hsieh
6338709a4f wifi: mt76: mt7925: improve EHT capability control in regulatory flow
Move EHT flag handling into mt7925_regd_channel_update() to ensure
correct channel capability reporting.

Signed-off-by: Ming Yen Hsieh <mingyen.hsieh@mediatek.com>
Link: https://patch.msgid.link/20251031090352.1400079-5-mingyen.hsieh@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:55 +01:00
Ming Yen Hsieh
3305100859 wifi: mt76: mt7925: refactor regulatory notifier flow
Rename mt7925_regd_update() to mt7925_mcu_regd_update() to centralize
regd updates with error handling.

Signed-off-by: Ming Yen Hsieh <mingyen.hsieh@mediatek.com>
Link: https://patch.msgid.link/20251031090352.1400079-4-mingyen.hsieh@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:55 +01:00
Ming Yen Hsieh
e323b84127 wifi: mt76: mt7925: refactor CLC support check flow
Move the disable_clc module parameter to regd.c and introduce
mt7925_regd_clc_supported() to centralize CLC support checks.

Signed-off-by: Ming Yen Hsieh <mingyen.hsieh@mediatek.com>
Link: https://patch.msgid.link/20251031090352.1400079-3-mingyen.hsieh@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:55 +01:00
Ming Yen Hsieh
87c3941270 wifi: mt76: mt7925: refactor regulatory domain handling to regd.[ch]
Move regd logic to regd.c and regd.h files

Signed-off-by: Ming Yen Hsieh <mingyen.hsieh@mediatek.com>
Link: https://patch.msgid.link/20251031090352.1400079-2-mingyen.hsieh@mediatek.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:55 +01:00
Rob Herring (Arm)
c1d8beea63 wifi: mt76: Use of_reserved_mem_region_to_resource() for "memory-region"
Use the newly added of_reserved_mem_region_to_resource() function to
handle "memory-region" properties.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://patch.msgid.link/20251031175113.1453100-1-robh@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:55 +01:00
Alok Tiwari
9ba77f1a63 wifi: mt76: mt7996: fix typos in comments
Fix two minor comment typos in the mt7996 driver:
- "Tx/Tx" -> "Rx/Tx"
- "tnterrupt" -> "interrupt"

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Link: https://patch.msgid.link/20251021064812.1778297-1-alok.a.tiwari@oracle.com
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:54 +01:00
Lorenzo Bianconi
377aa17d2a wifi: mt76: mt7996: Add NPU offload support to MT7996 driver
Introduce Airoha NPU support to MT7996 driver. NPU is used to enable
traffic forward offloading between the MT76 NIC and the Airoha ethernet one
available on the Airoha EN7581 SoC using Netfilter Flowtable APIs.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-mt76-npu-devel-v2-5-ddaa90901723@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:54 +01:00
Lorenzo Bianconi
7fb554b1b6 wifi: mt76: Introduce the NPU generic layer
Add the NPU generic layer in mt76 module. NPU will be used to enable
traffic forward offloading between the MT76 NIC and the Airoha ethernet one
available on the Airoha EN7581 SoC using Netfilter Flowtable APIs.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-mt76-npu-devel-v2-4-ddaa90901723@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:54 +01:00
Lorenzo Bianconi
f7632a7fdd wifi: mt76: Add the capability to set TX token start ID
This is a preliminary patch to enable traffic forward offloading via the
Airoha NPU module available on the Airoha EN7581 SoC.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-mt76-npu-devel-v2-3-ddaa90901723@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:54 +01:00
Lorenzo Bianconi
a7fb9aac3e wifi: mt76: Add mt76_dev pointer in mt76_queue struct.
This is a preliminary patch to enable traffic forward offloading via the
Airoha NPU module available on the Airoha EN7581 SoC.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-mt76-npu-devel-v2-2-ddaa90901723@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:54 +01:00
Lorenzo Bianconi
e627439aec wifi: mt76: Move Q_READ/Q_WRITE definitions in dma.h
This is a preliminary patch to enable traffic forward offloading between
the MT76 NIC and the Airoha ethernet one via the Airoha NPU module
available on the Airoha EN7581 SoC.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-mt76-npu-devel-v2-1-ddaa90901723@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:54 +01:00
Lorenzo Bianconi
2157e49892 wifi: mt76: mt7996: Remove useless check in mt7996_msdu_page_get_from_cache()
Get rid of the useless null-pointer check in mt7996_msdu_page_get_from_cache()
since we have already verified that the list is not empty.

Fixes: b1e58e137b ("wifi: mt76: mt7996: Introduce RRO MSDU callbacks")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/r/202510100155.MS0IXhzm-lkp@intel.com/
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251014-mt7996_msdu_page_get_from_cache-remove-null-ptr-check-v1-1-fbeb7881e192@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:54 +01:00
Felix Fietkau
a96fed2825 wifi: mt76: relicense to BSD-3-Clause-Clear
MediaTek has asked to switch from the ISC license to BSD-3-Clause-Clear,
in order to improve clarity and the legal integrity of the code.

The BSD-3-Clause license includes the "no endorsement" clause, which is
important for protecting the reputation of the original authors and
contributors by preventing unauthorized use of their names for endorsement
purposes.

This clause is absent in the BSD-2-Clause license, which is more permissive
but lacks this specific protection.

This change also cleans up the license of some Kconfig/Makefile files,
which were accidentally marked as GPL.

The GPL 2.0 remains in use on mt76x0, as well as two source files in mt7615
for which the license situation still needs to be clarified.

Link: https://patch.msgid.link/20251008104250.46292-2-nbd@nbd.name
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:54 +01:00
Felix Fietkau
909675fd43 wifi: mt76: fix license/copyright of util.h
The extra copyright line for Ivo van Doorn and GPL license was only there
because of code that I had already removed before the initial upstream
submission of mt76.
Remove it and make this header file use ISC license like the rest of the
source files.

Link: https://patch.msgid.link/20251008104250.46292-1-nbd@nbd.name
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:54 +01:00
Peter Chiu
a9730354ca wifi: mt76: use GFP_DMA32 for page_pool buffer allocation
Set GFP_DMA32 flag for page_pool buffers allocation since the hw relies
on 32-bit DMA addresses for WED offloading.

Tested-by: Daniel Pawlik <pawlik.dan@gmail.com>
Tested-by: Matteo Croce <teknoraver@meta.com>
Signed-off-by: Peter Chiu <chui-hao.chiu@mediatek.com>
Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251008-wed-fixes-v1-2-8f7678583385@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:53 +01:00
Lorenzo Bianconi
385aab8fcc wifi: mt76: wed: use proper wed reference in mt76 wed driver callbacks
The MT7996 driver can use both the wed and wed_hif2 devices to offload traffic
from/to the wireless NIC. In the current codebase we always assume the
primary wed device in the wed callbacks, resulting in the following
crash if the hw runs wed_hif2 (e.g. a 6GHz link).

[  297.455876] Unable to handle kernel read from unreadable memory at virtual address 000000000000080a
[  297.464928] Mem abort info:
[  297.467722]   ESR = 0x0000000096000005
[  297.471461]   EC = 0x25: DABT (current EL), IL = 32 bits
[  297.476766]   SET = 0, FnV = 0
[  297.479809]   EA = 0, S1PTW = 0
[  297.482940]   FSC = 0x05: level 1 translation fault
[  297.487809] Data abort info:
[  297.490679]   ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
[  297.496156]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[  297.501196]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[  297.506500] user pgtable: 4k pages, 39-bit VAs, pgdp=0000000107480000
[  297.512927] [000000000000080a] pgd=08000001097fb003, p4d=08000001097fb003, pud=08000001097fb003, pmd=0000000000000000
[  297.523532] Internal error: Oops: 0000000096000005 [#1] SMP
[  297.715393] CPU: 2 UID: 0 PID: 45 Comm: kworker/u16:2 Tainted: G           O       6.12.50 #0
[  297.723908] Tainted: [O]=OOT_MODULE
[  297.727384] Hardware name: Banana Pi BPI-R4 (2x SFP+) (DT)
[  297.732857] Workqueue: nf_ft_offload_del nf_flow_rule_route_ipv6 [nf_flow_table]
[  297.740254] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[  297.747205] pc : mt76_wed_offload_disable+0x64/0xa0 [mt76]
[  297.752688] lr : mtk_wed_flow_remove+0x58/0x80
[  297.757126] sp : ffffffc080fe3ae0
[  297.760430] x29: ffffffc080fe3ae0 x28: ffffffc080fe3be0 x27: 00000000deadbef7
[  297.767557] x26: ffffff80c5ebca00 x25: 0000000000000001 x24: ffffff80c85f4c00
[  297.774683] x23: ffffff80c1875b78 x22: ffffffc080d42cd0 x21: ffffffc080660018
[  297.781809] x20: ffffff80c6a076d0 x19: ffffff80c6a043c8 x18: 0000000000000000
[  297.788935] x17: 0000000000000000 x16: 0000000000000001 x15: 0000000000000000
[  297.796060] x14: 0000000000000019 x13: ffffff80c0ad8ec0 x12: 00000000fa83b2da
[  297.803185] x11: ffffff80c02700c0 x10: ffffff80c0ad8ec0 x9 : ffffff81fef96200
[  297.810311] x8 : ffffff80c02700c0 x7 : ffffff80c02700d0 x6 : 0000000000000002
[  297.817435] x5 : 0000000000000400 x4 : 0000000000000000 x3 : 0000000000000000
[  297.824561] x2 : 0000000000000001 x1 : 0000000000000800 x0 : ffffff80c6a063c8
[  297.831686] Call trace:
[  297.834123]  mt76_wed_offload_disable+0x64/0xa0 [mt76]
[  297.839254]  mtk_wed_flow_remove+0x58/0x80
[  297.843342]  mtk_flow_offload_cmd+0x434/0x574
[  297.847689]  mtk_wed_setup_tc_block_cb+0x30/0x40
[  297.852295]  nf_flow_offload_ipv6_hook+0x7f4/0x964 [nf_flow_table]
[  297.858466]  nf_flow_rule_route_ipv6+0x438/0x4a4 [nf_flow_table]
[  297.864463]  process_one_work+0x174/0x300
[  297.868465]  worker_thread+0x278/0x430
[  297.872204]  kthread+0xd8/0xdc
[  297.875251]  ret_from_fork+0x10/0x20
[  297.878820] Code: 928b5ae0 8b000273 91400a60 f943fa61 (79401421)
[  297.884901] ---[ end trace 0000000000000000 ]---

Fix the issue by detecting the proper wed reference to use when running the
wed callbacks.

Fixes: 83eafc9251 ("wifi: mt76: mt7996: add wed tx support")
Tested-by: Daniel Pawlik <pawlik.dan@gmail.com>
Tested-by: Matteo Croce <teknoraver@meta.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251008-wed-fixes-v1-1-8f7678583385@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:53 +01:00
Shayne Chen
b05ab4be9f wifi: mt76: mt7915: add bf backoff limit table support
The commit 22b980badc ("mt76: add functions for parsing rate power limits
from DT") introduced generic support for rate limits in the devicetree.
But the mt7915 supports beamforming and has another table for configuring
the backoff limits. These can be configured in the DT with the paths-*
properties. The path-*-bf properties are the ones relevant for beamforming,
and the ones without the -bf suffix are for "traditional" path backoff.

Signed-off-by: Shayne Chen <shayne.chen@mediatek.com>
Signed-off-by: Sven Eckelmann (Plasma Cloud) <se@simonwunderlich.de>
Link: https://patch.msgid.link/20251007-backoff-table-support-v3-3-fd6e2684988f@simonwunderlich.de
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:53 +01:00
Sven Eckelmann (Plasma Cloud)
6b9833c611 dt-bindings: net: wireless: mt76: introduce backoff limit properties
Introduce path backoff limit properties in mt76 binding in order to specify
beamforming and non-beamforming backoff limits for 802.11n/ac/ax.

Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Sven Eckelmann (Plasma Cloud) <se@simonwunderlich.de>
Link: https://patch.msgid.link/20251007-backoff-table-support-v3-2-fd6e2684988f@simonwunderlich.de
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:53 +01:00
Sven Eckelmann (Plasma Cloud)
9a04a69b68 dt-bindings: net: wireless: mt76: Document power-limits country property
The commit 22b980badc ("mt76: add functions for parsing rate power limits
from DT") added filtering of the power limits based on two properties:

* regdomain
* country

If either the country or the regdomain matches, the power limits are
applied and the search is aborted. If neither of the two is defined for a
power limit, it is a global (or "fallback") power limit. The last
"fallback" power limit in the list will be returned when no matching
regdomain or country was found.

The idea here is to allow "overwriting" country limits to be specified at
the front of the list - just in case a regdomain is shared but a country
has additional limitations.

But this property was never defined in commit 2de6ccebe0
("dt-bindings:net:wireless:mediatek,mt76: introduce power-limits node").

Signed-off-by: Sven Eckelmann (Plasma Cloud) <se@simonwunderlich.de>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://patch.msgid.link/20251007-backoff-table-support-v3-1-fd6e2684988f@simonwunderlich.de
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:53 +01:00
Sven Eckelmann (Plasma Cloud)
38b845e1f9 wifi: mt76: Fix DTS power-limits on little endian systems
The power-limits for ru and mcs are stored in the devicetree as bytewise
arrays (often with sizes which are not a multiple of 4). These arrays have a
prefix which defines how many modes a line applies to. This prefix is also
only a byte - but the code still tried to fix the endianness of this byte
with a be32 operation. As a result, loading mostly failed or sent completely
unexpected values to the firmware.

Since the other rates are also stored in the devicetree as bytewise arrays,
just drop the u32 access + be32_to_cpu conversion and directly access them
as byte arrays.
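
A minimal sketch of the access change (the property name, node pointer and
helper below are illustrative, not the actual mt76 parser code):

#include <linux/of.h>
#include <linux/types.h>

/* Hypothetical helper: read the one-byte mode-count prefix directly. */
static u8 read_mode_count(const struct device_node *np)
{
	int len;
	const u8 *data = of_get_property(np, "rates-mcs", &len);

	if (!data || len < 1)
		return 0;

	/* The prefix is a single byte; the old code read it as a __be32,
	 * which returned garbage on little-endian systems. */
	return data[0];
}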

Cc: stable@vger.kernel.org
Fixes: 22b980badc ("mt76: add functions for parsing rate power limits from DT")
Fixes: a9627d992b ("mt76: extend DT rate power limits to support 11ax devices")
Signed-off-by: Sven Eckelmann (Plasma Cloud) <se@simonwunderlich.de>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:53 +01:00
Thorsten Blum
8c5b063061 wifi: mt76: connac: Replace memcpy + hard-coded size with strscpy
Replace memcpy() and the hard-coded string length with strscpy() to
safely copy the string and improve mt76_connac_mcu_chip_config().

No functional changes.
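
An illustrative sketch of the pattern (the buffer and string here are
placeholders, not the actual mt76_connac_mcu_chip_config() fields):

#include <linux/string.h>

static void fill_cmd(char *cmd, size_t cmd_size)
{
	/* before: memcpy(cmd, "assert", 6);  hard-coded length, no NUL guarantee */
	strscpy(cmd, "assert", cmd_size);     /* bounded copy, always NUL-terminated */
}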

Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Link: https://patch.msgid.link/20250923213831.1896823-2-thorsten.blum@linux.dev
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:53 +01:00
Lorenzo Bianconi
084922069c wifi: mt76: mt7996: Remove unnecessary link_id checks in mt7996_tx
Remove unnecessary link_id checks in the mt7996_tx routine: if the link
identifier provided by mac80211 is unspecified, the value is overwritten at
the beginning of the function anyway.

Fixes: f940c9b7ae ("wifi: mt76: mt7996: Set proper link destination address in mt7996_tx()")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20250924-mt76_tx_unnecessary-check-v1-1-e595930a5662@kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:53 +01:00
Felix Fietkau
79277f8ad1 wifi: mt76: mt7996: fix null pointer deref in mt7996_conf_tx()
If a link does not have an assigned channel yet, mt7996_vif_link returns
NULL. We still need to store the updated queue settings in that case, and
apply them later.
Move the location of the queue params to within struct mt7996_vif_link.

Fixes: c0df2f0caa ("wifi: mt76: mt7996: prepare mt7996_mcu_set_tx for MLO support")
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20250929111723.52486-1-nbd@nbd.name
Signed-off-by: Felix Fietkau <nbd@nbd.name>
2025-11-24 14:37:53 +01:00
Hans de Goede
a8e5a110c0 wifi: brcmfmac: Add DMI nvram filename quirk for Acer A1 840 tablet
The Acer A1 840 tablet contains quite generic names in the sys_vendor and
product_name DMI strings. Without this patch brcmfmac will try to load
brcmfmac43340-sdio.Insyde-BayTrail.txt as the nvram file, which is a bit
too generic.

Add a DMI quirk so that a unique and clearly identifiable nvram file name
is used on the Acer A1 840 tablet.
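
A generic sketch of how such a DMI quirk is expressed (the match strings and
driver_data value here are illustrative, not the exact entry added for the
Acer A1 840):

#include <linux/dmi.h>

static const struct dmi_system_id quirk_table[] = {
	{
		.matches = {
			DMI_MATCH(DMI_SYS_VENDOR, "Insyde"),
			DMI_MATCH(DMI_PRODUCT_NAME, "BayTrail"),
			/* the real quirk adds a more specific match so only
			 * the Acer A1 840 is affected */
		},
		.driver_data = (void *)"acer,a1-840",
	},
	{ /* sentinel */ }
};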

Acked-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Hans de Goede <hansg@kernel.org>
Link: https://patch.msgid.link/20251103100314.353826-1-hansg@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-24 13:07:11 +01:00
Johannes Berg
9f33477b9a wifi: cfg80211: stop radar detection in cfg80211_leave()
If an interface is set down or, per the previous patch, changes
type, radar detection for it should be cancelled. This is done
for AP mode in mac80211 (somewhat needlessly, since cfg80211 can
do it, but didn't until now), but wasn't handled for mesh. So if
radar detection was started and then the interface was set down or
its type switched (the latter sometimes happening in the hwsim
test 'mesh_peer_connected_dfs'), radar detection would still be
running on an interface unknown to the driver, later leading to
some warnings around chanctx usage.

Link: https://patch.msgid.link/20251121174021.290120e419e3.I2a5650c9062e29c988992dd8ce0d8eb570d23267@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-24 13:05:23 +01:00
Johannes Berg
7a27b73943 wifi: cfg80211: use cfg80211_leave() in iftype change
When changing the interface type, all activity on the interface has
to be stopped first. This was done independent of existing code in
cfg80211_leave(), so didn't handle e.g. background radar detection.
Use cfg80211_leave() to handle it the same way.

Note that cfg80211_leave() behaves slightly differently for IBSS in
wireless extensions, it won't send an event in that case. We could
handle that, but since nl80211 was used to change the type, IBSS is
rare, and wext is already a corner case, it doesn't seem worth it.

Link: https://patch.msgid.link/20251121174021.922ef48ce007.I970c8514252ef8a864a7fbdab9591b71031dee03@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-24 13:05:23 +01:00
Johannes Berg
de3c514234 wifi: mac80211: fix channel switching code
My prior commit here introduced a bug due to copy/paste: the code
was iterating the links assigned to 'ctx->replace_ctx', and I
accidentally replaced that with iterating the links assigned to
'ctx', then modified it for the later iteration.

Fix it to iterate the users of the correct chanctx, i.e.
'ctx->replace_ctx'.

Ultimately, this issue led to a crash in a hwsim test
(multi_ap_wps_shared_apdev_csa) because it would actually
do the switch (rather than refuse here) and then later
have a double-free of the original chanctx, because it
was still in use by another interface yet freed as part
of the switching.

Fixes: a1dc648aa7 ("wifi: mac80211: remove chanctx to link back-references")
Link: https://patch.msgid.link/20251121113733.7710a58d45eb.Ie9ec010b52b1baed93dbe44f968c2119b6b5d98d@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-24 13:05:11 +01:00
Johannes Berg
49a9feaf30 Merge tag 'rtw-next-2025-11-21-v2' of https://github.com/pkshih/rtw
Ping-Ke Shih says:
==================
rtw-next patches for v6.19

Main changes are about rtw89 USB support, for which two USB devices are
added with proper TX status; other notable items are listed below.

rtl8xxxu:

 - fix 40MHz bandwidth connection

rtw89:

 - support USB devices RTL8852AU and RTL8852CU

 - report TX status from air for USB devices

 - resolve racing between processes of TX and TX report

 - resolve racing of skb queue of C2H events

 - support injected packets with bandwidth and data rate

 - more materials for coming RTL8922DE
==================

Link: https://patch.msgid.link/45eed1763a354460acba15a8e69f9e3e@realtek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-24 13:02:21 +01:00
Gao Xiang
d53cd891f0 erofs: limit the level of fs stacking for file-backed mounts
Otherwise, it could cause potential kernel stack overflow (e.g., EROFS
mounting itself).

Reviewed-by: Sheng Yong <shengyong1@xiaomi.com>
Fixes: fb17675026 ("erofs: add file-backed mount support")
Reviewed-by: Chao Yu <chao@kernel.org>
Reviewed-by: Hongbo Li <lihongbo22@huawei.com>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
2025-11-24 14:17:29 +08:00
Gao Xiang
ebe4f3f6eb erofs: correct FSDAX detection
The detection of the primary device is skipped incorrectly
if the multiple or flattened feature is enabled.

It also fixes the FSDAX misdetection for non-block extra blobs.

Fixes: c6993c4cb9 ("erofs: Fallback to normal access if DAX is not supported on extra device")
Reported-and-tested-by: syzbot+31b8fb02cb8a25bd5e78@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/r/691af9f6.a70a0220.3124cb.0097.GAE@google.com
Cc: Yuezhang Mo <Yuezhang.Mo@sony.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
2025-11-24 14:16:30 +08:00
Alexei Starovoitov
c427320873 Merge branch 'bpf-nested-rcu-critical-sections'
Puranjay Mohan says:

====================
bpf: Nested rcu critical sections

v1: https://lore.kernel.org/bpf/20250916113622.19540-1-puranjay@kernel.org/
Changes in v1->v2:
- Move the addition of new tests to a separate patch (Alexei)
- Avoid incrementing active_rcu_locks at two places (Eduard)

Support nested rcu critical sections by making the boolean flag
active_rcu_lock a counter and using it to manage rcu critical section
state. bpf_rcu_read_lock() increments this counter and
bpf_rcu_read_unlock() decrements it; the MEM_RCU -> PTR_UNTRUSTED transition
happens when active_rcu_locks drops to 0.
====================

Link: https://patch.msgid.link/20251117200411.25563-1-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 18:35:00 -08:00
Puranjay Mohan
cf49ec5705 selftests: bpf: Add tests for unbalanced rcu_read_lock
As verifier now supports nested rcu critical sections, add new test
cases to make sure unbalanced usage of rcu_read_lock()/unlock() is
rejected.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251117200411.25563-3-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 18:34:59 -08:00
Puranjay Mohan
4167096cb9 bpf: support nested rcu critical sections
Currently, nested rcu critical sections are rejected by the verifier and
rcu_lock state is managed by a boolean variable. Add support for nested
rcu critical sections by making active_rcu_locks a counter similar to
active_preempt_locks. bpf_rcu_read_lock() increments this counter and
bpf_rcu_read_unlock() decrements it; the MEM_RCU -> PTR_UNTRUSTED transition
happens when active_rcu_locks drops to 0.
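
A hedged sketch of a program shape the verifier now accepts (the kfunc
declarations follow the usual selftest style; the program body is
illustrative):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;

SEC("tc")
int nested_rcu(struct __sk_buff *skb)
{
	bpf_rcu_read_lock();
	bpf_rcu_read_lock();    /* previously rejected: nesting not allowed */
	/* RCU-protected pointers keep MEM_RCU anywhere in here */
	bpf_rcu_read_unlock();  /* counter 2 -> 1, still inside the section */
	bpf_rcu_read_unlock();  /* counter 1 -> 0, MEM_RCU -> PTR_UNTRUSTED */
	return 0;
}

char _license[] SEC("license") = "GPL";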

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251117200411.25563-2-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 18:34:59 -08:00
Jakub Kicinski
e05021a829 Merge branch 'net-stmmac-qcon-ethqos-rgmii-accessor-cleanups'
Russell King says:

====================
net: stmmac: qcon-ethqos: "rgmii" accessor cleanups

This series cleans up the "rgmii" accessors in qcom-ethqos.

readl() and writel() return and take a u32 for the value. Rather than
implicitly casting this to an int, keep it as a u32.

Add set/clear functions to reduce the code and make it easier to read.

Finally, convert the open-coded poll loops to use the iopoll helpers.

Note that patch 1 has a checkpatch warning concerning "volatile" -
I'm changing the type here, and the "volatile" is removed in patch 3.
I do not feel it is appropriate to remove it in patch 1.
====================

Link: https://patch.msgid.link/aR76i0HjXitfl7xk@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-21 18:13:46 -08:00
Russell King (Oracle)
9b60ba512c net: stmmac: qcom-ethqos: use read_poll_timeout_atomic()
Use read_poll_timeout_atomic() to poll the rgmii registers rather than
open-coding the polling.
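
A sketch of the conversion pattern, assuming rgmii_readl() keeps its
(ethqos, offset) signature; RGMII_STATUS_REG and RGMII_STATUS_DONE are
placeholder names, not the driver's real registers:

#include <linux/iopoll.h>

static int wait_rgmii_done(struct qcom_ethqos *ethqos)
{
	u32 val;

	/* poll every 10 us, give up after 10 ms */
	return read_poll_timeout_atomic(rgmii_readl, val,
					val & RGMII_STATUS_DONE,
					10, 10000, false,
					ethqos, RGMII_STATUS_REG);
}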

Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vM2n1-0000000FRTu-0js9@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-21 18:13:44 -08:00
Russell King (Oracle)
819212185a net: stmmac: qcom-ethqos: add rgmii set/clear functions
The driver has a lot of bit manipulation of the RGMII registers. Add
a pair of helpers to set bits and clear bits, converting the various
calls to rgmii_updatel() as appropriate.

Most of the change was done via this sed script:

/rgmii_updatel/ {
	N
	/,$/N
	/mask, / ! {
		s|rgmii_updatel\(([^,]*,\s+([^,]*),\s+)\2,\s+|rgmii_setmask(\1|
		s|rgmii_updatel\(([^,]*,\s+([^,]*),\s+)0,\s+|rgmii_clrmask(\1|
		s|^\s+$||
	}
}

and then formatting tweaked where necessary.
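
The resulting helpers look roughly like this (a sketch assuming the existing
rgmii_updatel(ethqos, mask, val, offset) signature):

static void rgmii_setmask(struct qcom_ethqos *ethqos, u32 mask,
			  unsigned int offset)
{
	rgmii_updatel(ethqos, mask, mask, offset);
}

static void rgmii_clrmask(struct qcom_ethqos *ethqos, u32 mask,
			  unsigned int offset)
{
	rgmii_updatel(ethqos, mask, 0, offset);
}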

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Link: https://patch.msgid.link/E1vM2mw-0000000FRTo-0End@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-21 18:13:44 -08:00
Russell King (Oracle)
f54bbd390f net: stmmac: qcom-ethqos: use u32 for rgmii read/write/update
readl() returns a u32, and writel() takes a "u32" for the value. These
are used in rgmii_readl()() and rgmii_writel(), but the value and
return are "int". As these are 32-bit register values which are not
signed, use "u32".

These changes do not cause generated code changes.

Update rgmii_updatel() to use u32 for mask and val. Changing "mask"
to "u32" also does not cause generated code changes. However, changing
"val" causes the generated assembly to be re-ordered for aarch64.

Update the temporary variables used with the rgmii functions to use
u32.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Link: https://patch.msgid.link/E1vM2mq-0000000FRTi-3y5F@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-21 18:13:44 -08:00
Slark Xiao
501253b61d net: wwan: t7xx: Make local function static
This function was used in t7xx_hif_cldma.c only. Make it static
as it should be.

Signed-off-by: Slark Xiao <slark_xiao@163.com>
Reviewed-by: Loic Poulain <loic.poulain@qualcomm.com>
Link: https://patch.msgid.link/20251120115208.345578-1-slark_xiao@163.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-21 18:09:43 -08:00
Eduard Zingerman
8f7cf305a1 bpf: test the correct stack liveness of tail calls
A new test is added: caller_stack_write_tail_call tests that the live
stack is correctly tracked for a tail call.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Martin Teichmann <martin.teichmann@xfel.eu>
Link: https://lore.kernel.org/r/20251119160355.1160932-5-martin.teichmann@xfel.eu
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 17:45:30 -08:00
Eduard Zingerman
e40f5a6bf8 bpf: correct stack liveness for tail calls
This updates bpf_insn_successors() to reflect that control flow might
jump over the instructions between a tail call and the function exit;
otherwise the verifier might assume that some writes to the parent stack
always happen, which is not the case.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Martin Teichmann <martin.teichmann@xfel.eu>
Link: https://lore.kernel.org/r/20251119160355.1160932-4-martin.teichmann@xfel.eu
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 17:45:30 -08:00
Martin Teichmann
978da762ea bpf: test the proper verification of tail calls
Three tests are added:

- invalidate_pkt_pointers_by_tail_call checks that one can use the
  packet pointer after a tail call. This was originally possible
  and also poses no problems, but was made impossible by 1a4607ffba.

- invalidate_pkt_pointers_by_static_tail_call tests a corner case
  found by Eduard Zingerman during the discussion of the original fix,
  which was broken in that fix.

- subprog_result_tail_call tests that precision propagation works
  correctly across tail calls. This did not work before.

Signed-off-by: Martin Teichmann <martin.teichmann@xfel.eu>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251119160355.1160932-3-martin.teichmann@xfel.eu
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 17:45:30 -08:00
Martin Teichmann
e3245f8990 bpf: properly verify tail call behavior
A successful ebpf tail call does not return to the caller, but to the
caller-of-the-caller, often just finishing the ebpf program altogether.

Any restrictions that the verifier needs to take into account - notably
the fact that the tail call might have modified packet pointers - are to
be checked on the caller-of-the-caller. Checking it on the caller made
the verifier refuse perfectly fine programs that would use the packet
pointers after a tail call, which is no problem as this code is only
executed if the tail call was unsuccessful, i.e. nothing happened.

This patch simulates the behavior of a tail call in the verifier. A
conditional jump to the code after the tail call is added for the case
of an unsuccessful tail call, and a return to the caller is simulated for
a successful tail call.

For the successful case we assume that the tail call returns an int,
as tail calls are currently only allowed in functions that return an
int. We always assume that the tail call modified the packet pointers,
as we do not know what the tail call did.

For the unsuccessful case we know nothing happened, so we do not need to
add new constraints.

This approach also allows checking other problems that may occur with
tail calls: we are now able to check that precision is properly
propagated into subprograms using tail calls, as well as checking the
live slots in such a subprogram.
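
A hedged illustration of the kind of program that is now accepted: packet
pointers loaded before a tail call are only reused on the failure path (map
and program names here are illustrative, not from the actual selftests):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u32);
} jmp_table SEC(".maps");

SEC("xdp")
int use_pkt_after_tail_call(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;

	bpf_tail_call(ctx, &jmp_table, 0); /* returns only if the call failed */

	/* nothing ran, so the packet pointers above are still usable */
	if (data + 1 > data_end)
		return XDP_DROP;
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";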

Fixes: 1a4607ffba ("bpf: consider that tail calls invalidate packet pointers")
Link: https://lore.kernel.org/bpf/20251029105828.1488347-1-martin.teichmann@xfel.eu/
Signed-off-by: Martin Teichmann <martin.teichmann@xfel.eu>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251119160355.1160932-2-martin.teichmann@xfel.eu
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 17:45:30 -08:00
Anton Protopopov
4dd3a48d13 bpf: Add a check to make static analysers happy
In [1] Dan Carpenter reported that the following code makes the
Smatch static analyser unhappy:

        17904       value = map->ops->map_lookup_elem(map, &i);
        17905       if (!value)
        17906               return -EINVAL;
    --> 17907       items[i - start] = value->xlated_off;

The analyser assumes that the `value` variable may contain an error
and thus it should be properly checked before the dereference.
In practice this will never happen as array maps do not return
error values in map_lookup_elem, but to make Smatch and other
possible analysers happy this patch adds a formal check.

Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Closes: https://lore.kernel.org/bpf/aR2BN1Ix--8tmVrN@stanley.mountain/ [1]
Fixes: 493d9e0d60 ("bpf, x86: add support for indirect jumps")
Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20251119112517.1091793-1-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 17:01:14 -08:00
Xing Guo
b7f7d76d6e selftests/bpf: Update test_tag to use sha256
commit 603b441623 ("bpf: Update the bpf_prog_calc_tag to use SHA256")
changed the digest of prog_tag to SHA256 but forgot to update the tests
correspondingly. Fix it.

Fixes: 603b441623 ("bpf: Update the bpf_prog_calc_tag to use SHA256")
Signed-off-by: Xing Guo <higuoxing@gmail.com>
Link: https://lore.kernel.org/r/20251121061458.3145167-1-higuoxing@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 16:56:08 -08:00
Matt Bobrowski
ae24fc8a16 selftests/bpf: Improve reliability of test_perf_branches_no_hw()
Currently, test_perf_branches_no_hw() relies on the busy loop within
test_perf_branches_common() being slow enough to allow at least one
perf event sample tick to occur before starting to tear down the
backing perf event BPF program. With a relatively small fixed
iteration count of 1,000,000, this is not guaranteed on modern fast
CPUs, resulting in the test run to subsequently fail with the
following:

bpf_testmod.ko is already unloaded.
Loading bpf_testmod.ko...
Successfully loaded bpf_testmod.ko.
test_perf_branches_common:PASS:test_perf_branches_load 0 nsec
test_perf_branches_common:PASS:attach_perf_event 0 nsec
test_perf_branches_common:PASS:set_affinity 0 nsec
check_good_sample:PASS:output not valid 0 nsec
check_good_sample:PASS:read_branches_size 0 nsec
check_good_sample:PASS:read_branches_stack 0 nsec
check_good_sample:PASS:read_branches_stack 0 nsec
check_good_sample:PASS:read_branches_global 0 nsec
check_good_sample:PASS:read_branches_global 0 nsec
check_good_sample:PASS:read_branches_size 0 nsec
test_perf_branches_no_hw:PASS:perf_event_open 0 nsec
test_perf_branches_common:PASS:test_perf_branches_load 0 nsec
test_perf_branches_common:PASS:attach_perf_event 0 nsec
test_perf_branches_common:PASS:set_affinity 0 nsec
check_bad_sample:FAIL:output not valid no valid sample from prog
Summary: 0/1 PASSED, 0 SKIPPED, 1 FAILED
Successfully unloaded bpf_testmod.ko.

On a modern CPU (i.e. one with a 3.5 GHz clock rate), executing 1
million increments of a volatile integer can take significantly less
than 1 millisecond. If the spin loop and detachment of the perf event
BPF program elapse before the first 1 ms sampling interval elapses,
the perf event will never end up firing. Fix this by bumping the loop
iteration counter a little within test_perf_branches_common(), along
with adding another loop termination condition which is directly
influenced by the backing perf event BPF program executing. Notably, a
conscious decision was made to not adjust the sample_freq value, as that
is just not a reliable way to go about fixing the problem. It
effectively still leaves the race window open.

Fixes: 67306f84ca ("selftests/bpf: Add bpf_read_branch_records() selftest")
Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20251119143540.2911424-1-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 16:49:16 -08:00
Matt Bobrowski
27746aaf1b selftests/bpf: skip test_perf_branches_hw() on unsupported platforms
Gracefully skip the test_perf_branches_hw subtest on platforms that
do not support LBR or require specialized perf event attributes
to enable branch sampling.

For example, AMD's Milan (Zen 3) supports BRS rather than traditional
LBR. This requires specific configurations (attr.type = PERF_TYPE_RAW,
attr.config = RETIRED_TAKEN_BRANCH_INSTRUCTIONS) that differ from the
generic setup used within this test. Notably, it also probably doesn't
hold much value to special-case perf event configurations for selected
microarchitectures.

Fixes: 67306f84ca ("selftests/bpf: Add bpf_read_branch_records() selftest")
Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20251120142059.2836181-1-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 16:47:58 -08:00
Alexei Starovoitov
878ee3c325 Merge branch 'bpf-arm64-indirect-jumps'
Puranjay Mohan says:

====================
bpf: arm64: Indirect jumps

Changes in v1->v2:
v1: https://lore.kernel.org/all/20251117004656.33292-1-puranjay@kernel.org/
- Dropped patch 3 that was ignoring relocations for .jumptables. LLVM
  has been fixed to not emit relocations for .jumptables, so this patch
  is not needed.
- Added Reviewed-by: Anton Protopopov <a.s.protopopov@gmail.com>

This set adds the support of indirect jumps to the arm64 JIT. It
involves calling bpf_prog_update_insn_ptrs() to support instructions
array map. The second piece is supporting BPF_JMP|BPF_X|BPF_JA, SRC=0,
DST=Rx, off=0, imm=0 instruction that is trivial to implement on arm64.

The final patch enables selftests on arm64:

 [root@localhost bpf]# ./test_progs-cpuv4 -a "*gotox*"
 #20/1    bpf_gotox/one-switch:OK
 #20/2    bpf_gotox/one-switch-non-zero-sec-offset:OK
 #20/3    bpf_gotox/two-switches:OK
 #20/4    bpf_gotox/big-jump-table:OK
 #20/5    bpf_gotox/static-global:OK
 #20/6    bpf_gotox/nonstatic-global:OK
 #20/7    bpf_gotox/other-sec:OK
 #20/8    bpf_gotox/static-global-other-sec:OK
 #20/9    bpf_gotox/nonstatic-global-other-sec:OK
 #20/10   bpf_gotox/one-jump-two-maps:OK
 #20/11   bpf_gotox/one-map-two-jumps:OK
 #20      bpf_gotox:OK
 #537/1   verifier_gotox/jump_table_ok:OK
 #537/2   verifier_gotox/jump_table_reserved_field_src_reg:OK
 #537/3   verifier_gotox/jump_table_reserved_field_non_zero_off:OK
 #537/4   verifier_gotox/jump_table_reserved_field_non_zero_imm:OK
 #537/5   verifier_gotox/jump_table_no_jump_table:OK
 #537/6   verifier_gotox/jump_table_incorrect_dst_reg_type:OK
 #537/7   verifier_gotox/jump_table_invalid_read_size_u32:OK
 #537/8   verifier_gotox/jump_table_invalid_read_size_u16:OK
 #537/9   verifier_gotox/jump_table_invalid_read_size_u8:OK
 #537/10  verifier_gotox/jump_table_misaligned_access:OK
 #537/11  verifier_gotox/jump_table_invalid_mem_acceess_pos:OK
 #537/12  verifier_gotox/jump_table_invalid_mem_acceess_neg:OK
 #537/13  verifier_gotox/jump_table_add_sub_ok:OK
 #537/14  verifier_gotox/jump_table_no_writes:OK
 #537/15  verifier_gotox/jump_table_use_reg_r0:OK
 #537/16  verifier_gotox/jump_table_use_reg_r1:OK
 #537/17  verifier_gotox/jump_table_use_reg_r2:OK
 #537/18  verifier_gotox/jump_table_use_reg_r3:OK
 #537/19  verifier_gotox/jump_table_use_reg_r4:OK
 #537/20  verifier_gotox/jump_table_use_reg_r5:OK
 #537/21  verifier_gotox/jump_table_use_reg_r6:OK
 #537/22  verifier_gotox/jump_table_use_reg_r7:OK
 #537/23  verifier_gotox/jump_table_use_reg_r8:OK
 #537/24  verifier_gotox/jump_table_use_reg_r9:OK
 #537/25  verifier_gotox/jump_table_outside_subprog:OK
 #537/26  verifier_gotox/jump_table_contains_non_unique_values:OK
 #537     verifier_gotox:OK
 Summary: 2/37 PASSED, 0 SKIPPED, 0 FAILED
====================

Link: https://patch.msgid.link/20251117130732.11107-1-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 16:40:22 -08:00
Puranjay Mohan
d8774a3623 selftests: bpf: Enable gotox tests from arm64
arm64 JIT now supports gotox instruction and jumptables, so run tests in
verifier_gotox.c for arm64.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Reviewed-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20251117130732.11107-4-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 16:40:21 -08:00
Puranjay Mohan
f4a66cf1cb bpf: arm64: Add support for indirect jumps
Add support for a new instruction

	BPF_JMP|BPF_X|BPF_JA, SRC=0, DST=Rx, off=0, imm=0

which does an indirect jump to a location stored in Rx.  The register
Rx should have type PTR_TO_INSN. This new type assures that the Rx
register contains a value (or a range of values) loaded from a
correct jump table – map of type instruction array.

ARM64 JIT supports indirect jumps to all registers through the A64_BR()
macro, use it to implement this new instruction.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Reviewed-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Xu Kuohai <xukuohai@huawei.com>
Link: https://lore.kernel.org/r/20251117130732.11107-3-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 16:40:21 -08:00
Puranjay Mohan
84b1c40d5b bpf: arm64: Add support for instructions array
Add support for the instructions array map type in the arm64 JIT by
calling bpf_prog_update_insn_ptrs() with the offsets that map
xlated_offset to the jited_offset in the final image. arm64 JIT already
has this offset array which was being used for
bpf_prog_fill_jited_linfo() and can be used directly for
bpf_prog_update_insn_ptrs.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Reviewed-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Xu Kuohai <xukuohai@huawei.com>
Link: https://lore.kernel.org/r/20251117130732.11107-2-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-21 16:40:21 -08:00
Martin KaFai Lau
792f258803 Merge branch 'selftests-bpf-networking-test-cleanups'
Hoyeon Lee says:

====================
selftests/bpf: networking test cleanups

This series finishes the sockaddr_storage migration in the networking
selftests by removing the remaining open-coded IPv4/IPv6 wrappers
(addr_port/tuple in cls_redirect, sa46 in select_reuseport). The tests
now use sockaddr_storage directly. No other custom socket-address
wrappers remain after this series, so the churn stops here and behavior
is unchanged.
====================

Link: https://patch.msgid.link/20251121081332.2309838-1-hoyeon.lee@suse.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2025-11-21 10:46:35 -08:00
Hoyeon Lee
db354a1577 selftests/bpf: Use sockaddr_storage instead of sa46 in select_reuseport test
The select_reuseport selftest uses a custom sa46 union to represent
IPv4 and IPv6 addresses. This custom wrapper requires extra manual
handling for address family and field extraction.

Replace sa46 with sockaddr_storage and update the helper functions to
operate on native socket structures. This simplifies the code and
removes unnecessary custom address-handling logic. No functional
changes intended.
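
A hedged sketch of the sockaddr_storage pattern the tests move to (the helper
name is illustrative, not taken from the selftest sources):

#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/in.h>

static socklen_t fill_loopback(struct sockaddr_storage *ss, int family,
			       unsigned short port)
{
	memset(ss, 0, sizeof(*ss));
	if (family == AF_INET) {
		struct sockaddr_in *a4 = (struct sockaddr_in *)ss;

		a4->sin_family = AF_INET;
		a4->sin_port = htons(port);
		a4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		return sizeof(*a4);
	} else {
		struct sockaddr_in6 *a6 = (struct sockaddr_in6 *)ss;

		a6->sin6_family = AF_INET6;
		a6->sin6_port = htons(port);
		a6->sin6_addr = in6addr_loopback;
		return sizeof(*a6);
	}
}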

Reviewed-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Hoyeon Lee <hoyeon.lee@suse.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251121081332.2309838-3-hoyeon.lee@suse.com
2025-11-21 10:46:31 -08:00
Hoyeon Lee
fd6ed07a05 selftests/bpf: Use sockaddr_storage directly in cls_redirect test
The cls_redirect test uses a custom addr_port/tuple wrapper to represent
IPv4/IPv6 addresses and ports. This custom wrapper requires extra
conversion logic and specific helpers such as fill_addr_port(), which
are no longer necessary when using standard socket address structures.

This commit replaces addr_port/tuple with the standard sockaddr_storage
so the test handles address families and ports using native socket types.
It removes the custom helper, eliminates redundant casts, and simplifies
the setup helpers without functional changes. set_up_conn() and
build_input() now take src/dst sockaddr_storage directly.

Reviewed-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Hoyeon Lee <hoyeon.lee@suse.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251121081332.2309838-2-hoyeon.lee@suse.com
2025-11-21 10:46:01 -08:00
Bitterblue Smith
dbf9b7bb0e wifi: rtl8xxxu: Enable 40 MHz width by default
40 MHz support is hidden behind the ht40_2g module parameter with
this comment:

/*
 * Some APs will negotiate HT20_40 in a noisy environment leading
 * to miserable performance. Rather than defaulting to this, only
 * enable it if explicitly requested at module load time.
 */

This parameter was added in commit 26f1fad29a ("New driver:
rtl8xxxu (mac80211)"). Back then rtl8xxxu only supported RTL8723AU
and the RTL8192CU family. It's entirely possible the miserable
performance was due to mistakes in the channel switching function,
which were fixed in a previous patch.

Delete the ht40_2g module parameter. If someone still needs to
disable 40 MHz support, cfg80211 has the module parameter
cfg80211_disable_40mhz_24ghz. That works too.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/4f053103-adfd-4ead-acb3-ef69127a4bab@gmail.com
2025-11-21 13:33:41 +08:00
Bitterblue Smith
41a21d0ff3 wifi: rtl8xxxu: Fix RX channel width reported by RTL8192FU
The other chips report the RX channel width in the RX descriptor,
but this one doesn't.

Get the RX channel width from the PHY status.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/1c6c1fd4-92f6-4327-a24e-f0747ab21819@gmail.com
2025-11-21 13:33:29 +08:00
Bitterblue Smith
fc44314a37 wifi: rtl8xxxu: Fix the 40 MHz subchannel for RTL8192EU, RTL8723BU
rtl8xxxu_gen2_config_channel() was missing the subchannel setting.
This function is used by RTL8192EU and RTL8723BU.

This change seems to make no difference in my testing on channel 13
with either chip.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/a5de8d39-45c1-4667-ab4c-7109de6eb13d@gmail.com
2025-11-21 13:33:18 +08:00
Bitterblue Smith
bdb4c850c3 wifi: rtl8xxxu: Make RTL8192CU, RTL8723AU TX with 40 MHz width
Set the required fields in the TX descriptor to allow these chips to
transmit with 40 MHz channel width when the access point supports it.

Tested only with RTL8192CU, but these settings are identical for
RTL8723AU.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/30d95228-69b2-48f9-8854-c98d2408c4d3@gmail.com
2025-11-21 13:33:07 +08:00
Bitterblue Smith
5511ba3de4 wifi: rtl8xxxu: Fix HT40 channel config for RTL8192CU, RTL8723AU
Flip the response rate subchannel. It was backwards, causing low
speeds when using 40 MHz channel width. "iw dev ... station dump"
showed a low RX rate, 11M or less.

Also fix the channel width field of RF6052_REG_MODE_AG.

Tested only with RTL8192CU, but these settings are identical for
RTL8723AU.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/1f46571d-855b-43e1-8bfc-abacceb96043@gmail.com
2025-11-21 13:32:55 +08:00
Ping-Ke Shih
2a2aae3655 wifi: rtw89: 8852a: correct field mask of reset DAC/ADC FIFO
The field mask should be bits 16-31, but the code mistakenly used bits
24-31, rarely causing slight performance degradation if the DAC/ADC FIFO
stays in an unexpected state.

Found by Geert, who works on bit field functions.

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Closes: https://lore.kernel.org/linux-wireless/CAMuHMdVt+5yOA6tuasX4KQgZud5wtRwu0A15UkEfQJbcd_xvVw@mail.gmail.com/
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251120031044.12493-2-pkshih@realtek.com
2025-11-21 11:53:51 +08:00
Marco Crivellari
9c194fe462 wifi: rtw88: add WQ_UNBOUND to alloc_workqueue users
Currently, if a user enqueues a work item using schedule_delayed_work(), the
wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses
WORK_CPU_UNBOUND (used when a CPU is not specified). The same applies to
schedule_work(), which uses system_wq, and queue_work(), which again
makes use of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the API.
For more details see the Link tag below.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND.

This default is suboptimal: most workloads benefit from unbound queues,
allowing the scheduler to place worker threads where they’re needed and
reducing noise when CPUs are isolated.

This continues the effort to refactor workqueue APIs, which began with
the introduction of new workqueues and a new alloc_workqueue flag in:

commit 128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

This change adds the WQ_UNBOUND flag to explicitly request
alloc_workqueue() to be unbound, because this specific workload has no
benefit being per-cpu.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU.

Once migration is complete, WQ_UNBOUND can be removed and unbound will
become the implicit default.
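
The change itself is a one-flag adjustment; a sketch with an illustrative
queue name:

	/* before: implicitly per-CPU */
	wq = alloc_workqueue("rtw_tx_wq", 0, 0);

	/* after: explicitly unbound, letting the scheduler place the worker */
	wq = alloc_workqueue("rtw_tx_wq", WQ_UNBOUND, 0);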

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251118102032.54375-3-marco.crivellari@suse.com
2025-11-21 11:46:58 +08:00
Marco Crivellari
7cbec00dc7 wifi: rtlwifi: add WQ_UNBOUND to alloc_workqueue users
Currently, if a user enqueues a work item using schedule_delayed_work(), the
wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses
WORK_CPU_UNBOUND (used when a CPU is not specified). The same applies to
schedule_work(), which uses system_wq, and queue_work(), which again
makes use of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the API.
For more details see the Link tag below.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND.

This default is suboptimal: most workloads benefit from unbound queues,
allowing the scheduler to place worker threads where they’re needed and
reducing noise when CPUs are isolated.

This continues the effort to refactor workqueue APIs, which began with
the introduction of new workqueues and a new alloc_workqueue flag in:

commit 128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

This change adds the WQ_UNBOUND flag to explicitly request
alloc_workqueue() to be unbound, because this specific workload has no
benefit being per-cpu.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU.

Once migration is complete, WQ_UNBOUND can be removed and unbound will
become the implicit default.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251118102032.54375-2-marco.crivellari@suse.com
2025-11-21 11:46:49 +08:00
Seungjin Bae
b647d2574e wifi: rtl818x: rtl8187: Fix potential buffer underflow in rtl8187_rx_cb()
The rtl8187_rx_cb() calculates the rx descriptor header address
by subtracting its size from the skb tail pointer.
However, it does not validate whether the received packet
(skb->len from urb->actual_length) is large enough to contain this
header.

If a truncated packet is received, this leads to a buffer
underflow, reading memory before the start of the skb data area
and causing a kernel panic.

Add length checks for both rtl8187 and rtl8187b descriptor headers
before attempting to access them, dropping the packet cleanly if the
check fails.
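
A minimal sketch of such a guard (hedged: the label and exact placement
inside rtl8187_rx_cb() are illustrative):

	if (skb->len < sizeof(struct rtl8187_rx_hdr)) {
		/* truncated packet: drop it instead of reading before
		 * the start of the skb data area */
		dev_kfree_skb_irq(skb);
		goto resubmit;
	}
	hdr = (struct rtl8187_rx_hdr *)(skb_tail_pointer(skb) - sizeof(*hdr));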

Fixes: 6f7853f3cb ("rtl8187: change rtl8187_dev.c to support RTL8187B (part 2)")
Signed-off-by: Seungjin Bae <eeodqql09@gmail.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251118013258.1789949-2-eeodqql09@gmail.com
2025-11-21 11:38:20 +08:00
Jakub Kicinski
e2c20036a8 Merge branch 'devlink-net-mlx5-implement-swp_l4_csum_mode-via-devlink-params'
Daniel Zahka says:

====================
devlink: net/mlx5: implement swp_l4_csum_mode via devlink params

This series introduces a new devlink feature for querying param
default values, and resetting params to their default values. This
feature is then used to implement a new mlx5 driver param.

The series starts with two pure refactor patches: one that passes
through the extack to devlink_param::get() implementations. And a
second small refactor that prepares the netlink tlv handling code in
the devlink_param::get() path to better handle default parameter
values.

The third patch introduces the uapi and driver api for default
parameter values. The driver api is opt-in, and both the uapi and
driver api preserve existing behavior when not used by drivers or
userspace.

The fourth patch introduces a new mlx5 driver param, swp_l4_csum_mode,
for controlling tx csum behavior. The "l4_only" value of this param is
a dependency for PSP initialization on CX7 NICs.

Lastly, the series introduces a new driver param with cmode runtime to
netdevsim, and then uses this param in a new testcase for netdevsim
devlink params.

Here are some examples of using the default param uapi with the devlink
cli. Note the devlink cli binary I am using has changes which I am
posting in an accompanying series targeting iproute2-next:

  # netdevsim
./devlink dev param show netdevsim/netdevsim0
netdevsim/netdevsim0:
  name max_macs type generic
    values:
      cmode driverinit value 32 default 32
  name test1 type driver-specific
    values:
      cmode driverinit value true default true

  # set to false
./devlink dev param set netdevsim/netdevsim0 name test1 value false cmode driverinit
./devlink dev param show netdevsim/netdevsim0
netdevsim/netdevsim0:
  name max_macs type generic
    values:
      cmode driverinit value 32 default 32
  name test1 type driver-specific
    values:
      cmode driverinit value false default true

  # set back to default
./devlink dev param set netdevsim/netdevsim0 name test1 default cmode driverinit
./devlink dev param show netdevsim/netdevsim0
netdevsim/netdevsim0:
  name max_macs type generic
    values:
      cmode driverinit value 32 default 32
  name test1 type driver-specific
    values:
      cmode driverinit value true default true

 # mlx5 params on cx7
./devlink dev param show pci/0000:01:00.0
pci/0000:01:00.0:
  name max_macs type generic
    values:
      cmode driverinit value 128 default 128
...
  name swp_l4_csum_mode type driver-specific
    values:
      cmode permanent value default default default

  # set to l4_only
./devlink dev param set pci/0000:01:00.0 name swp_l4_csum_mode value l4_only cmode permanent
./devlink dev param show pci/0000:01:00.0 name swp_l4_csum_mode
pci/0000:01:00.0:
  name swp_l4_csum_mode type driver-specific
    values:
      cmode permanent value l4_only default default

  # reset to default
./devlink dev param set pci/0000:01:00.0 name swp_l4_csum_mode default cmode permanent
./devlink dev param show pci/0000:01:00.0 name swp_l4_csum_mode
pci/0000:01:00.0:
  name swp_l4_csum_mode type driver-specific
    values:
      cmode permanent value default default default
====================

Link: https://patch.msgid.link/20251119025038.651131-1-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 19:01:25 -08:00
Daniel Zahka
8be656cfb9 selftest: netdevsim: test devlink default params
Test querying default values and resetting to default values for
netdevsim devlink params.

This should cover the basic paths of interest: driverinit and
non-driverinit cmodes, as well as bool and non-bool value
type. Default param values of type bool are encoded with u8 netlink
type as opposed to flag type, so that userspace can distinguish
"not-present" from false.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251119025038.651131-7-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 19:01:23 -08:00
Daniel Zahka
72924056eb netdevsim: register a new devlink param with default value interface
Create a new devlink param, test2, that supports default param actions
via the devlink_param::get_default() and
devlink_param::reset_default() functions.

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251119025038.651131-6-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 19:01:23 -08:00
Daniel Zahka
b11d358bf8 net/mlx5: implement swp_l4_csum_mode via devlink params
swp_l4_csum_mode controls how L4 transmit checksums are computed when
using Software Parser (SWP) hints for header locations.

Supported values:
  1. default: device will choose between full_csum or l4_only. Driver
     will discover the device's choice during initialization.
  2. full_csum: calculate L4 checksum with the pseudo-header.
  3. l4_only: calculate L4 checksum without the pseudo-header. Only
     available when swp_l4_csum_mode_l4_only is set in
     mlx5_ifc_nv_sw_offload_cap_bits.

Note that 'default' might be returned from the device and passed to
userspace, and it might also be set during a
devlink_param::reset_default() call, but attempts to set a value of
default directly with param-set will be rejected.

The l4_only setting is a dependency for PSP initialization in
mlx5e_psp_init().

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251119025038.651131-5-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 19:01:22 -08:00
Daniel Zahka
2a367002ed devlink: support default values for param-get and param-set
Support querying and resetting to default param values.

Introduce two new devlink netlink attrs:
DEVLINK_ATTR_PARAM_VALUE_DEFAULT and
DEVLINK_ATTR_PARAM_RESET_DEFAULT. The former is used to contain an
optional parameter value inside of the param_value nested
attribute. The latter is used in param-set requests from userspace to
indicate that the driver should reset the param to its default value.

To implement this, two new functions are added to the devlink driver
api: devlink_param::get_default() and
devlink_param::reset_default(). These callbacks allow drivers to
implement default param actions for runtime and permanent cmodes. For
driverinit params, the core latches the last value set by a driver via
devl_param_driverinit_value_set(), and uses that as the default value
for a param.

Because default parameter values are optional, if the default value were
encoded using a netlink flag type it would be impossible to discern
whether a param of type bool has a default value of false or no default
value at all. For this reason, when a DEVLINK_PARAM_TYPE_BOOL has an
associated default value, the default value is encoded using a u8
type.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251119025038.651131-4-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 19:01:22 -08:00
Daniel Zahka
17a42aa465 devlink: refactor devlink_nl_param_value_fill_one()
Lift the param type demux and value attr placement into a separate
function. This new function, devlink_nl_param_put(), can be used to
place additional types values in the value array, e.g., default,
current, next values. This commit has no functional change.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251119025038.651131-3-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 19:01:22 -08:00
Daniel Zahka
011d133bb9 devlink: pass extack through to devlink_param::get()
Allow devlink_param::get() handlers to report error messages via
extack. This function is called in a few different contexts, but not
all of them will have a valid extack to use.

When devlink_param::get() is called from param_get_doit or
param_get_dumpit contexts, pass the extack through so that drivers can
report errors when retrieving param values. When devlink_param::get() is
called from the context of devlink_param_notify(), pass NULL in for
the extack.

Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251119025038.651131-2-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 19:01:22 -08:00
Jakub Kicinski
b8f2b678fc Merge branch 'netconsole-allow-userdata-buffer-to-grow-dynamically'
Gustavo Luiz Duarte says:

====================
netconsole: Allow userdata buffer to grow dynamically

The current netconsole implementation allocates a static buffer for
extradata (userdata + sysdata) with a fixed size of
MAX_EXTRADATA_ENTRY_LEN * MAX_EXTRADATA_ITEMS bytes for every target,
regardless of whether userspace actually uses this feature. This forces
us to keep MAX_EXTRADATA_ITEMS small (16), which is restrictive for
users who need to attach more metadata to their log messages.

This patch series enables dynamic allocation of the userdata buffer,
allowing it to grow on-demand based on actual usage. The series:

1. Refactors send_fragmented_body() to simplify handling of separated
   userdata and sysdata (patch 1/4)
2. Splits userdata and sysdata into separate buffers (patch 2/4)
3. Implements dynamic allocation for the userdata buffer (patch 3/4)
4. Increases MAX_USERDATA_ITEMS from 16 to 256 now that we can do so
   without memory waste (patch 4/4)

Benefits:
- No memory waste when userdata is not used
- Targets that use userdata only consume what they need
- Users can attach significantly more metadata without impacting systems
  that don't use this feature
====================

Link: https://patch.msgid.link/20251119-netconsole_dynamic_extradata-v3-0-497ac3191707@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:47:21 -08:00
Gustavo Luiz Duarte
5ad9945341 netconsole: Increase MAX_USERDATA_ITEMS
Increase MAX_USERDATA_ITEMS from 16 to 256 entries now that the userdata
buffer is allocated dynamically.

The previous limit of 16 was necessary because the buffer was statically
allocated for all targets. With dynamic allocation, we can support more
entries without wasting memory on targets that don't use userdata.

This allows users to attach more metadata to their netconsole messages,
which is useful for complex debugging and logging scenarios.

Also update the testcase accordingly.

Signed-off-by: Gustavo Luiz Duarte <gustavold@gmail.com>
Reviewed-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251119-netconsole_dynamic_extradata-v3-4-497ac3191707@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:47:18 -08:00
Gustavo Luiz Duarte
eb83801af2 netconsole: Dynamic allocation of userdata buffer
The userdata buffer in struct netconsole_target is currently statically
allocated with a size of MAX_USERDATA_ITEMS * MAX_EXTRADATA_ENTRY_LEN
(16 * 256 = 4096 bytes). This wastes memory when userdata entries are
not used or when only a few entries are configured, which is common in
typical usage scenarios. It also forces us to keep MAX_USERDATA_ITEMS
small to limit the memory wasted.

Change the userdata buffer from a static array to a dynamically
allocated pointer. The buffer is now allocated on-demand in
update_userdata() whenever userdata entries are added, modified, or
removed via configfs. The implementation calculates the exact size
needed for all current userdata entries, allocates a new buffer of that
size, formats the entries into it, and atomically swaps it with the old
buffer.
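
Roughly, the update path follows this allocate-format-swap shape (the field
and helper names here are made up for illustration, not netconsole's actual
ones):

	new_buf = kzalloc(needed, GFP_KERNEL);
	if (!new_buf)
		return -ENOMEM;

	len = format_userdata_entries(nt, new_buf, needed); /* hypothetical */

	old_buf = nt->userdata;		/* swap under the target's lock */
	nt->userdata = new_buf;
	nt->userdata_length = len;
	kfree(old_buf);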

This approach provides several benefits:
- Memory efficiency: Targets with no userdata use zero bytes instead of
  4KB, and targets with userdata only allocate what they need;
- Scalability: Makes it practical to increase MAX_USERDATA_ITEMS to a
  much larger value without imposing a fixed memory cost on every
  target;
- No hot-path overhead: Allocation occurs during configuration (write to
  configfs), not during message transmission

If memory allocation fails during userdata update, -ENOMEM is returned
to userspace through the configfs attribute write operation.

The sysdata buffer remains statically allocated since it has a smaller
fixed size (MAX_SYSDATA_ITEMS * MAX_EXTRADATA_ENTRY_LEN = 4 * 256 = 1024
bytes) and its content length is less predictable.

Signed-off-by: Gustavo Luiz Duarte <gustavold@gmail.com>
Reviewed-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251119-netconsole_dynamic_extradata-v3-3-497ac3191707@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:47:18 -08:00
Gustavo Luiz Duarte
9dc10f50c4 netconsole: Split userdata and sysdata
Separate userdata and sysdata into distinct buffers to enable independent
management. Previously, both were stored in a single extradata_complete
buffer with a fixed size that accommodated both types of data.

This separation allows:
- userdata to grow dynamically (in subsequent patch)
- sysdata to remain in a small static buffer
- removal of complex entry counting logic that tracked both types together

The split also simplifies the code by eliminating the need to check the total
entry count across both userdata and sysdata when enabling features,
which allows dropping the su_mutex hold in sysdata_*_enabled_store().

No functional change in this patch, just structural preparation for
dynamic userdata allocation.

Signed-off-by: Gustavo Luiz Duarte <gustavold@gmail.com>
Reviewed-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251119-netconsole_dynamic_extradata-v3-2-497ac3191707@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:47:18 -08:00
Gustavo Luiz Duarte
7279b718b4 netconsole: Simplify send_fragmented_body()
Refactor send_fragmented_body() to use separate offset tracking for
msgbody and extradata instead of complex conditional logic.
The previous implementation used boolean flags and calculated offsets
which made the code harder to follow.

The new implementation maintains independent offset counters
(msgbody_offset, extradata_offset) and processes each section
sequentially, making the data flow more straightforward and the code
easier to maintain.

This is a preparatory refactoring with no functional changes, which will
allow easily splitting extradata_complete into separate userdata and
sysdata buffers in the next patch.

Signed-off-by: Gustavo Luiz Duarte <gustavold@gmail.com>
Reviewed-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251119-netconsole_dynamic_extradata-v3-1-497ac3191707@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:47:18 -08:00
Byungchul Park
920fa394dc eth: fbnic: access @pp through netmem_desc instead of page
To eliminate the use of struct page in page pool, the page pool users
should use netmem descriptor and APIs instead.

Make fbnic access @pp through netmem_desc instead of page.

Signed-off-by: Byungchul Park <byungchul@sk.com>
Link: https://patch.msgid.link/20251120011118.73253-1-byungchul@sk.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:45:27 -08:00
Jakub Kicinski
7043aa16f3 Merge branch 'net-fec-do-some-cleanup-for-the-driver'
Wei Fang says:

====================
net: fec: do some cleanup for the driver

This patch set removes some unnecessary or invalid code from the FEC
driver. See each patch for details.
====================

Link: https://patch.msgid.link/20251119025148.2817602-1-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:40:15 -08:00
Wei Fang
bd31490718 net: fec: remove duplicate macros of the BD status
There are two sets of macros used to define the status bits of TX and RX
BDs: the BD_SC_xx macros and the BD_ENET_xx macros.
Of the BD_SC_xx macros, only BD_SC_WRAP is used in the driver, while the
BD_ENET_xx macros are more widely used in the driver and define
more bits of the BD status. Therefore, remove the BD_SC_xx macros.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://patch.msgid.link/20251119025148.2817602-6-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:40:07 -08:00
Wei Fang
3bb06c8a46 net: fec: remove rx_align from fec_enet_private
The rx_align was introduced by commit 41ef84ce4c ("net: fec: change
FEC alignment according to i.mx6 sx requirement"), because the i.MX6 SX
requires the RX buffer to be 64-byte aligned.

Since the commit 95698ff617 ("net: fec: using page pool to manage RX
buffers"), the address of the RX buffer is always the page address plus
FEC_ENET_XDP_HEADROOM which is 256 bytes, so the RX buffer is always
64-byte aligned. Therefore, rx_align has no effect since that commit,
and we can safely remove it.

In addition, to prevent future modifications to FEC_ENET_XDP_HEADROOM,
a BUILD_BUG_ON() test has been added to the driver, which ensures that
FEC_ENET_XDP_HEADROOM provides the required alignment.
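
A hedged sketch of such a compile-time check; the exact expression in
the driver may differ, but the idea is to fail the build if the headroom
ever stops providing 64-byte alignment:

  BUILD_BUG_ON(FEC_ENET_XDP_HEADROOM % 64 != 0);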

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Link: https://patch.msgid.link/20251119025148.2817602-5-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:40:07 -08:00
Wei Fang
63083d597a net: fec: remove struct fec_enet_priv_txrx_info
The struct fec_enet_priv_txrx_info has three members: offset, page and
skb. The offset is only initialized in the driver and never used, and the
skb is never initialized or used in the driver. Neither will be used in
the future. Therefore, replace struct fec_enet_priv_txrx_info directly
with struct page.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://patch.msgid.link/20251119025148.2817602-4-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:40:07 -08:00
Wei Fang
eef7b786bd net: fec: simplify the conditional preprocessor directives
From the Kconfig file, we can see CONFIG_FEC depends on the following
platform-related options.

ColdFire: M523x, M527x, M5272, M528x, M520x and M532x
S32: ARCH_S32 (ARM64)
i.MX: SOC_IMX28 and ARCH_MXC (ARM and ARM64)

Based on the code of the FEC driver, only some macro definitions on the
M5272 platform are different from those on other platforms. Therefore,
we can simplify the following complex preprocessor directives to
"if !defined(CONFIG_M5272)".

"#if defined(CONFIG_M523x) || defined(CONFIG_M527x) || \
     defined(CONFIG_M528x) || defined(CONFIG_M520x) || \
     defined(CONFIG_M532x) || defined(CONFIG_ARM) || \
     defined(CONFIG_ARM64)"

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Link: https://patch.msgid.link/20251119025148.2817602-3-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:40:07 -08:00
Wei Fang
3eea593b55 net: fec: remove useless conditional preprocessor directives
The conditional preprocessor directive was added to fix build errors on
the MCF5272 platform, see commit d13919301d ("net: fec: Fix build for
MCF5272"). The compilation errors were originally caused by some register
macros not being defined on that platform.

The driver now uses quirks to dynamically handle platform differences,
and for MCF5272 the quirks value is 0, so it supports neither RACC nor
GBIT Ethernet. These preprocessor directives are therefore no longer
required and can be safely removed without causing build or functional
issues.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Link: https://patch.msgid.link/20251119025148.2817602-2-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:40:06 -08:00
Jakub Kicinski
a7687b292e Merge branch 'net-add-1600gbps-1-6t-link-mode-support'
Tariq Toukan says:

====================
net: Add 1600Gbps (1.6T) link mode support

This series by Yael adds 1600Gbps (1.6T) link mode support.
See detailed description by Yael below.
====================

Link: https://patch.msgid.link/1763585297-1243980-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:21:33 -08:00
Yael Chemla
5fb9a0b89e bonding: 3ad: Add support for 1600G speed
Add support for 1600Gbps speed to allow using 3ad mode with 1600G
devices.

Signed-off-by: Yael Chemla <ychemla@nvidia.com>
Reviewed-by: Shahar Shitrit <shshitrit@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763585297-1243980-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:21:30 -08:00
Yael Chemla
be3a435df7 net/mlx5e: Add 1600Gbps link modes
Introduce support for a 1600Gbps link mode, utilizing 8 lanes at 200Gbps
per lane.

Signed-off-by: Yael Chemla <ychemla@nvidia.com>
Reviewed-by: Shahar Shitrit <shshitrit@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763585297-1243980-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:21:29 -08:00
Yael Chemla
491c5dc98b net: ethtool: Add support for 1600Gbps speed
Add support for 1600Gbps link modes based on 200Gbps per lane [1].
This includes the adopted IEEE 802.3dj copper and optical PMDs that use
200G/lane signaling [2].

Add the following PMD types:
- KR8 (backplane)
- CR8 (copper cable)
- DR8 (SMF 500m)
- DR8-2 (SMF 2km)

These modes are defined in the 802.3dj specifications.
References:
[1] https://www.ieee802.org/3/dj/public/23_03/opsasnick_3dj_01a_2303.pdf
[2] https://www.ieee802.org/3/dj/projdoc/objectives_P802d3dj_240314.pdf

Signed-off-by: Yael Chemla <ychemla@nvidia.com>
Reviewed-by: Shahar Shitrit <shshitrit@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/1763585297-1243980-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:21:29 -08:00
Zahari Doychev
8b4e023d79 ynl: samples: add tc filter example
Add a sample tool demonstrating how to add, dump, and delete a
flower filter with two VLAN push actions. The example can be
invoked as:

  # samples/tc-filter-add p2

    flower pref 1 proto: 0x8100
    flower:
      vlan_id: 100
      vlan_prio: 5
      num_of_vlans: 3
    action order: 1 vlan push id 200 protocol 0x8100 priority 0
    action order: 2 vlan push id 300 protocol 0x8100 priority 0

This verifies correct handling of tc action attributes for multiple
VLAN push actions. The tc action indexed arrays start from index 1,
and the index defines the action order. This behavior differs from
the YNL specification, which expects arrays to be zero-based. To
accommodate this, the example adds a dummy action at index 0, which
is ignored by the kernel.

Signed-off-by: Zahari Doychev <zahari.doychev@linux.com>
Link: https://patch.msgid.link/20251119203618.263780-2-zahari.doychev@linux.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:20:40 -08:00
Jakub Kicinski
b64ea1c5f4 Merge branch 'selftests-drv-net-convert-gro-and-toeplitz-tests-to-work-for-drivers-in-nipa'
Jakub Kicinski says:

====================
selftests: drv-net: convert GRO and Toeplitz tests to work for drivers in NIPA

The main objective of this series is to convert the gro.sh and toeplitz.sh
tests to be "NIPA-compatible" - meaning make use of the Python env,
which lets us run the tests against either netdevsim or a real device.

The tests seem to have been written with a different flow in mind.
Namely they source different bash "setup" scripts depending on arguments
passed to the test. While I have nothing against the use of bash and
the overall architecture - the existing code needs quite a bit of work
(don't assume MAC/IP addresses, support remote endpoint over SSH).
If I'm the one fixing it, I'd rather convert them to our "simplistic"
Python.

This series rewrites the tests in Python while addressing their
shortcomings. The functionality of running the test over loopback
on a real device is retained but with a different method of invocation
(see the last patch).

Once again we are dealing with a script which runs over a variety of
protocols (combination of [ipv4, ipv6, ipip] x [tcp, udp]). The first
4 patches add support for test variants to our scripts. We use the
term "variant" in the same sense as the C kselftest_harness.h -
variant is just a set of static input arguments.

Note that neither GRO nor the Toeplitz test fully passes for me on
any HW I have access to. But this is unrelated to the conversion.
This series is not making any real functional changes to the tests,
it is limited to improving the "test harness" scripts.
====================

Link: https://patch.msgid.link/20251120021024.2944527-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:33 -08:00
Jakub Kicinski
bd28e5bddc selftests: net: remove old setup_* scripts
gro.sh and toeplitz.sh used to source in one of two setup scripts
depending on whether the test was expected to be run against
veth or a real device. veth testing is replaced by netdevsim
and existing "remote endpoint" support in our Python tests.
Add a script which sets up loopback mode.

The usage is a little bit more complicated than running
the scripts used to be. Testing used to work like this:

  ./../gro.sh -i eth0 ...

now the "setup script" has to be run explicitly:

  NETIF=eth0 ./../ksft_setup_loopback.sh ./../gro.sh

But the functionality itself is retained.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251120021024.2944527-13-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:31 -08:00
Jakub Kicinski
358008f41d netdevsim: add loopback support
Support device loopback. Apparently this mode has been historically
supported by the toeplitz test and I don't have any HW which lets
me test the conversion.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251120021024.2944527-12-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:30 -08:00
Jakub Kicinski
9cf9aa77a1 selftests: drv-net: hw: convert the Toeplitz test to Python
Rewrite the existing toeplitz.sh test in Python. The conversion
is a lot less exact than the GRO one. We use Netlink APIs to
get the device RSS and IRQ information. We expect that the device
has neither RPS nor RFS configured, and set RPS up as part of
the test.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251120021024.2944527-11-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:30 -08:00
Jakub Kicinski
fdb0267d56 selftests: drv-net: add a Python version of the GRO test
Rewrite the existing gro.sh test in Python. The conversion is
not exact; the changes are related to integrating the test
with our "remote endpoint" paradigm. The test now reads
the IP addresses from the user config. It resolves the MAC
address (including running over Layer 3 networks).

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251120021024.2944527-10-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:30 -08:00
Jakub Kicinski
40dd789bc5 netdevsim: pass packets thru GRO on Rx
To replace veth in software GRO testing with netdevsim we need
GRO support in netdevsim. Luckily we already have NAPI support
so this change is trivial (compared to veth).

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251120021024.2944527-9-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:30 -08:00
Jakub Kicinski
15011a57d0 selftests: net: py: read ip link info about remote dev
We're already saving the info about the local dev in env.dev
for the tests; save the remote dev as well. This is more symmetric,
as env generally provides the same info for the local and remote ends.

While at it make sure that we reliably get the detailed info
about the local dev. nsim used to read the dev info without -d.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251120021024.2944527-8-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:29 -08:00
Jakub Kicinski
e02b52ecef selftests: net: py: support ksft ready without wait
There's a common synchronization problem when a script (Python test)
uses a C program to set up some state (usually start a receiving
process for traffic). The script needs to know when the process
has fully initialized. The inverse of the problem exists for shutting
the process down - we need a reliable way to tell the process to exit.

We added helpers to do this safely in
commit 7147713799 ("selftests: drv-net: add a way to wait for a local process");
unfortunately the two operations (wait for init, and shutdown) are
controlled by a single parameter (ksft_wait). Add support for using
ksft_ready without using the second fd for exit.

This is useful for programs which wait to receive a specific number of
packets, so exit_wait is a good match, but we still need to wait for init.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251120021024.2944527-7-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:29 -08:00
Jakub Kicinski
89268f7dbc selftests: net: relocate gro and toeplitz tests to drivers/net
The GRO test can run on a real device or a veth.
The Toeplitz hash test can only run on a real device.
Move them from net/ to drivers/net/ and drivers/net/hw/ respectively.

There are two scripts which set up the environment for these tests:
setup_loopback.sh and setup_veth.sh. Move those scripts to net/lib.
The paths to the setup files are a little ugly but they will be
deleted shortly.

toeplitz_client.sh is not a test in itself, but rather a helper
to send traffic, so add it to TEST_FILES rather than TEST_PROGS.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251120021024.2944527-6-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:29 -08:00
Jakub Kicinski
173227d7d6 selftests: drv-net: xdp: use variants for qstat tests
Use just-added ksft variants for XDP qstat tests.

While at it, correct the number of packets; we're sending
1000 packets now.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251120021024.2944527-5-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:29 -08:00
Jakub Kicinski
6ae67f1159 selftests: net: py: add test variants
There are a lot of cases where we try to re-run the same code with
different parameters. We currently need to either use a generator
method or create a "main" case implementation which then gets called
by trivial case functions:

  def _test(x, y, z):
     ...

  def case_int():
     _test(1, 2, 3)

  def case_str():
     _test('a', 'b', 'c')

Add support for variants, similar to kselftests_harness.h and
a lot of other frameworks. Variants can be added as decorator
to test functions:

  @ksft_variants([(1, 2, 3), ('a', 'b', 'c')])
  def case(x, y, z):
     ...

ksft_run() will auto-generate case names:
  case.1_2_3
  case.a_b_c

Because the names may not always be pretty (and to avoid forcing
classes to implement case-friendly __str__()) add a wrapper class
KsftNamedVariant which lets the user specify the name for the variant.

Note that ksft_run's args are still supported. ksft_run splices args
and variant params together.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20251120021024.2944527-4-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:29 -08:00
Jakub Kicinski
80970e0fc0 selftests: net: py: extract the case generation logic
In preparation for adding test variants, move the test case
collection logic to a dedicated function. The new helper returns

 (function, args, name)

tuples. The main test loop can simply run them; not much
logic or discernment is needed.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20251120021024.2944527-3-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:29 -08:00
Jakub Kicinski
5cb7b71b76 selftests: net: py: coding style improvements
We're about to add more features here and finding new issues with old
ones in place is hard. Address ruff checks:
 - bare exceptions
 - f-string with no params
 - unused import

We need to use BaseException when handling defer(), as Petr points out.
This retains the old behavior of ignoring SIGTERM while running cleanups.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20251120021024.2944527-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:19:28 -08:00
Heiner Kallweit
d99b408ed8 net: phy: fixed_phy: remove not needed initialization of phy_device members
All these members are populated by the phylib state machine once the
PHY has been started, based on the fixed autoneg results.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/bc666a53-5469-4e9c-85a1-dd285aadfe4f@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:18:54 -08:00
Heiner Kallweit
bd048f8ce6 net: phy: fixed_phy: fix missing initialization of fixed phy link
The original change removed the link initialization from the passed struct
fixed_phy_status, but @status is also passed to __fixed_phy_add(),
where it is saved. Make sure that copy also has link set to 1.
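
Roughly the shape of the idea (illustrative only, not the exact hunk):
the status copy that __fixed_phy_add() saves must also report the link
as up.

  struct fixed_phy_status status = {
          .link = 1,      /* keep the saved copy consistent with link-up */
          /* speed/duplex stay as provided by the caller */
  };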

Fixes: 9f07af1d27 ("net: phy: fixed_phy: initialize the link status as up")
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/dab6c10e-725e-4648-9662-39cc821723d0@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:16:59 -08:00
Jakub Kicinski
58673d10d5 Merge branch 'net-phy-adin1100-fix-powerdown-mode-setting'
Alexander Dahl says:

====================
net: phy: adin1100: Fix powerdown mode setting

While building a new device around the ADIN1100, I noticed some errors in
the kernel log when calling `ifdown` on the ethernet device. The series has
a straightforward fix and an obvious follow-up code simplification.
====================

Link: https://patch.msgid.link/20251119124737.280939-1-ada@thorsis.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:04:01 -08:00
Alexander Dahl
5894cab4e1 net: phy: adin1100: Simplify register value passing
The additional use case for that variable is gone;
the expression is simple enough to be passed inline now.

Signed-off-by: Alexander Dahl <ada@thorsis.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Acked-by: Nuno Sá <nuno.sa@analog.com>
Link: https://patch.msgid.link/20251119124737.280939-3-ada@thorsis.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:03:59 -08:00
Alexander Dahl
bccaf1fe08 net: phy: adin1100: Fix software power-down ready condition
The value CRSM_SFT_PD written to the Software Power-Down Control Register
(CRSM_SFT_PD_CNTRL) is 0x01 and is therefore different from the value
CRSM_SFT_PD_RDY (0x02) read from the System Status Register (CRSM_STAT) to
confirm that powerdown has been reached.

The condition could have only worked when disabling powerdown
(both 0x00), but never when enabling it (0x01 != 0x02).

The result is a timeout, like so:

    $ ifdown eth0
    macb f802c000.ethernet eth0: Link is Down
    ADIN1100 f802c000.ethernet-ffffffff:01: adin_set_powerdown_mode failed: -110
    ADIN1100 f802c000.ethernet-ffffffff:01: adin_set_powerdown_mode failed: -110
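
A hedged sketch of the corrected ready check; ADIN_CRSM_STAT and
ADIN_CRSM_SFT_PD_RDY are assumed names based on the text above, and the
real driver helpers may differ. The point is to wait for the requested
state rather than compare against the value that was written:

  static int example_wait_sft_pd(struct phy_device *phydev, bool enter_pd)
  {
          int val;

          return phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1,
                                           ADIN_CRSM_STAT, val,
                                           enter_pd == !!(val & ADIN_CRSM_SFT_PD_RDY),
                                           1000, 30000, true);
  }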

Fixes: 7eaf913299 ("net: phy: adin1100: Add initial support for ADIN1100 industrial PHY")
Signed-off-by: Alexander Dahl <ada@thorsis.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Acked-by: Nuno Sá <nuno.sa@analog.com>
Link: https://patch.msgid.link/20251119124737.280939-2-ada@thorsis.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 18:03:59 -08:00
Jakub Kicinski
22eaa206fc Merge branch 'net-stmmac-simplify-axi_blen-handling'
Russell King says:

====================
net: stmmac: simplify axi_blen handling

stmmac's axi_blen (burst length) handling is very verbose and
unnecessary.

Firstly, the burst length register bitfield is the same across all
dwmac cores, so we can use common definitions for these bits which
platform glue can use.

We end up with platform glue:
- filling in the axi_blen[] array with the decimal burst lengths, e.g.
  dwmac-intel.c, etc
- decoding a bitmap into burst lengths for this array, e.g.
  dwmac-dwc-qos-eth.c

Other cases read the array from DT, placing it into the axi_blen
array, and converting later to the register bitfield.

This series removes all this complexity, ultimately ending up with
platform glue providing the register value containing the burst
length bitfield directly. Where necessary, platform glue calls
stmmac_axi_blen_to_mask() to convert a decimal array (e.g. from
DT) to the register value.

This also means that stmmac_axi_blen_to_mask() can issue a
diagnostic message at probe time if the burst length is incorrect.
====================

Link: https://patch.msgid.link/aR2aaDs6rqfu32B-@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:57:42 -08:00
Russell King (Oracle)
efd3c8cc52 net: stmmac: remove axi_blen array
Remove the axi_blen array from struct stmmac_axi as we set this array,
and then immediately convert it ot the register value, never looking at
the array again. Thus, the array can be function local rather than part
of a run-time allocated long-lived struct.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vLfLg-0000000FMbD-1vmh@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:57:40 -08:00
Russell King (Oracle)
e676cc8561 net: stmmac: move stmmac_axi_blen_to_mask() to axi_blen init sites
Move stmmac_axi_blen_to_mask() to the axi->axi_blen array init sites
to prepare for the removal of axi_blen. For sites which initialise
axi->axi_blen with constant data, initialise axi->axi_blen_regval
using the DMA_AXI_BLENx constants.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vLfLb-0000000FMb7-1SgG@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:57:40 -08:00
Russell King (Oracle)
6ff3310ca2 net: stmmac: move stmmac_axi_blen_to_mask() to stmmac_main.c
Move the call to stmmac_axi_blen_to_mask() out of the individual
MAC version drivers into the main code in stmmac_init_dma_engine(),
passing the resulting value through a new member, axi_blen_regval,
in the struct stmmac_axi structure.

There is now no need for stmmac_axi_blen_to_dma_mask() to use
u32p_replace_bits(), so use FIELD_PREP() instead.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vLfLW-0000000FMb1-0zKV@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:57:40 -08:00
Russell King (Oracle)
2704af20c8 net: stmmac: provide common stmmac_axi_blen_to_mask()
Provide a common stmmac_axi_blen_to_mask() function to translate the
burst length array to the value for the AXI bus mode register, and use
it for dwmac, dwmac4 and dwxgmac2. Remove the now unnecessary
XGMAC_BLEN* definitions.

Note that stmmac_axi_blen_to_dma_mask() is coded to be more efficient
than the original three implementations, and verifies the contents of
the burst length array.
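
For illustration only (this is not the driver's implementation): folding
a burst-length array into a single register bitfield could look roughly
like the sketch below, assuming each supported burst length 4..256 maps
to one register bit.

  static u32 example_axi_blen_to_mask(const u32 *blen, int len)
  {
          u32 mask = 0;
          int i;

          for (i = 0; i < len; i++) {
                  switch (blen[i]) {
                  case 0:
                          continue;       /* unused slot */
                  case 4: case 8: case 16: case 32:
                  case 64: case 128: case 256:
                          /* assumption: bit N encodes burst length 2^(N+1) */
                          mask |= BIT(ilog2(blen[i]) - 1);
                          break;
                  default:
                          pr_warn("invalid AXI burst length %u\n", blen[i]);
                          break;
                  }
          }

          return mask;
  }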

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vLfLR-0000000FMav-0VL6@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:57:40 -08:00
Russell King (Oracle)
8c696659f4 net: stmmac: move common DMA AXI register bits to common.h
Move the common DMA AXI register bits to common.h so they can be shared
and we can provide a common function to convert the axi->dma_blen[]
array to the format needed for this register.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vLfLL-0000000FMap-49gf@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:57:40 -08:00
Russell King (Oracle)
f7ac9a0bbe net: stmmac: dwc-qos-eth: simplify switch() in dwc_eth_dwmac_config_dt()
Simplify the switch() statement in dwc_eth_dwmac_config_dt().
Although this is not speed-critical, simplifying it can make it more
readable. This also drastically improves the code emitted by the
compiler.

On aarch64, with the original code, the compiler loads registers with
every possible value, and then has a tree of test-and-branch statements
to work out which register to store. With the simplified code, the
compiler can load a register with '4' and shift it appropriately.

This shrinks the text size on aarch64 from 4289 bytes to 4153 bytes,
a reduction of 3%.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vLfLG-0000000FMai-3fKz@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:57:39 -08:00
Russell King (Oracle)
f15bcd0719 net: stmmac: rk: use phylink's interface mode for set_clk_tx_rate()
rk_set_clk_tx_rate() is passed the interface mode from phylink which
will be the same as bsp_priv->phy_iface. Use the passed-in interface
mode rather than bsp_priv->phy_iface.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vLgNA-0000000FMjN-0DSS@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:55:37 -08:00
Jakub Kicinski
fdc38d34b3 Merge branch 'net-stmmac-pass-struct-device-to-init-exit'
Russell King says:

====================
net: stmmac: pass struct device to init/exit

Rather than passing the platform device to the ->init() and ->exit()
methods, make these methods useful for other devices by passing the
struct device instead. Update the implementations appropriately for
this change.

Move the calls for these methods into the core driver's probe and
remove methods from the stmmac_platform layer.

Convert dwmac-rk to use ->init() and ->exit().
====================

Link: https://patch.msgid.link/aR2V0Kib7j0L4FNN@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:54:10 -08:00
Russell King (Oracle)
1a62894e04 net: stmmac: rk: convert to init()/exit() methods
Convert rk to use the init() and exit() methods for powering up and
down the device. This allows us to use the pltfr versions of probe()
and remove().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vLf2e-0000000FMNN-1Xnh@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:54:08 -08:00
Russell King (Oracle)
32da89a840 net: stmmac: move probe/remove calling of init/exit
Move the probe/remove time calling of the init()/exit() methods in
the platform data to the main driver probe/remove functions. This
allows them to be used by non-platform_device based drivers.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vLf2Z-0000000FMNH-0xPV@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:54:07 -08:00
Russell King (Oracle)
85081acc6b net: stmmac: pass struct device to init()/exit() methods
As struct plat_stmmacenet_data is not platform_device specific, pass
a struct device into the init() and exit() methods to allow them to
become independent of the underlying device.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Acked-by: Chen-Yu Tsai <wens@kernel.org>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vLf2U-0000000FMN2-0SLg@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:54:07 -08:00
Jakub Kicinski
4707191ca9 Merge branch 'tcp-tcp_rcvbuf_grow-changes'
Eric Dumazet says:

====================
tcp: tcp_rcvbuf_grow() changes

The first patch is minor and moves tcp_moderate_rcvbuf into the appropriate group.

The second patch is another attempt to keep sk->sk_rcvbuf small for DC
(small RTT) TCP flows for optimal performance.
====================

Link: https://patch.msgid.link/20251119084813.3684576-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:44:27 -08:00
Eric Dumazet
ecfea98b7d tcp: add net.ipv4.tcp_rcvbuf_low_rtt
This is a follow-up of commit aa251c8463 ("tcp: fix too slow
tcp_rcvbuf_grow() action"), which brought back the issue that I tried
to fix in commit 65c5287892 ("tcp: fix sk_rcvbuf overshoot").

We also recently increased tcp_rmem[2] to 32 MB in commit 572be9bf9d
("tcp: increase tcp_rmem[2] to 32 MB")

The idea of this patch is to not let tcp_rcvbuf_grow() grow sk->sk_rcvbuf
too fast for small-RTT flows. If sk->sk_rcvbuf is too big, this can
prevent the NIC driver from recycling pages from its page pool, and can
also cause cache evictions for DDIO-enabled CPUs/NICs, as receivers
are usually slower than senders.

Add a net.ipv4.tcp_rcvbuf_low_rtt sysctl, set by default to 1000 usec (1 ms).

If the RTT is smaller than the sysctl value, use the RTT/tcp_rcvbuf_low_rtt
ratio to control sk_rcvbuf inflation.
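
A hedged sketch of the scaling idea (not the exact tcp_rcvbuf_grow()
code): below the threshold, the proposed rcvbuf growth is scaled down in
proportion to the measured RTT.

  static u64 example_scale_rcvbuf_growth(u64 growth, u32 rtt_us, u32 low_rtt_us)
  {
          /* small-RTT flows get proportionally less rcvbuf inflation */
          if (low_rtt_us && rtt_us < low_rtt_us)
                  growth = div_u64(growth * rtt_us, low_rtt_us);

          return growth;
  }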

Tested:

Pair of hosts with a 200Gbit IDPF NIC. Using netperf/netserver

Client initiates 8 TCP bulk flows, asking netserver to use CPU #10 only.

super_netperf 8 -H server -T,10 -l 30

On server, use perf -e tcp:tcp_rcvbuf_grow while test is running.

Before:

sysctl -w net.ipv4.tcp_rcvbuf_low_rtt=1
perf record -a -e tcp:tcp_rcvbuf_grow sleep 30 ; perf script|tail -20|cut -c30-230
 1153.051201: tcp:tcp_rcvbuf_grow: time=398 rtt_us=382 copied=6905856 inq=180224 space=6115328 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25600000 famil
 1153.138752: tcp:tcp_rcvbuf_grow: time=446 rtt_us=413 copied=5529600 inq=180224 space=4505600 ooo=0 scaling_ratio=240 rcvbuf=23068672 rcv_ssthresh=21571860 window_clamp=21626880 rcv_wnd=21286912 famil
 1153.361484: tcp:tcp_rcvbuf_grow: time=415 rtt_us=380 copied=7061504 inq=204800 space=6725632 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25600000 famil
 1153.457642: tcp:tcp_rcvbuf_grow: time=483 rtt_us=421 copied=5885952 inq=720896 space=4407296 ooo=0 scaling_ratio=240 rcvbuf=23763511 rcv_ssthresh=22223271 window_clamp=22278291 rcv_wnd=21430272 famil
 1153.466002: tcp:tcp_rcvbuf_grow: time=308 rtt_us=281 copied=3244032 inq=180224 space=2883584 ooo=0 scaling_ratio=240 rcvbuf=44854314 rcv_ssthresh=41992059 window_clamp=42050919 rcv_wnd=41713664 famil
 1153.747792: tcp:tcp_rcvbuf_grow: time=394 rtt_us=332 copied=4460544 inq=585728 space=3063808 ooo=0 scaling_ratio=240 rcvbuf=44854314 rcv_ssthresh=41992059 window_clamp=42050919 rcv_wnd=41373696 famil
 1154.260747: tcp:tcp_rcvbuf_grow: time=652 rtt_us=226 copied=10977280 inq=737280 space=9486336 ooo=0 scaling_ratio=240 rcvbuf=31165538 rcv_ssthresh=29197743 window_clamp=29217691 rcv_wnd=28368896 fami
 1154.375019: tcp:tcp_rcvbuf_grow: time=461 rtt_us=443 copied=7573504 inq=507904 space=6856704 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25288704 famil
 1154.463072: tcp:tcp_rcvbuf_grow: time=494 rtt_us=408 copied=7983104 inq=200704 space=7065600 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25579520 famil
 1154.474658: tcp:tcp_rcvbuf_grow: time=507 rtt_us=459 copied=5586944 inq=540672 space=4718592 ooo=0 scaling_ratio=240 rcvbuf=17852266 rcv_ssthresh=16692999 window_clamp=16736499 rcv_wnd=16056320 famil
 1154.584657: tcp:tcp_rcvbuf_grow: time=494 rtt_us=427 copied=8126464 inq=204800 space=7782400 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25600000 famil
 1154.702117: tcp:tcp_rcvbuf_grow: time=480 rtt_us=406 copied=5734400 inq=180224 space=5349376 ooo=0 scaling_ratio=240 rcvbuf=23068672 rcv_ssthresh=21571860 window_clamp=21626880 rcv_wnd=21286912 famil
 1155.941595: tcp:tcp_rcvbuf_grow: time=717 rtt_us=670 copied=11042816 inq=3784704 space=7159808 ooo=0 scaling_ratio=240 rcvbuf=19581357 rcv_ssthresh=18333222 window_clamp=18357522 rcv_wnd=14614528 fam
 1156.384735: tcp:tcp_rcvbuf_grow: time=529 rtt_us=473 copied=9011200 inq=180224 space=7258112 ooo=0 scaling_ratio=240 rcvbuf=19581357 rcv_ssthresh=18333222 window_clamp=18357522 rcv_wnd=18018304 famil
 1157.821676: tcp:tcp_rcvbuf_grow: time=529 rtt_us=272 copied=8224768 inq=602112 space=6545408 ooo=0 scaling_ratio=240 rcvbuf=67000000 rcv_ssthresh=62793576 window_clamp=62812500 rcv_wnd=62115840 famil
 1158.906379: tcp:tcp_rcvbuf_grow: time=710 rtt_us=445 copied=11845632 inq=540672 space=10240000 ooo=0 scaling_ratio=240 rcvbuf=31165538 rcv_ssthresh=29205935 window_clamp=29217691 rcv_wnd=28536832 fam
 1164.600160: tcp:tcp_rcvbuf_grow: time=841 rtt_us=430 copied=12976128 inq=1290240 space=11304960 ooo=0 scaling_ratio=240 rcvbuf=31165538 rcv_ssthresh=29212591 window_clamp=29217691 rcv_wnd=27856896 fa
 1165.163572: tcp:tcp_rcvbuf_grow: time=845 rtt_us=800 copied=12632064 inq=540672 space=7921664 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25912795 window_clamp=25937095 rcv_wnd=25260032 fami
 1165.653464: tcp:tcp_rcvbuf_grow: time=388 rtt_us=309 copied=4493312 inq=180224 space=3874816 ooo=0 scaling_ratio=240 rcvbuf=44854314 rcv_ssthresh=41995899 window_clamp=42050919 rcv_wnd=41713664 famil
 1166.651211: tcp:tcp_rcvbuf_grow: time=556 rtt_us=553 copied=6328320 inq=540672 space=5554176 ooo=0 scaling_ratio=240 rcvbuf=23068672 rcv_ssthresh=21571860 window_clamp=21626880 rcv_wnd=20946944 famil

After:

sysctl -w net.ipv4.tcp_rcvbuf_low_rtt=1000
perf record -a -e tcp:tcp_rcvbuf_grow sleep 30 ; perf script|tail -20|cut -c30-230
 1457.053149: tcp:tcp_rcvbuf_grow: time=128 rtt_us=24 copied=1441792 inq=40960 space=1269760 ooo=0 scaling_ratio=240 rcvbuf=2960741 rcv_ssthresh=2605474 window_clamp=2775694 rcv_wnd=2568192 family=AF_I
 1458.000778: tcp:tcp_rcvbuf_grow: time=128 rtt_us=31 copied=1441792 inq=24576 space=1400832 ooo=0 scaling_ratio=240 rcvbuf=3060163 rcv_ssthresh=2810042 window_clamp=2868902 rcv_wnd=2674688 family=AF_I
 1458.088059: tcp:tcp_rcvbuf_grow: time=190 rtt_us=110 copied=3227648 inq=385024 space=2781184 ooo=0 scaling_ratio=240 rcvbuf=6728240 rcv_ssthresh=6252705 window_clamp=6307725 rcv_wnd=5799936 family=AF
 1458.148549: tcp:tcp_rcvbuf_grow: time=232 rtt_us=129 copied=3956736 inq=237568 space=2842624 ooo=0 scaling_ratio=240 rcvbuf=6731333 rcv_ssthresh=6252705 window_clamp=6310624 rcv_wnd=5918720 family=AF
 1458.466861: tcp:tcp_rcvbuf_grow: time=193 rtt_us=83 copied=2949120 inq=180224 space=2457600 ooo=0 scaling_ratio=240 rcvbuf=5751438 rcv_ssthresh=5357689 window_clamp=5391973 rcv_wnd=5054464 family=AF_
 1458.775476: tcp:tcp_rcvbuf_grow: time=257 rtt_us=127 copied=4304896 inq=352256 space=3346432 ooo=0 scaling_ratio=240 rcvbuf=8067131 rcv_ssthresh=7523275 window_clamp=7562935 rcv_wnd=7061504 family=AF
 1458.776631: tcp:tcp_rcvbuf_grow: time=200 rtt_us=96 copied=3260416 inq=143360 space=2768896 ooo=0 scaling_ratio=240 rcvbuf=6397256 rcv_ssthresh=5938567 window_clamp=5997427 rcv_wnd=5828608 family=AF_
 1459.707973: tcp:tcp_rcvbuf_grow: time=215 rtt_us=96 copied=2506752 inq=163840 space=1388544 ooo=0 scaling_ratio=240 rcvbuf=3068867 rcv_ssthresh=2768282 window_clamp=2877062 rcv_wnd=2555904 family=AF_
 1460.246494: tcp:tcp_rcvbuf_grow: time=231 rtt_us=80 copied=3756032 inq=204800 space=3117056 ooo=0 scaling_ratio=240 rcvbuf=7288091 rcv_ssthresh=6773725 window_clamp=6832585 rcv_wnd=6471680 family=AF_
 1460.714596: tcp:tcp_rcvbuf_grow: time=270 rtt_us=110 copied=4714496 inq=311296 space=3719168 ooo=0 scaling_ratio=240 rcvbuf=8957739 rcv_ssthresh=8339020 window_clamp=8397880 rcv_wnd=7933952 family=AF
 1462.029977: tcp:tcp_rcvbuf_grow: time=101 rtt_us=19 copied=1105920 inq=40960 space=1036288 ooo=0 scaling_ratio=240 rcvbuf=2338970 rcv_ssthresh=2091684 window_clamp=2192784 rcv_wnd=1986560 family=AF_I
 1462.802385: tcp:tcp_rcvbuf_grow: time=89 rtt_us=45 copied=1069056 inq=0 space=1064960 ooo=0 scaling_ratio=240 rcvbuf=2338970 rcv_ssthresh=2091684 window_clamp=2192784 rcv_wnd=2035712 family=AF_INET6
 1462.918648: tcp:tcp_rcvbuf_grow: time=105 rtt_us=33 copied=1441792 inq=180224 space=1069056 ooo=0 scaling_ratio=240 rcvbuf=2383282 rcv_ssthresh=2091684 window_clamp=2234326 rcv_wnd=1896448 family=AF_
 1463.222533: tcp:tcp_rcvbuf_grow: time=273 rtt_us=144 copied=4603904 inq=385024 space=3469312 ooo=0 scaling_ratio=240 rcvbuf=8422564 rcv_ssthresh=7891053 window_clamp=7896153 rcv_wnd=7409664 family=AF
 1466.519312: tcp:tcp_rcvbuf_grow: time=130 rtt_us=23 copied=1343488 inq=0 space=1261568 ooo=0 scaling_ratio=240 rcvbuf=2780158 rcv_ssthresh=2493778 window_clamp=2606398 rcv_wnd=2494464 family=AF_INET6
 1466.681003: tcp:tcp_rcvbuf_grow: time=128 rtt_us=21 copied=1441792 inq=12288 space=1343488 ooo=0 scaling_ratio=240 rcvbuf=2932027 rcv_ssthresh=2578555 window_clamp=2748775 rcv_wnd=2568192 family=AF_I
 1470.689959: tcp:tcp_rcvbuf_grow: time=255 rtt_us=122 copied=3932160 inq=204800 space=3551232 ooo=0 scaling_ratio=240 rcvbuf=8182038 rcv_ssthresh=7647384 window_clamp=7670660 rcv_wnd=7442432 family=AF
 1471.754154: tcp:tcp_rcvbuf_grow: time=188 rtt_us=95 copied=2138112 inq=577536 space=1429504 ooo=0 scaling_ratio=240 rcvbuf=3113650 rcv_ssthresh=2806426 window_clamp=2919046 rcv_wnd=2248704 family=AF_
 1476.813542: tcp:tcp_rcvbuf_grow: time=269 rtt_us=99 copied=3088384 inq=180224 space=2564096 ooo=0 scaling_ratio=240 rcvbuf=6219470 rcv_ssthresh=5771893 window_clamp=5830753 rcv_wnd=5509120 family=AF_
 1477.738309: tcp:tcp_rcvbuf_grow: time=166 rtt_us=54 copied=1777664 inq=180224 space=1417216 ooo=0 scaling_ratio=240 rcvbuf=3117118 rcv_ssthresh=2874958 window_clamp=2922298 rcv_wnd=2613248 family=AF_

We can see sk_rcvbuf values are much smaller, and that rtt_us (the RTT estimate
from the receiver's point of view) is kept small, instead of being bloated.

No difference in throughput.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Tested-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/20251119084813.3684576-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:44:23 -08:00
Eric Dumazet
6d5dea6824 tcp: tcp_moderate_rcvbuf is only used in rx path
sysctl_tcp_moderate_rcvbuf is only used from tcp_rcvbuf_grow().

Move it to netns_ipv4_read_rx group.

Remove various CACHELINE_ASSERT_GROUP_SIZE() from netns_ipv4_struct_check(),
as they have no real benefit but cause pain for all changes.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251119084813.3684576-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:44:23 -08:00
Jakub Kicinski
738cd803b9 Merge branch 'net-mdio-improve-reset-handling-of-mdio-devices'
Buday Csaba says:

====================
net: mdio: improve reset handling of mdio devices

This patchset refactors and slightly improves the reset handling of
`mdio_device`.

The patches were split from a larger series, discussed previously in the
links below.

The difference between v2 and v3 is that the helper function declarations
have been moved to a new header file: drivers/net/phy/mdio-private.h
See links for the previous versions, and for the now separate leak fix.
====================

Link: https://patch.msgid.link/cover.1763473655.git.buday.csaba@prolan.hu
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:41:41 -08:00
Buday Csaba
e5a440bf02 net: mdio: improve reset handling in mdio_device.c
Change fwnode_property_read_u32() in mdio_device_register_reset()
to device_property_read_u32(), which is more appropriate here.

Make mdio_device_unregister_reset() truly reverse
mdio_device_register_reset() by setting the internal fields to
their default values.
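
For illustration, a hedged sketch of why the device-level helper fits
here: it reads from the device's own fwnode (whether DT or ACPI backed),
so no explicit fwnode handle is needed. The property name below is taken
from the series text; treat the snippet as an assumption rather than the
actual patch.

  u32 assert_us = 0;

  /* reads from dev_fwnode(&mdiodev->dev) internally */
  device_property_read_u32(&mdiodev->dev, "reset-assert-us", &assert_us);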

Signed-off-by: Buday Csaba <buday.csaba@prolan.hu>
Link: https://patch.msgid.link/641df1488517ae71ba10158ec1e38424211d8651.1763473655.git.buday.csaba@prolan.hu
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:41:39 -08:00
Buday Csaba
acde7ad968 net: mdio: common handling of phy device reset properties
Unify the handling of the per device reset properties for
`mdio_device`.

Merge mdio_device_register_gpiod() and mdio_device_register_reset()
into mdio_device_register_reset(), which handles both
reset-controllers and reset-gpios.
Move reading of the reset firmware properties (reset-assert-us,
reset-deassert-us) from fwnode_mdio.c to mdio_device_register_reset(),
so all reset related initialization code is kept in one place.

Introduce mdio_device_unregister_reset() to release the associated
resources.

These changes make tracking the reset properties easier.
Add kernel-doc for mdio_device_register/unregister_reset().

Signed-off-by: Buday Csaba <buday.csaba@prolan.hu>
Link: https://patch.msgid.link/17c216efd7a47be17db104378b6aacfc8741d8b9.1763473655.git.buday.csaba@prolan.hu
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:41:39 -08:00
Buday Csaba
02aeff20e8 net: mdio: move device reset functions to mdio_device.c
The functions mdiobus_register_gpiod() and mdiobus_register_reset()
handle the mdio device reset initialization, which belong to
mdio_device.c.
Move them from mdio_bus.c to mdio_device.c, and rename them to match
the corresponding source file: mdio_device_register_gpio() and
mdio_device_register_reset().
Remove 'static' qualifiers and declare them in
drivers/net/phy/mdio-private.h (new header file).

Signed-off-by: Buday Csaba <buday.csaba@prolan.hu>
Link: https://patch.msgid.link/5f684838ee897130f21b21beb07695eea4af8988.1763473655.git.buday.csaba@prolan.hu
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 17:41:39 -08:00
Nirbhay Sharma
e0940c672a bpf: Document cfi_stubs and owner fields in struct bpf_struct_ops
Add missing kernel-doc documentation for the cfi_stubs and owner
fields in struct bpf_struct_ops to fix the following warnings:

  Warning: include/linux/bpf.h:1931 struct member 'cfi_stubs' not
  described in 'bpf_struct_ops'
  Warning: include/linux/bpf.h:1931 struct member 'owner' not
  described in 'bpf_struct_ops'

The cfi_stubs field was added in commit 2cd3e3772e ("x86/cfi,bpf:
Fix bpf_struct_ops CFI") to provide CFI stub functions for trampolines,
and the owner field is used for module reference counting.
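
Roughly the shape of the kernel-doc being added (illustrative wording,
not a verbatim copy of the patch):

  /**
   * ...
   * @cfi_stubs: CFI stub functions used as valid indirect-call targets
   *             for the struct_ops trampolines
   * @owner: module that owns this struct_ops, used for reference counting
   */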

Signed-off-by: Nirbhay Sharma <nirbhay.lkd@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251120204620.59571-2-nirbhay.lkd@gmail.com
2025-11-20 14:04:01 -08:00
Joanne Koong
84692a1519 io_uring/kbuf: remove obsolete buf_nr_pages and update comments
The buf_nr_pages field in io_buffer_list was previously used to
determine whether the buffer list uses ring-provided buffers or classic
provided buffers. This is now determined by checking the IOBL_BUF_RING
flag.

Remove the buf_nr_pages field and update related comments.
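
The check that replaces the old field boils down to a flag test along
these lines (sketch only; bl being the io_buffer_list pointer):

  if (bl->flags & IOBL_BUF_RING) {
          /* ring-mapped provided buffers */
  } else {
          /* classic provided buffers */
  }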

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-20 13:23:05 -07:00
Matt Bobrowski
d088da9042 selftests/bpf: Use ASSERT_STRNEQ to factor in long slab cache names
subtest_kmem_cache_iter_check_slabinfo() fundamentally compares slab
cache names parsed out from /proc/slabinfo against those stored within
struct kmem_cache_result. The current problem is that the slab cache
name within struct kmem_cache_result is stored within a bounded
fixed-length array (sized to SLAB_NAME_MAX(32)), whereas the name
parsed out from /proc/slabinfo is not. Meaning, using ASSERT_STREQ()
can certainly lead to test failures, particularly when dealing with
slab cache names that are longer than SLAB_NAME_MAX(32)
bytes. Notably, kmem_cache_create() allows callers to create slab
caches with somewhat arbitrarily sized names via its __name identifier
argument, so exceeding the SLAB_NAME_MAX(32) limit that is in place
now can certainly happen.

Make subtest_kmem_cache_iter_check_slabinfo() more reliable by only
checking up to sizeof(struct kmem_cache_result.name) - 1 using
ASSERT_STRNEQ().
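
A hedged sketch of the bounded comparison described above (variable
names are placeholders, not the exact test code):

  ASSERT_STRNEQ(slabinfo_name, result->name, sizeof(result->name) - 1,
                "kmem_cache_name");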

Fixes: a496d0cdc8 ("selftests/bpf: Add a test for kmem_cache_iter")
Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Link: https://patch.msgid.link/20251118073734.4188710-1-mattbobrowski@google.com
2025-11-20 09:26:06 -08:00
Jakub Kicinski
9e203721ec Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.18-rc7).

No conflicts, adjacent changes:

tools/testing/selftests/net/af_unix/Makefile
  e1bb28bf13 ("selftest: af_unix: Add test for SO_PEEK_OFF.")
  45a1cd8346 ("selftests: af_unix: Add tests for ECONNRESET and EOF semantics")

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-20 09:13:26 -08:00
David Laight
9420e720ad block: use min() instead of min_t()
min_t(unsigned int, a, b) casts an 'unsigned long' to 'unsigned int'.
Use min(a, b) instead as it promotes any 'unsigned int' to 'unsigned long'
and so cannot discard significant bits.

In this case the 'unsigned long' value is small enough that the result
is ok.

(Similarly for max_t() and clamp_t().)

Detected by an extra check added to min_t().
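
An illustrative example of the pitfall (not code from this patch): with
min_t(unsigned int, ...) the unsigned long operand is truncated before
the comparison, while min() keeps all significant bits.

  unsigned long nr_bytes = 5UL << 32;   /* > UINT_MAX on a 64-bit build */
  unsigned int limit = 4096;

  /* min_t(unsigned int, nr_bytes, limit) would compare 0 with 4096 here */
  unsigned long want = min(nr_bytes, (unsigned long)limit);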

Signed-off-by: David Laight <david.laight.linux@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-20 07:44:29 -07:00
Paolo Abeni
0888a0d76d Merge branch 'ynl-cli-list-attrs-argument'
Gal Pressman says:

====================
YNL CLI --list-attrs argument

While experimenting with the YNL CLI, I found the process of going back
and forth to examine the YAML spec files in order to figure out how to
use each command quite tiring.

The addition of --list-attrs helps by providing all information needed
directly in the tool. I figured others would likely find it useful as
well.

v1: https://lore.kernel.org/all/20251116192845.1693119-1-gal@nvidia.com/
====================

Link: https://patch.msgid.link/20251118143208.2380814-1-gal@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 15:43:06 +01:00
Gal Pressman
6c10f1a1c0 tools: ynl: cli: Display enum values in --list-attrs output
When listing attributes with --list-attrs, display the actual enum
values for attributes that reference an enum type.

  # ./cli.py --family netdev --list-attrs dev-get
  [..]
    - xdp-features: u64 (enum: xdp-act)
      Flags: basic, redirect, ndo-xmit, xsk-zerocopy, hw-offload, rx-sg, ndo-xmit-sg
      Bitmask of enabled xdp-features.
  [..]

Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Link: https://patch.msgid.link/20251118143208.2380814-4-gal@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 15:43:04 +01:00
Gal Pressman
bc1bc1b357 tools: ynl: cli: Parse nested attributes in --list-attrs output
Enhance the --list-attrs option to recursively display nested attributes
instead of just showing "nest" as the type.
Nested attributes now show their attribute set name and expand to
display their contents.

  # ./cli.py --family ethtool --list-attrs rss-get
  [..]
  Do request attributes:
    - header: nest -> header
        - dev-index: u32
        - dev-name: string
        - flags: u32 (enum: header-flags)
        - phy-index: u32
    - context: u32
  [..]

Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20251118143208.2380814-3-gal@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 15:43:04 +01:00
Gal Pressman
2a2d5a3392 tools: ynl: cli: Add --list-attrs option to show operation attributes
Add a --list-attrs option to the YNL CLI that displays information about
netlink operations, including request and reply attributes.
This eliminates the need to manually inspect YAML spec files to
determine the JSON structure required for operations, or understand the
structure of the reply.

Example usage:
  # ./cli.py --family netdev --list-attrs dev-get
  Operation: dev-get
  Get / dump information about a netdev.

  Do request attributes:
    - ifindex: u32
      netdev ifindex

  Do reply attributes:
    - ifindex: u32
      netdev ifindex
    - xdp-features: u64 (enum: xdp-act)
      Bitmask of enabled xdp-features.
    - xdp-zc-max-segs: u32
      max fragment count supported by ZC driver
    - xdp-rx-metadata-features: u64 (enum: xdp-rx-metadata)
      Bitmask of supported XDP receive metadata features. See Documentation/networking/xdp-rx-metadata.rst for more details.
    - xsk-features: u64 (enum: xsk-flags)
      Bitmask of enabled AF_XDP features.

  Dump reply attributes:
    - ifindex: u32
      netdev ifindex
    - xdp-features: u64 (enum: xdp-act)
      Bitmask of enabled xdp-features.
    - xdp-zc-max-segs: u32
      max fragment count supported by ZC driver
    - xdp-rx-metadata-features: u64 (enum: xdp-rx-metadata)
      Bitmask of supported XDP receive metadata features. See Documentation/networking/xdp-rx-metadata.rst for more details.
    - xsk-features: u64 (enum: xsk-flags)
      Bitmask of enabled AF_XDP features.

Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Link: https://patch.msgid.link/20251118143208.2380814-2-gal@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 15:43:04 +01:00
Chaitanya Kulkarni
e8f0abdd49 zloop: clear nowait flag in workqueue context
The zloop driver advertises REQ_NOWAIT support through BLK_FEAT_NOWAIT
(enabled by default for all blk-mq devices), and honors the nowait
behavior throughout zloop_queue_rq().

However, actual I/O to the backing file is performed in a workqueue,
where blocking is allowed.

To avoid imposing unnecessary non-blocking constraints in this blocking
context, clear the REQ_NOWAIT flag before processing the request in the
workqueue context.

Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-20 07:42:32 -07:00
Chaitanya Kulkarni
b11e483a1c loop: clear nowait flag in workqueue context
The loop driver advertises REQ_NOWAIT support through BLK_FEAT_NOWAIT
(enabled by default for all blk-mq devices), and honors the nowait
behavior throughout loop_queue_rq().

However, actual I/O to the backing file is performed in a workqueue,
where blocking is allowed.

To avoid imposing unnecessary non-blocking constraints in this blocking
context, clear the REQ_NOWAIT flag before processing the request in the
workqueue context.
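
A hedged sketch of the idea (handler and field names are assumptions,
not the loop driver's exact code): once the request is handled from the
workqueue, process context allows blocking, so the flag can be dropped.

  static void example_loop_workfn(struct request *rq)
  {
          /* process context now; blocking on the backing file is fine */
          rq->cmd_flags &= ~REQ_NOWAIT;

          /* ... perform the backing-file I/O ... */
  }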

Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-20 07:42:32 -07:00
Paolo Abeni
7828a4d3f6 Merge branch 'add-af_xdp-zero-copy-support'
Meghana Malladi says:

====================
Add AF_XDP zero copy support

This series adds AF_XDP zero copy support to the icssg driver.

Tests were performed on AM64x-EVM with xdpsock application [1].

A clear improvement is seen in transmit (txonly) and receive (rxdrop)
for 64 byte packets. The 1500 byte test seems to be limited by line
rate (1G link), so no improvement is seen there in packet rate.

There is some issue with l2fwd, as the benchmarking numbers show 0
for 64 byte packets after forwarding the first batch of packets; I am
currently looking into it.

AF_XDP performance using 64 byte packets in Kpps.
Benchmark:	XDP-SKB		XDP-Native	XDP-Native(ZeroCopy)
rxdrop		253		473		656
txonly		350		354		855
l2fwd 		178		240		0

AF_XDP performance using 1500 byte packets in Kpps.
Benchmark:	XDP-SKB		XDP-Native	XDP-Native(ZeroCopy)
rxdrop		82		82		82
txonly		81		82		82
l2fwd 		81		82		82

[1]: https://github.com/xdp-project/bpf-examples/tree/master/AF_XDP-example
v5: https://lore.kernel.org/all/20251111101523.3160680-1-m-malladi@ti.com/
====================

Link: https://patch.msgid.link/20251118135542.380574-1-m-malladi@ti.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 15:24:13 +01:00
Meghana Malladi
c6a1ec1870 net: ti: icssg-prueth: Enable zero copy in XDP features
Enable the zero copy feature flag in xdp_set_features_flag()
for a given ndev to get AF_XDP zero copy support running
for both Tx and Rx.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Meghana Malladi <m-malladi@ti.com>
Link: https://patch.msgid.link/20251118135542.380574-7-m-malladi@ti.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 15:24:11 +01:00
Meghana Malladi
7a64bb388d net: ti: icssg-prueth: Add AF_XDP zero copy for RX
Use the xsk_pool inside rx_chn, which gets populated during xsk enable,
to check if a given Rx queue id is registered for xsk zero copy.

Update prueth_create_xdp_rxqs to register and support two different
memory models (xsk and page) for a given Rx queue, if registered for
zero copy.

If xsk_pool is registered, allocate buffers from UMEM and map them
to the hardware Rx descriptors. In NAPI context, run the XDP program
for each packet and process the xsk buffer according to the XDP
result codes. Also allocate a new set of buffers from UMEM for the
next batch of NAPI Rx processing. Add XDP_WAKEUP_RX support to enable
xsk wakeup for Rx.

Move prueth_create_page_pool to prueth_init_rx_chns to avoid freeing
and re-allocating the system memory every time there is a transition
from zero copy to copy mode; this prevents any kind of memory
fragmentation or leak.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Meghana Malladi <m-malladi@ti.com>
Link: https://patch.msgid.link/20251118135542.380574-6-m-malladi@ti.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 15:24:11 +01:00
Meghana Malladi
121133163c net: ti: icssg-prueth: Make emac_run_xdp function independent of page
emac_run_xdp function runs xdp program, at a given hook point
in the Rx path of the driver in NAPI context and returns
XDP return codes. In zero copy mode the driver receives
packets using UMEM frames instead of pages (native XDP).
Decouple the usage of page in this function.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Meghana Malladi <m-malladi@ti.com>
Link: https://patch.msgid.link/20251118135542.380574-5-m-malladi@ti.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 15:24:11 +01:00
Meghana Malladi
8756ef2eb0 net: ti: icssg-prueth: Add AF_XDP zero copy for TX
Use the xsk_pool inside tx_chn, which gets populated during xsk enable,
to check if a given Tx queue id is registered for xsk zero copy.

If xsk_pool is set, get frames from the pool in NAPI
context and submit them to the Tx channel. Tx completion
is also handled in the NAPI context.

Use PRUETH_SWDATA_XSK to recycle xsk buffers back to the
umem pool. Add XDP_WAKEUP_TX support to enable xsk_wakeup
for Tx.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Meghana Malladi <m-malladi@ti.com>
Link: https://patch.msgid.link/20251118135542.380574-4-m-malladi@ti.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 15:24:11 +01:00
Meghana Malladi
7dfd759791 net: ti: icssg-prueth: Add XSK pool helpers
Implement XSK NDOs (setup, wakeup) and create XSK
Rx and Tx queues. xsk_qid stores the queue id for
a given port which has been registered for zero copy
AF_XDP and is used to acquire the UMEM pointer if registered.

Based on the xsk_qid and the xsk_pool (umem), the driver
is either in copy or zero copy mode. In copy mode the
xsk_qid value will be invalid and will be set to a valid
queue id when enabling zero copy. To enable zero copy, the
Rx queues are destroyed, i.e., descriptors pushed to fq
and cq are freed so they can be remapped to xdp buffers from the umem.
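
For illustration only (names beyond what the text above mentions are
assumptions): the decision between copy and zero-copy mode typically
reduces to whether an XSK pool is registered for the queue id.

  static struct xsk_buff_pool *example_get_xsk_pool(struct net_device *ndev,
                                                    u16 qid)
  {
          /* non-NULL only when the queue was registered for zero copy */
          return xsk_get_pool_from_qid(ndev, qid);
  }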

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Meghana Malladi <m-malladi@ti.com>
Link: https://patch.msgid.link/20251118135542.380574-3-m-malladi@ti.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 15:24:11 +01:00
Meghana Malladi
41dde7f1d0 net: ti: icssg-prueth: Add functions to create and destroy Rx/Tx queues
Each port of a given ICSSG instance has its own set of
Tx and Rx queues. Add functions to create and destroy these
queues, which will be further used while performing ndo_bpf
operations to set up XSK Tx/Rx queues for a given port.

In the Rx queue destroy sequence, add a teardown wait to ensure
that all the descriptors, including the TDCM (teardown completion
marker), have been serviced and freed, to avoid any sort of descriptor
leaks.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Meghana Malladi <m-malladi@ti.com>
Link: https://patch.msgid.link/20251118135542.380574-2-m-malladi@ti.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 15:24:11 +01:00
Christoph Hellwig
1cfe3795c1 xfs: use zi more in xfs_zone_gc_mount
Use the local variable instead of the extra pointer dereference when
starting the GC thread.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-20 13:32:58 +01:00
Paolo Abeni
070b87f64a Merge branch 'txgbe-support-more-modules'
Jiawen Wu says:

====================
TXGBE support more modules

Support CR modules for 25G devices and QSFP modules for 40G devices. And
implement .get_module_eeprom_by_page() to get module info.

v1: https://lore.kernel.org/all/20251112055841.22984-1-jiawenwu@trustnetic.com/
====================

Link: https://patch.msgid.link/20251118080259.24676-1-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 12:47:28 +01:00
Jiawen Wu
9b97b6b563 net: txgbe: support getting module EEPROM by page
Getting the module EEPROM is already supported on TXGBE SP devices,
since the SFP driver implements it.

Now add support to read module EEPROM for AML devices. Towards this, add
a new firmware mailbox command to get the page data.

Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
Link: https://patch.msgid.link/20251118080259.24676-6-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 12:47:26 +01:00
Jiawen Wu
c6e97daec5 net: txgbe: delay to identify modules in .ndo_open
For QSFP modules, there is a possibility that the module cannot be
identified when I2C is read immediately in .ndo_open. So just set the
flag WX_FLAG_NEED_MODULE_RESET and do it in the subtask, which always
waits 200 ms before identifying the module. This change has no impact
on the existing module handling.

Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
Link: https://patch.msgid.link/20251118080259.24676-5-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 12:47:26 +01:00
Jiawen Wu
57d39faed4 net: txgbe: improve functions of AML 40G devices
Add support for identifying QSFP modules on AML 40G devices. The GPIO
pin definitions follow the design of the QSFP modules, and
TXGBE_GPIOBIT_4 is used to detect module presence.

Meanwhile, implement phylink in XLGMII mode by default, and get the link
state from MAC link.

Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
Link: https://patch.msgid.link/20251118080259.24676-4-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 12:47:26 +01:00
Jiawen Wu
dbba6b7a47 net: txgbe: rename the SFP related
QSFP support will be introduced for AML 40G devices, so the code
related to identifying the various modules should be given more
appropriate names.

The struct txgbe_hic_i2c_read used to get module information is renamed
to struct txgbe_hic_get_module_info, because another SW-FW command to
read I2C will be added later.

Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
Link: https://patch.msgid.link/20251118080259.24676-3-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 12:47:26 +01:00
Jiawen Wu
354d128aa7 net: txgbe: support CR modules for AML devices
Add support for identifying 25G/10G CR modules on AML devices. Autoneg
is enabled by default in CR mode.

Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
Link: https://patch.msgid.link/20251118080259.24676-2-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-20 12:47:26 +01:00
Chien Wong
b688a9447b wifi: mac80211: refactor CMAC packet handlers
Merge CMAC-128 and CMAC-256 handlers since they are almost the same.
This removes duplication.

The comment 'MIC = AES-128-CMAC(IGTK, AAD ...' is outdated since CMAC
is also used with BIGTK, as is the comment for CMAC-256. Simply remove
the comments.

Tested-on: mac80211_hwsim

Signed-off-by: Chien Wong <m@xv97.com>
Link: https://patch.msgid.link/20251113140511.48658-6-m@xv97.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-20 11:56:19 +01:00
Chien Wong
a22fb19244 wifi: mac80211: refactor CMAC crypt functions
ieee80211_aes_cmac() and ieee80211_aes_cmac_256() are almost the same.
Merge them. This removes duplication.

Signed-off-by: Chien Wong <m@xv97.com>
Link: https://patch.msgid.link/20251113140511.48658-5-m@xv97.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-20 11:56:19 +01:00
Chien Wong
edf62602fc wifi: mac80211: utilize the newly defined CMAC constants
Make use of the added constants to reduce duplication.

Signed-off-by: Chien Wong <m@xv97.com>
Link: https://patch.msgid.link/20251113140511.48658-4-m@xv97.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-20 11:56:19 +01:00
Chien Wong
4255545a28 wifi: mac80211: add generic MMIE struct defines
The added struct is needed when writing generic handler for both CMAC-128
and CMAC-256.

Signed-off-by: Chien Wong <m@xv97.com>
Link: https://patch.msgid.link/20251113140511.48658-3-m@xv97.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-20 11:56:19 +01:00
Chien Wong
353cda30d3 wifi: mac80211: fix CMAC functions not handling errors
The called hash functions could fail thus we should check return values.

Fixes: 26717828b7 ("mac80211: aes-cmac: switch to shash CMAC driver")
Signed-off-by: Chien Wong <m@xv97.com>
Link: https://patch.msgid.link/20251113140511.48658-2-m@xv97.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-20 11:56:18 +01:00
Marco Crivellari
799e98708f wifi: ipw2x00: replace use of system_wq with system_percpu_wq
This patch continues the effort to refactor workqueue APIs, which has begun
with the changes introducing new workqueues and a new alloc_workqueue flag:

   commit 128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
   commit 930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

The point of the refactoring is to eventually alter the default behavior of
workqueues to become unbound by default so that their workload placement is
optimized by the scheduler.

Before that can happen, and after a careful review and conversion of each
individual case, workqueue users must be converted to the better-named new
workqueues with no intended behaviour changes:

   system_wq -> system_percpu_wq
   system_unbound_wq -> system_dfl_wq

This way the old obsolete workqueues (system_wq, system_unbound_wq) can be
removed in the future.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Link: https://patch.msgid.link/20251120094524.45264-1-marco.crivellari@suse.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-20 11:55:24 +01:00
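A minimal sketch of what such a conversion looks like at a call site,
assuming system_percpu_wq from the commits quoted above; the work item
name is illustrative.

	/* Before: schedule_work() queues on the implicit per-cpu system_wq. */
	schedule_work(&priv->link_up_work);

	/* After: same per-cpu behaviour, on the explicitly named queue. */
	queue_work(system_percpu_wq, &priv->link_up_work);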
Marco Crivellari
fec4d9d6ff wifi: cfg80211: replace use of system_unbound_wq with system_dfl_wq
Currently if a user enqueues a work item using schedule_delayed_work() the
used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work() that is using system_wq and queue_work(), that makes use
again of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the API.
For more details see the Link tag below.

This continues the effort to refactor workqueue APIs, which began with
the introduction of new workqueues and a new alloc_workqueue flag in:

commit 128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

Switch to using system_dfl_wq because system_unbound_wq is going away as part of
a workqueue restructuring.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/
Link: https://patch.msgid.link/20251113162032.394804-4-marco.crivellari@suse.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-20 10:26:17 +01:00
Marco Crivellari
e4582bdf51 wifi: cw1200: add WQ_PERCPU to alloc_workqueue users
Currently if a user enqueues a work item using schedule_delayed_work() the
used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work() that is using system_wq and queue_work(), that makes use
again of WORK_CPU_UNBOUND.
This lack of consistency cannot be addressed without refactoring the API.
For more details see the Link tag below.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND.

This default is suboptimal: most workloads benefit from unbound queues,
allowing the scheduler to place worker threads where they’re needed and
reducing noise when CPUs are isolated.

This continues the effort to refactor workqueue APIs, which began with
the introduction of new workqueues and a new alloc_workqueue flag in:

commit 128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

This change adds a new WQ_PERCPU flag to explicitly request
alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU.

Once migration is complete, WQ_UNBOUND can be removed and unbound will
become the implicit default.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/
Link: https://patch.msgid.link/20251113162032.394804-3-marco.crivellari@suse.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-20 10:26:09 +01:00
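A minimal sketch of the flag change at an alloc_workqueue() call site;
the queue name and companion flags are illustrative, WQ_PERCPU is the
flag introduced by the commit quoted above.

	/* Before: per-cpu behaviour relied on the implicit default. */
	wq = alloc_workqueue("cw1200_bh", WQ_MEM_RECLAIM, 1);

	/* After: the same behaviour, requested explicitly. */
	wq = alloc_workqueue("cw1200_bh", WQ_MEM_RECLAIM | WQ_PERCPU, 1);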
Marco Crivellari
2112519bdf wifi: wfx: add WQ_PERCPU to alloc_workqueue users
Currently if a user enqueues a work item using schedule_delayed_work() the
used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work() that is using system_wq and queue_work(), that makes use
again of WORK_CPU_UNBOUND.
This lack of consistency cannot be addressed without refactoring the API.
For more details see the Link tag below.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND.

This default is suboptimal: most workloads benefit from unbound queues,
allowing the scheduler to place worker threads where they’re needed and
reducing noise when CPUs are isolated.

This continues the effort to refactor workqueue APIs, which began with
the introduction of new workqueues and a new alloc_workqueue flag in:

commit 128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

This change adds a new WQ_PERCPU flag to explicitly request
alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU.

Once migration is complete, WQ_UNBOUND can be removed and unbound will
become the implicit default.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/
Reviewed-by: Jérôme Pouiller <jerome.pouiller@silabs.com>
Link: https://patch.msgid.link/20251113160825.383883-1-marco.crivellari@suse.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-20 10:25:46 +01:00
Marco Crivellari
5d5c121c4f wifi: qtnfmac: add WQ_PERCPU to alloc_workqueue users
Currently if a user enqueues a work item using schedule_delayed_work() the
used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work() that is using system_wq and queue_work(), that makes use
again of WORK_CPU_UNBOUND.
This lack of consistency cannot be addressed without refactoring the API.
For more details see the Link tag below.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND.

This default is suboptimal: most workloads benefit from unbound queues,
allowing the scheduler to place worker threads where they’re needed and
reducing noise when CPUs are isolated.

This continues the effort to refactor workqueue APIs, which began with
the introduction of new workqueues and a new alloc_workqueue flag in:

commit 128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

This change adds a new WQ_PERCPU flag to explicitly request
alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU.

Once migration is complete, WQ_UNBOUND can be removed and unbound will
become the implicit default.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/
Link: https://patch.msgid.link/20251113160035.376524-1-marco.crivellari@suse.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-20 10:25:28 +01:00
Pagadala Yesu Anjaneyulu
a77f0ad44f wifi: cfg80211: Add support for 6GHz AP role not relevant AP type
Add IEEE80211_6GHZ_CTRL_REG_AP_ROLE_NOT_RELEVANT
and map it to IEEE80211_REG_LPI_AP for safe regulatory compliance
when AP role classification is not applicable.
Use LPI as safe fallback to prevent power limit violations.

Signed-off-by: Pagadala Yesu Anjaneyulu <pagadala.yesu.anjaneyulu@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251112110828.856283677cc7.I36138a34847c3b4e680974bf347dde844448f3bc@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-20 10:25:10 +01:00
Jakub Kicinski
d877b1013c Merge branch 'net-mlx5-move-notifiers-outside-the-devlink-lock'
Tariq Toukan says:

====================
net/mlx5: Move notifiers outside the devlink lock

This series by Cosmin moves blocking notifier registration in the mlx5
driver outside the devlink lock during probe.

This is mostly a no-op refactoring that consists of multiple pieces.
It is necessary because upcoming code will introduce a potential locking
cycle between the devlink lock and the blocking notifier head mutexes,
so these notifiers must move out of the devlink-locked critical section.
====================

Link: https://patch.msgid.link/1763325940-1231508-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:32:30 -08:00
Cosmin Ratiu
64ad6470c8 net/mlx5: Move SF dev table notifier registration outside the PF devlink lock
This completes the previous patches by moving notifier registration for
SF dev tables outside the devlink locked critical section in
mlx5_init_one() / mlx5_uninit_one() and into the mlx5_mdev_init() /
mlx5_mdev_uninit() functions.

This is only done for non-SFs, since SFs do not have a SF HW table
themselves.

After this patch, notifiers can grab the PF devlink lock (soon to be
necessary) without creating a locking cycle.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763325940-1231508-7-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:32:28 -08:00
Cosmin Ratiu
d4a0acbd94 net/mlx5: Move the SF table notifiers outside the devlink lock
Move the SF table notifiers registration/unregistration outside of
mlx5_init_one() / mlx5_uninit_one() and into the mlx5_mdev_init() /
mlx5_mdev_uninit() functions.

This is only done for non-SFs, since SFs do not have a SF table
themselves and thus don't need notifiers.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763325940-1231508-6-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:32:28 -08:00
Cosmin Ratiu
e63c9c5f0a net/mlx5: Move the SF HW table notifier outside the devlink lock
Move the SF HW table notifier registration/unregistration outside of
mlx5_init_one() / mlx5_uninit_one() and into the mlx5_mdev_init() /
mlx5_mdev_uninit() functions.

This is only done for non-SFs, since SFs do not have a SF HW table
themselves.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763325940-1231508-5-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:32:27 -08:00
Cosmin Ratiu
d3a356db85 net/mlx5: Move the vhca event notifier outside of the devlink lock
The vhca event notifier consists of an atomic notifier for vhca state
changes (used for SF events), multiple workqueues and a blocking
notifier chain for delivering the vhca state change events for further
processing.

This patch moves the vhca notifier head outside of mlx5_init_one() /
mlx5_uninit_one() and into the mlx5_mdev_init() / mlx5_mdev_uninit()
functions.

This allows called notifiers to grab the PF devlink lock which was
previously impossible because it would create a circular lock
dependency.

mlx5_vhca_event_stop() is now called earlier in the cleanup phase and
flushes the workqueues to ensure that after the call, there are no
pending events. This simplifies the cleanup flow for vhca event
consumers.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763325940-1231508-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:32:27 -08:00
Cosmin Ratiu
3fee828789 net/mlx5: Move the esw mode notifier chain outside the devlink lock
The esw mode change notifier chain is initialized/cleaned up in
mlx5_init_one() / mlx5_uninit_one() with the devlink lock held.

Move the notifier head from the eswitch struct into mlx5_priv directly,
and initialize it outside the critical section. This will allow notifier
registration to happen earlier in the init procedure in subsequent
patches.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763325940-1231508-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:32:27 -08:00
Cosmin Ratiu
b6b03097f9 net/mlx5: Initialize events outside devlink lock
Move event init/cleanup outside of mlx5_init_one() / mlx5_uninit_one()
and into the mlx5_mdev_init() / mlx5_mdev_uninit() functions.

By doing this, we avoid the events being reinitialized on devlink reload
and, more importantly, the events->sw_nh notifier chain becomes
available earlier in the init procedure, which will be used in
subsequent patches. This makes sense because the events struct is pure
software, independent of any HW details.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763325940-1231508-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:32:27 -08:00
Jakub Kicinski
beabc06ffb Merge branch 'net-adjust-conservative-values-around-napi'
Jason Xing says:

====================
net: adjust conservative values around napi

This series keeps at least 96 skbs per cpu and frees 32 skbs at one
time in conclusion. More initial discussions with Eric can be seen at
the link [1].

[1]: https://lore.kernel.org/all/CAL+tcoBEEjO=-yvE7ZJ4sB2smVBzUht1gJN85CenJhOKV
====================

Link: https://patch.msgid.link/20251118070646.61344-1-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:29:29 -08:00
Jason Xing
5d7fc63ab8 net: prefetch the next skb in napi_skb_cache_get()
After getting the current skb in napi_skb_cache_get(), the next skb in
cache is highly likely to be used soon, so prefetch would be helpful.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://patch.msgid.link/20251118070646.61344-5-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:29:25 -08:00
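A minimal sketch of the prefetch idea, assuming the napi_alloc_cache
layout with skb_cache[]/skb_count; the refill path taken when the cache
is empty is omitted.

	static struct sk_buff *napi_skb_cache_get_sketch(struct napi_alloc_cache *nc)
	{
		struct sk_buff *skb = nc->skb_cache[--nc->skb_count];

		/* the next entry is very likely to be handed out soon */
		if (nc->skb_count)
			prefetch(nc->skb_cache[nc->skb_count - 1]);
		return skb;
	}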
Jason Xing
2d67b5c5c6 net: use NAPI_SKB_CACHE_FREE to keep 32 as default to do bulk free
- Replace NAPI_SKB_CACHE_HALF with NAPI_SKB_CACHE_FREE
- Only free 32 skbs in napi_skb_cache_put()

Since the first patch adjusted NAPI_SKB_CACHE_SIZE to 128, the number
of packets to be freed in the softirq increased from 32 to 64.
Considering that a subsequent net_rx_action() calling napi_poll() a few
times can easily consume the 64 available slots, and that we can afford
to keep a higher number of sk_buffs in per-cpu storage, decrease
NAPI_SKB_CACHE_FREE to 32 like before. So now the logic is 1) keep
96 skbs, 2) free 32 skbs at a time.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://patch.msgid.link/20251118070646.61344-4-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:29:24 -08:00
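A minimal sketch of the resulting put-side logic, assuming the
napi_alloc_cache layout; skbuff_cache stands for the skb slab cache and
the exact bulk-free call site may differ.

	#define NAPI_SKB_CACHE_SIZE	128
	#define NAPI_SKB_CACHE_FREE	32

	nc->skb_cache[nc->skb_count++] = skb;
	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
		/* bulk-free the 32 most recently cached skbs, keep 96 per cpu */
		kmem_cache_free_bulk(skbuff_cache, NAPI_SKB_CACHE_FREE,
				     nc->skb_cache + NAPI_SKB_CACHE_SIZE -
				     NAPI_SKB_CACHE_FREE);
		nc->skb_count -= NAPI_SKB_CACHE_FREE;
	}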
Jason Xing
01d7385618 net: increase default NAPI_SKB_CACHE_BULK to 32
The previous value of 16 is a bit conservative, so adjust it along with
NAPI_SKB_CACHE_SIZE, which minimizes how often napi_skb_cache_get*()
has to fall back to memory allocation.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://patch.msgid.link/20251118070646.61344-3-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:29:24 -08:00
Jason Xing
3505730d90 net: increase default NAPI_SKB_CACHE_SIZE to 128
Commit b61785852e ("net: increase skb_defer_max default to 128")
raised sysctl_skb_defer_max to avoid many calls to
kick_defer_list_purge(); the same reasoning applies to
NAPI_SKB_CACHE_SIZE, which was proposed in 2016. It is a trade-off
between using pre-allocated memory in skb_cache and saving some fairly
heavy function calls in softirq context.

With this patch applied, we have more skbs per cpu to accelerate the
sending path that needs to acquire new skbs.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://patch.msgid.link/20251118070646.61344-2-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:29:24 -08:00
Jakub Kicinski
7c9dd38602 Merge branch 'disable-clkout-on-rtl8211f-d-i-vd-cg'
Vladimir Oltean says:

====================
Disable CLKOUT on RTL8211F(D)(I)-VD-CG

The Realtek RTL8211F(D)(I)-VD-CG is similar to other RTL8211F models in
that the CLKOUT signal can be turned off - a feature requested to reduce
EMI, and implemented via "realtek,clkout-disable" as documented in
Documentation/devicetree/bindings/net/realtek,rtl82xx.yaml.

It is also dissimilar to said PHY models because it has no PHYCR2
register, and disabling CLKOUT is done through some other register.

The strategy adopted in this 6-patch series is to make the PHY driver
not think in terms of "priv->has_phycr2" and "priv->phycr2", but in
terms of higher-level features ("priv->disable_clk_out"), while
maintaining behaviour.
Then, the logic is extended for the new PHY.

Very loosely based on previous work from Clark Wang, who took a
different approach, to pretend that the RTL8211FVD_CLKOUT_REG is
actually this PHY's PHYCR2.
====================

Link: https://patch.msgid.link/20251117234033.345679-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:24:25 -08:00
Vladimir Oltean
4465ae435d net: phy: realtek: create rtl8211f_config_phy_eee() helper
To simplify the rtl8211f_config_init() control flow and get rid of
"early" returns for PHYs where the PHYCR2 register is absent, move the
entire logic sub-block that deals with disabling PHY-mode EEE to a
separate function. There, it is much more obvious what the early
"return 0" skips, and it becomes more difficult to accidentally skip
unintended stuff.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251117234033.345679-7-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:24:23 -08:00
Vladimir Oltean
bb78b71faf net: phy: realtek: eliminate priv->phycr1 variable
Previous changes have replaced the machine-level priv->phycr2 with a
high-level priv->disable_clk_out. This created a discrepancy with
priv->phycr1 which is resolved here, for uniformity.

One advantage of this new implementation is that we don't read
priv->phycr1 in rtl821x_probe() if we're never going to modify it.

We never test the positive return code from phy_modify_mmd_changed(), so
we could just as well use phy_modify_mmd().

I took the ALDPS feature description from commit d90db36a9e ("net:
phy: realtek: add dt property to enable ALDPS mode") and transformed it
into a function comment - the feature is sufficiently non-obvious to
deserve that.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251117234033.345679-6-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:24:23 -08:00
Vladimir Oltean
e1a31c41be net: phy: realtek: allow CLKOUT to be disabled on RTL8211F(D)(I)-VD-CG
Add CLKOUT disable support for RTL8211F(D)(I)-VD-CG. Like with other PHY
variants, this feature might be requested by customers when the clock
output is not used, in order to reduce electromagnetic interference (EMI).

In the common driver, the CLKOUT configuration is done through PHYCR2.
The RTL_8211FVD_PHYID is singled out as not having that register, and
execution in rtl8211f_config_init() returns early after commit
2c67301584 ("net: phy: realtek: Avoid PHYCR2 access if PHYCR2 not
present").

But actually CLKOUT is configured through a different register for this
PHY. Instead of pretending this is PHYCR2 (which it is not), just add
some code for modifying this register inside the rtl8211f_disable_clk_out()
function, and move that outside the code portion that runs only if
PHYCR2 exists.

In practice this reorders the PHYCR2 writes to disable PHY-mode EEE and
to disable the CLKOUT for the normal RTL8211F variants, but this should
be perfectly fine.

It was not noted that RTL8211F(D)(I)-VD-CG would need a genphy_soft_reset()
call after disabling the CLKOUT. Despite that, we do it out of caution
and for symmetry with the other RTL8211F models.

Co-developed-by: Clark Wang <xiaoning.wang@nxp.com>
Signed-off-by: Clark Wang <xiaoning.wang@nxp.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251117234033.345679-5-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:24:23 -08:00
Vladimir Oltean
910ac7bfb1 net: phy: realtek: eliminate has_phycr2 variable
This variable is assigned in rtl821x_probe() and used in
rtl8211f_config_init(), which is more complex than it needs to be.
Simply testing the same condition from rtl821x_probe() in
rtl8211f_config_init() yields the same result (the PHY driver ID is a
runtime invariant), but with one temporary variable less.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251117234033.345679-4-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:24:23 -08:00
Vladimir Oltean
27033d0691 net: phy: realtek: eliminate priv->phycr2 variable
The RTL8211F(D)(I)-VD-CG PHY also has support for disabling the CLKOUT,
and we'd like to introduce the "realtek,clkout-disable" property for
that.

But it isn't done through the PHYCR2 register, and it becomes awkward to
have the driver pretend that it is. So just replace the machine-level
"u16 phycr2" variable with a logical "bool disable_clk_out", which
scales better to the other PHY as well.

The change is a complete functional equivalent. Before, if the device
tree property was absent, priv->phycr2 would contain the RTL8211F_CLKOUT_EN
bit as read from hardware. Now, we don't save priv->phycr2, but we just
don't call phy_modify_paged() on it. Also, we can simply call
phy_modify_paged() with the "set" argument to 0.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251117234033.345679-3-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:24:23 -08:00
Vladimir Oltean
8e982441ba net: phy: realtek: create rtl8211f_config_rgmii_delay()
The control flow in rtl8211f_config_init() has some pitfalls which were
probably unintended. Specifically it has an early return:

	switch (phydev->interface) {
	...
	default: /* the rest of the modes imply leaving delay as is. */
		return 0;
	}

which exits the entire config_init() function. This means it also skips
doing things such as disabling CLKOUT or disabling PHY-mode EEE.

For the RTL8211FS, which uses PHY_INTERFACE_MODE_SGMII, this might be a
problem. However, I don't know that it is, so there is no Fixes: tag.
The issue was observed through code inspection.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251117234033.345679-2-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:24:22 -08:00
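A minimal sketch of the restructuring named in the commit subject: the
interface-mode switch moves into a helper so an unsupported mode no longer
aborts the rest of config_init(). rtl8211f_set_delays() and
rtl8211f_config_clkout_and_eee() are hypothetical helper names.

	static int rtl8211f_config_rgmii_delay(struct phy_device *phydev)
	{
		switch (phydev->interface) {
		case PHY_INTERFACE_MODE_RGMII:
		case PHY_INTERFACE_MODE_RGMII_ID:
		case PHY_INTERFACE_MODE_RGMII_RXID:
		case PHY_INTERFACE_MODE_RGMII_TXID:
			return rtl8211f_set_delays(phydev);	/* program TX/RX delay */
		default:
			/* the rest of the modes imply leaving delay as is */
			return 0;
		}
	}

	static int rtl8211f_config_init(struct phy_device *phydev)
	{
		int ret = rtl8211f_config_rgmii_delay(phydev);

		if (ret < 0)
			return ret;
		/* no early return: CLKOUT / EEE configuration always runs */
		return rtl8211f_config_clkout_and_eee(phydev);
	}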
Breno Leitao
672cb5c2a9 net: vmxnet3: convert to use .get_rx_ring_count
Convert the vmxnet3 driver to use the new .get_rx_ring_count ethtool
operation instead of implementing .get_rxnfc solely for handling
ETHTOOL_GRXRINGS command. This simplifies the code by removing the
switch statement and replacing it with a direct return of the queue
count.

The new callback provides the same functionality in a more direct way,
following the ongoing ethtool API modernization.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251118-vmxnet3_grxrings-v1-1-ed8abddd2d52@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:14:21 -08:00
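A minimal sketch of the conversion, assuming the new ethtool op has the
simple shape described above (directly return the Rx queue count); the
op signature and the num_rx_queues field should be treated as
approximations.

	static u32 vmxnet3_get_rx_ring_count(struct net_device *netdev)
	{
		struct vmxnet3_adapter *adapter = netdev_priv(netdev);

		return adapter->num_rx_queues;
	}

	static const struct ethtool_ops vmxnet3_ethtool_ops = {
		/* ... other ops unchanged ... */
		.get_rx_ring_count = vmxnet3_get_rx_ring_count,
	};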
Jakub Kicinski
6152f41da6 Merge branch 'net-mana-enforce-tx-sge-limit-and-fix-error-cleanup'
Aditya Garg says:

====================
net: mana: Enforce TX SGE limit and fix error cleanup

Add pre-transmission checks to block SKBs that exceed the hardware's SGE
limit. Force software segmentation for GSO traffic and linearize non-GSO
packets as needed.

Update TX error handling to drop failed SKBs and unmap resources
immediately.
====================

Link: https://patch.msgid.link/1763464269-10431-1-git-send-email-gargaditya@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:12:02 -08:00
Aditya Garg
45120304e8 net: mana: Drop TX skb on post_work_request failure and unmap resources
Drop TX packets when posting the work request fails and ensure DMA
mappings are always cleaned up.

Signed-off-by: Aditya Garg <gargaditya@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1763464269-10431-3-git-send-email-gargaditya@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:11:57 -08:00
Aditya Garg
934fa943b5 net: mana: Handle SKB if TX SGEs exceed hardware limit
The MANA hardware supports a maximum of 30 scatter-gather entries (SGEs)
per TX WQE. Exceeding this limit can cause TX failures.
Add ndo_features_check() callback to validate SKB layout before
transmission. For GSO SKBs that would exceed the hardware SGE limit, clear
NETIF_F_GSO_MASK to enforce software segmentation in the stack.
Add a fallback in mana_start_xmit() to linearize non-GSO SKBs that still
exceed the SGE limit.

Also, add an ethtool counter for linearized SKBs.

Co-developed-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
Signed-off-by: Aditya Garg <gargaditya@linux.microsoft.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1763464269-10431-2-git-send-email-gargaditya@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:11:57 -08:00
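A minimal sketch of the pre-transmission guard described above;
MAX_TX_WQE_SGL_ENTRIES stands for the 30-SGE hardware limit and the
stats field is an illustrative name for the new ethtool counter.

	static netdev_features_t mana_features_check(struct sk_buff *skb,
						     struct net_device *ndev,
						     netdev_features_t features)
	{
		/* +1 accounts for the linear part of the skb */
		if (skb_is_gso(skb) &&
		    skb_shinfo(skb)->nr_frags + 1 > MAX_TX_WQE_SGL_ENTRIES)
			features &= ~NETIF_F_GSO_MASK;	/* force software GSO */
		return features;
	}

	/* Fallback in mana_start_xmit() for non-GSO skbs over the limit: */
	if (skb_shinfo(skb)->nr_frags + 1 > MAX_TX_WQE_SGL_ENTRIES) {
		if (skb_linearize(skb))
			goto tx_drop;			/* drop on failure */
		apc->eth_stats.tx_linearized++;		/* new ethtool counter */
	}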
Anshumali Gaur
929ca3bcea octeontx2-af: Skip TM tree print for disabled SQs
Currently, the TM tree print dumps the topology of all SQs, including
those which are not enabled; this results in redundant output for
inactive SQs. This patch adds a check in print_tm_tree() to skip
printing the TM tree hierarchy if the SQ is not enabled.

Signed-off-by: Anshumali Gaur <agaur@marvell.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251118054235.1599714-1-agaur@marvell.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 20:06:22 -08:00
Sjoerd Simons
bc41fbbf6f dt-bindings: net: mediatek,net: Correct bindings for MT7981
Different SoCs have different numbers of Wireless Ethernet
Dispatch (WED) units:
- MT7981: Has 1 WED unit
- MT7986: Has 2 WED units
- MT7988: Has 2 WED units

Update the binding to reflect these hardware differences. The MT7981
also uses infracfg for PHY switching, so allow that property.

Signed-off-by: Sjoerd Simons <sjoerd@collabora.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20251115-openwrt-one-network-v4-6-48cbda2969ac@collabora.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 18:46:47 -08:00
Viacheslav Dubeyko
150ec68fa7 hfs: introduce KUnit tests for HFS string operations
This patch implements the initial Kunit based set of
unit tests for HFS string operations. It checks
functionality of hfs_strcmp(), hfs_hash_dentry(),
and hfs_compare_dentry() methods.

./tools/testing/kunit/kunit.py run --kunitconfig ./fs/hfs/.kunitconfig

[16:04:50] Configuring KUnit Kernel ...
Regenerating .config ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
[16:04:51] Building KUnit Kernel ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
Building with:
$ make all compile_commands.json scripts_gdb ARCH=um O=.kunit --jobs=22
[16:04:59] Starting KUnit Kernel (1/1)...
[16:04:59] ============================================================
Running tests with:
$ .kunit/linux kunit.enable=1 mem=1G console=tty kunit_shutdown=halt
[16:04:59] ================= hfs_string (3 subtests) ==================
[16:04:59] [PASSED] hfs_strcmp_test
[16:04:59] [PASSED] hfs_hash_dentry_test
[16:04:59] [PASSED] hfs_compare_dentry_test
[16:04:59] =================== [PASSED] hfs_string ====================
[16:04:59] ============================================================
[16:04:59] Testing complete. Ran 3 tests: passed: 3
[16:04:59] Elapsed time: 9.087s total, 1.310s configuring, 7.611s building, 0.125s running

v2
Fix linker error.

v3
Chen Linxuan suggested to use EXPORT_SYMBOL_IF_KUNIT.

Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
cc: Yangtao Li <frank.li@vivo.com>
cc: linux-fsdevel@vger.kernel.org
cc: Chen Linxuan <me@black-desk.cn>
Reviewed-by: Chen Linxuan <me@black-desk.cn>
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
Link: https://lore.kernel.org/r/20250912225022.1083313-1-slava@dubeyko.com
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
2025-11-19 14:53:59 -08:00
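A minimal sketch of what one such KUnit case looks like, assuming
hfs_strcmp() compares two length-delimited names case-insensitively and
is exported to the test via EXPORT_SYMBOL_IF_KUNIT; the suite name and
test vectors are illustrative.

	#include <kunit/test.h>

	static void hfs_strcmp_example_test(struct kunit *test)
	{
		const unsigned char a[] = "Hello";
		const unsigned char b[] = "HELLO";

		/* HFS name comparison is case-insensitive */
		KUNIT_EXPECT_EQ(test, hfs_strcmp(a, 5, b, 5), 0);
	}

	static struct kunit_case hfs_string_example_cases[] = {
		KUNIT_CASE(hfs_strcmp_example_test),
		{}
	};

	static struct kunit_suite hfs_string_example_suite = {
		.name = "hfs_string_example",
		.test_cases = hfs_string_example_cases,
	};
	kunit_test_suite(hfs_string_example_suite);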
Jakub Kicinski
865a5d1a29 Merge branch 'net-stmmac-sanitise-stmmac_is_jumbo_frm'
Russell King says:

====================
net: stmmac: sanitise stmmac_is_jumbo_frm()

stmmac_is_jumbo_frm() takes skb->len, which is unsigned int, but the
parameter is passed as an "int" and then tested using signed
comparisons. This can cause bugs. Change the parameter to be unsigned.

Also arrange for it to return a bool.
====================

Link: https://patch.msgid.link/aRxDqJSWxOdOaRt4@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 08:33:52 -08:00
Russell King (Oracle)
bf351bbec5 net: stmmac: stmmac_is_jumbo_frm() returns boolean
stmmac_is_jumbo_frm() returns whether the driver considers the frame
size to be a jumbo frame, and thus returns 0/1 values. This is boolean,
so convert it to return a boolean and use false/true instead. Also
convert stmmac_xmit()'s is_jumbo to be bool, which causes several
variables to be repositioned to keep it in reverse Christmas-tree
order.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vLIWW-0000000Ewkl-21Ia@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 08:33:49 -08:00
Russell King (Oracle)
b5adada61e net: stmmac: stmmac_is_jumbo_frm() len should be unsigned
stmmac_is_jumbo_frm() and the is_jumbo_frm() methods take skb->len
which is an unsigned int. Avoid an implicit cast to "int" via the
method parameter and then incorrectly doing signed comparisons on
this unsigned value.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vLIWR-0000000Ewkf-1Tdx@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 08:33:49 -08:00
Russell King (Oracle)
7ac60a14d3 net: stmmac: convert priv->sph* to boolean and rename
priv->sph* only have 'true' and 'false' used with them, yet they are an
int. Change their type to a bool, and rename to make their usage more
clear.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vLIDN-0000000Evur-2NLU@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-19 08:32:38 -08:00
Damien Le Moal
a9637ab93c zloop: fix zone append check in zloop_rw()
While commit cf28f6f923 ("zloop: fail zone append operations that are
targeting full zones") added a check in zloop_rw() that a zone append is
not issued to a full zone, commit e3a96ca904 ("zloop: simplify checks
for writes to sequential zones") inadvertently removed the check to
verify that there is enough unwritten space in a zone for an incoming
zone append operation.

Re-add this check in zloop_rw() to make sure we do not write beyond the
end of a zone. Of note is that this same check is already present in the
function zloop_set_zone_append_sector() when ordered zone append is in
use.

Reported-by: Hans Holmberg <Hans.Holmberg@wdc.com>
Fixes: e3a96ca904 ("zloop: simplify checks for writes to sequential zones")
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-19 07:39:42 -07:00
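A minimal sketch of the re-added check in zloop_rw(), with illustrative
field names for the zone bookkeeping.

	/* Refuse a zone append that does not fit in the remaining space. */
	if (req_op(rq) == REQ_OP_ZONE_APPEND &&
	    zone->wp + nr_sectors > zone->start + zone->capacity) {
		ret = BLK_STS_IOERR;
		goto unlock;
	}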
Damien Le Moal
ebcc028b4a MAINTAINERS: add a maintainer for zoned block device support
Add myself as the maintainer of the block layer support for the zoned
block device code and user API.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-19 07:37:48 -07:00
Damien Le Moal
00ed035094 MAINTAINERS: add missing block layer user API header files
Add the missing user API header files related to the block layer to the
list of matching file patterns for Jens's block layer entry.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-19 07:37:40 -07:00
Konstantin Komarov
1b2ae190ea fs/ntfs3: check for shutdown in fsync
Ensure fsync() returns -EIO when the ntfs3 filesystem is in forced
shutdown, instead of silently succeeding via generic_file_fsync().

Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-19 09:21:36 +01:00
Fernando Fernandez Mancera
d7dbda8789 selftests: fib_tests: add fib6 from ra to static test
The new test checks that a route that has been promoted from RA-learned
to static does not switch back when a new RA message arrives. In
addition, it checks that the route is owned by RA again when the static
address is removed.

Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Link: https://patch.msgid.link/20251115095939.6967-2-fmancera@suse.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 19:28:08 -08:00
Fernando Fernandez Mancera
f72514b3c5 ipv6: clear RA flags when adding a static route
When an IPv6 Router Advertisement (RA) is received for a prefix, the
kernel creates the corresponding on-link route with flags RTF_ADDRCONF
and RTF_PREFIX_RT configured and RTF_EXPIRES if lifetime is set.

If a user later configures a static IPv6 address on the same prefix,
the kernel clears the RTF_EXPIRES flag but doesn't clear the
RTF_ADDRCONF and RTF_PREFIX_RT flags. When the next RA for that prefix
is received, the
kernel sees the route as RA-learned and wrongly configures back the
lifetime. This is problematic because if the route expires, the static
address won't have the corresponding on-link route.

This fix clears the RTF_ADDRCONF and RTF_PREFIX_RT flags, preventing
the lifetime from being configured when the next RA arrives. If the static
address is deleted, the route becomes RA-learned again.

Fixes: 14ef37b6d0 ("ipv6: fix route lookup in addrconf_prefix_rcv()")
Reported-by: Garri Djavadyan <g.djavadyan@gmail.com>
Closes: https://lore.kernel.org/netdev/ba807d39aca5b4dcf395cc11dca61a130a52cfd3.camel@gmail.com/
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20251115095939.6967-1-fmancera@suse.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 19:28:08 -08:00
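A minimal sketch of the fix as described, using the fib6_info flag
field; the actual call site lives in the addrconf prefix handling.

	/* Static address now owns the on-link route: drop the RA markers so
	 * a later RA does not re-arm the lifetime on it. */
	fib6_clean_expires(rt);
	rt->fib6_flags &= ~(RTF_ADDRCONF | RTF_PREFIX_RT);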
Jakub Kicinski
cbf4d314ac Merge branch 'af_unix-gc-cleanup-and-optimisation'
Kuniyuki Iwashima says:

====================
af_unix: GC cleanup and optimisation.

Currently, AF_UNIX GC is triggered from close() and sendmsg()
based on the number of inflight AF_UNIX sockets.

This is because the old GC implementation had no idea of the
shape of the graph formed by SCM_RIGHTS references.

The new GC knows whether cyclic references (could) exist.

This series refines such conditions not to trigger GC unless
really needed.
====================

Link: https://patch.msgid.link/20251115020935.2643121-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 19:19:34 -08:00
Kuniyuki Iwashima
24fa77dad2 af_unix: Consolidate unix_schedule_gc() and wait_for_unix_gc().
unix_schedule_gc() and wait_for_unix_gc() share some code.

Let's consolidate the two.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251115020935.2643121-8-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 19:19:32 -08:00
Kuniyuki Iwashima
ab8b23150a af_unix: Remove unix_tot_inflight.
unix_tot_inflight is no longer used.

Let's remove it.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251115020935.2643121-7-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 19:19:32 -08:00
Kuniyuki Iwashima
e29c7a4cec af_unix: Refine wait_for_unix_gc().
unix_tot_inflight is a poor metric, as it only tells the number of
inflight AF_UNIX sockets, and we should use unix_graph_state instead.

Also, if the receiver is catching up with the passed fds, the
sender does not need to schedule GC.

GC only helps unreferenced cyclic SCM_RIGHTS references, and in
such a situation, the malicious sendmsg() will continue to call
wait_for_unix_gc() and hit the UNIX_INFLIGHT_SANE_USER condition.

Let's make only malicious users schedule GC and wait for it to
finish if a cyclic reference exists during the previous GC run.

Then, sane users will pay almost no cost for wait_for_unix_gc().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251115020935.2643121-6-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 19:19:31 -08:00
Kuniyuki Iwashima
384900542d af_unix: Don't call wait_for_unix_gc() on every sendmsg().
We have been calling wait_for_unix_gc() on every sendmsg() in case
there are too many inflight AF_UNIX sockets.

This is also because the old GC implementation had poor knowledge
of the inflight sockets and had to suspect every sendmsg().

This was improved by commit d9f21b3613 ("af_unix: Try to run GC
async."), but we do not even need to call wait_for_unix_gc() if the
process is not sending AF_UNIX sockets.

The wait_for_unix_gc() call only helps when a malicious process
continues to create cyclic references, and we can detect that
in a better place and slow it down.

Let's move wait_for_unix_gc() to unix_prepare_fpl() that is called
only when AF_UNIX socket fd is passed via SCM_RIGHTS.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251115020935.2643121-5-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 19:19:31 -08:00
Kuniyuki Iwashima
da8fc7a39b af_unix: Don't trigger GC from close() if unnecessary.
We have been triggering GC on every close() if there is even one
inflight AF_UNIX socket.

This is because the old GC implementation had no idea of the graph
shape formed by SCM_RIGHTS references.

The new GC knows whether there could be a cyclic reference or not,
and we can do better.

Let's not trigger GC from close() if there is no cyclic reference
or GC is already in progress.

While at it, unix_gc() is renamed to unix_schedule_gc() as it does
not actually perform GC since commit 8b90a9f819 ("af_unix: Run
GC on only one CPU.").

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251115020935.2643121-4-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 19:19:31 -08:00
Kuniyuki Iwashima
6b6f3c71fe af_unix: Simplify GC state.
GC manages its state by two variables, unix_graph_maybe_cyclic
and unix_graph_grouped, both of which are set to false in the
initial state.

When an AF_UNIX socket is passed to an in-flight AF_UNIX socket,
unix_update_graph() sets unix_graph_maybe_cyclic to true and
unix_graph_grouped to false, making the next GC invocation call
unix_walk_scc() to group SCCs.

Once unix_walk_scc() finishes, sockets in the same SCC are linked
via vertex->scc_entry.  Then, unix_graph_grouped is set to true
so that the following GC invocations can skip Tarjan's algorithm
and simply iterate through the list in unix_walk_scc_fast().

In addition, if we know there is at least one cyclic reference,
we set unix_graph_maybe_cyclic to true so that we do not skip GC.

So the state transitions as follows:

  (unix_graph_maybe_cyclic, unix_graph_grouped)
  =
  (false, false) -> (true, false) -> (true, true) or (false, true)
                         ^.______________/________________/

There is no transition to the initial state where both variables
are false.

If we consider the initial state as grouped, we can see that the
GC actually has a tristate.

Let's consolidate two variables into one enum.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251115020935.2643121-3-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 19:19:31 -08:00
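An illustrative form of the consolidated tristate; the names are not
necessarily the ones used in the patch.

	enum unix_graph_state {
		UNIX_GRAPH_NOT_CYCLIC,		/* grouped, no cyclic SCC: GC can be skipped */
		UNIX_GRAPH_MAYBE_CYCLIC,	/* grouped, at least one cyclic SCC remains */
		UNIX_GRAPH_UNGROUPED,		/* graph changed: next GC must run Tarjan's pass */
	};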
Kuniyuki Iwashima
58b47c7137 af_unix: Count cyclic SCC.
__unix_walk_scc() and unix_walk_scc_fast() call unix_scc_cyclic()
for each SCC to check if it forms a cyclic reference, so that we
can skip GC at the following invocations in case all SCCs do not
have any cycles.

If we count the number of cyclic SCCs in __unix_walk_scc(), we can
simplify unix_walk_scc_fast() because the number of cyclic SCCs
only changes when it garbage-collects an SCC.

So, let's count cyclic SCCs in __unix_walk_scc() and decrement the
count in unix_walk_scc_fast() when performing garbage collection.

Note that we will use this counter in a later patch to check if a
cycle existed in the previous GC run.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251115020935.2643121-2-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 19:19:31 -08:00
Jakub Kicinski
f83e0e0b63 Merge branch 'net-mlx5-misc-changes-2025-11-17'
Tariq Toukan says:

====================
net/mlx5: misc changes 2025-11-17

This series contains misc enhancements to the mlx5 driver.
====================

Link: https://patch.msgid.link/1763415729-1238421-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:53:39 -08:00
Tariq Toukan
70ca239b61 net/mlx5: Use EOPNOTSUPP instead of ENOTSUPP
Per Documentation/dev-tools/checkpatch.rst, ENOTSUPP is not a standard
error code and should be avoided. EOPNOTSUPP should be used instead.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Link: https://patch.msgid.link/1763415729-1238421-6-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:53:34 -08:00
Saeed Mahameed
fbb9933666 net/mlx5: Abort new commands if all command slots are stalled
In case of a FW issue, FW might not be responding to FW commands,
locking up the kernel for a long period of time, e.g. rtnl_lock held
while ethtool is trying to collect stats, waiting for FW to respond to
multiple commands, all of which will time out.
While there's no immediate indication of the FW lockout, we can safely
assume that something is wrong when all command slots are busy and in
a timeout state and no FW completion was received on any of them.

In such case, start immediately failing new commands.

Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763415729-1238421-5-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:53:34 -08:00
Carolina Jubran
ea3270351c net/mlx5: Remove redundant bw_share minimal value assignment
Remove unnecessary logic that sets bw_share to minimal value, when
parent has bw_share configured but nodes don't have min_rate.

This check is redundant because the parent bandwidth acts as the upper
bound regardless, and the firmware always enforces the topmost
bandwidth constraint.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763415729-1238421-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:53:33 -08:00
Carolina Jubran
391dad2e68 net/mlx5e: Recover SQ on excessive PTP TX timestamp delta
Extend the TX timestamp handler to recover the SQ when the difference
between the port and CQE TX timestamps is abnormally large.

The current logic aborts timestamp delivery if the delta exceeds
1/128 seconds, which matches the maximum expected packet interval in
ptp4l. A larger delta makes the timestamps unreliable.

This change adds recovery if the delta exceeds 0.5 seconds. Such a
large gap should not occur in normal operation and indicates that
firmware is stuck or metadata tracking is out of sync, leading to stale
or mismatched timestamps. Recovering the SQ ensures forward progress
and avoids silently dropping invalid timestamps.

The timestamp handler now takes mlx5e_ptpsq directly to access both CQ
stats and the recovery state.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Shahar Shitrit <shshitrit@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763415729-1238421-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:53:33 -08:00
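A minimal sketch of the two thresholds described above; the symbol and
helper names are illustrative, not the driver's.

	#define PTP_TS_DELTA_ABORT_NS		(NSEC_PER_SEC / 128)
	#define PTP_TS_DELTA_RECOVER_NS		(NSEC_PER_SEC / 2)

	s64 delta = abs(port_ts_ns - cqe_ts_ns);

	if (delta > PTP_TS_DELTA_ABORT_NS)
		skip_timestamp_delivery(ptpsq);		/* existing behaviour */
	if (delta > PTP_TS_DELTA_RECOVER_NS)
		mlx5e_trigger_sq_recovery(ptpsq);	/* new: schedule SQ recovery */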
Gal Pressman
2e4c44b12f net/mlx5: Refactor EEPROM query error handling to return status separately
Matthew and Jakub reported [1] issues where inventory automation tools
are calling EEPROM query repeatedly on a port that doesn't have an SFP
connected, resulting in millions of error prints.

Move MCIA register status extraction from the query functions to the
callers, allowing use of extack reporting instead of a dmesg print when
using the netlink API.

[1] https://lore.kernel.org/netdev/20251028194011.39877-1-mattc@purestorage.com/

Cc: Matthew W Carlis <mattc@purestorage.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Jianbo Liu <jianbol@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1763415729-1238421-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:53:33 -08:00
Hangbin Liu
1064d521d1 netlink: specs: support ipv4-or-v6 for dual-stack fields
Since commit 1b255e1bea ("tools: ynl: add ipv4-or-v6 display hint"), we
can display either IPv4 or IPv6 addresses for a single field based on the
address family. However, most dual-stack fields still use the ipv4 display
hint. This update changes them to use the new ipv4-or-v6 display hint and
converts IPv4-only fields to use the u32 type.

Field changes:
  - v4-or-v6
    - IFA_ADDRESS, IFA_LOCAL
    - IFLA_GRE_LOCAL, IFLA_GRE_REMOTE
    - IFLA_VTI_LOCAL, IFLA_VTI_REMOTE
    - IFLA_IPTUN_LOCAL, IFLA_IPTUN_REMOTE
    - NDA_DST
    - RTA_DST, RTA_SRC, RTA_GATEWAY, RTA_PREFSRC
    - FRA_SRC, FRA_DST
  - ipv4
    - IFA_BROADCAST
    - IFLA_GENEVE_REMOTE
    - IFLA_IPTUN_6RD_RELAY_PREFIX

Reviewed-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Link: https://patch.msgid.link/20251117024457.3034-3-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:42:10 -08:00
Hangbin Liu
4abe51dba6 tools: ynl: Add MAC address parsing support
Add missing support for parsing MAC addresses when display_hint is 'mac'
in the YNL library. This enables YNL CLI to accept MAC address strings
for attributes like lladdr in rt-neigh operations.

Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Link: https://patch.msgid.link/20251117024457.3034-2-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:42:09 -08:00
Jens Axboe
f779ac0b87 io_uring/register: use correct location for io_rings_layout
A previous commit consolidated the ring size etc. calculations into
io_prepare_config(), but missed updating io_register_resize_rings()
to correctly use the calculated values. As a result, it ended up using
on-stack uninitialized values, and hence either failed to validate the
size correctly, or simply failed the resize because the sizes were random.

This caused failures in the liburing regression tests:

[...]
Running test resize-rings.t
resize=-7
test_basic 3000 failed
Test resize-rings.t failed with ret 1
Running test resize-rings.t /dev/sda
resize=-7
test_basic 3000 failed
Test resize-rings.t failed with ret 1
Running test resize-rings.t /dev/nvme1n1
resize=-7
test_basic 3000 failed
Test resize-rings.t failed with ret 1
Running test resize-rings.t /dev/dm-0
resize=-7
test_basic 3000 failed
Test resize-rings.t failed with ret 1

because io_create_region() would return -E2BIG due to uninitialized
reg->size values.

Adjust the struct io_rings_layout rl pointer to point to the correct
location, and remove the (now dead) __rl on stack struct.

Fixes: eb76ff6a68 ("io_uring: pre-calculate scq layout")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 19:30:45 -07:00
Jakub Kicinski
616d860439 Merge branch 'net-expand-napi_skb_cache-use'
Eric Dumazet says:

====================
net: expand napi_skb_cache use

This is a followup of commit e20dfbad8a ("net: fix napi_consume_skb()
with alien skbs").

Now that the per-cpu napi_skb_cache is populated from the TX completion
path, we can make use of this cache, especially for cpus not used
by a driver NAPI poll (the primary user of napi_cache).

With this series, I consistently reach 130 Mpps on my UDP tx stress test
and reduce SLUB spinlock contention to smaller values.
====================

Link: https://patch.msgid.link/20251116202717.1542829-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:25:49 -08:00
Eric Dumazet
21664814b8 net: use napi_skb_cache even in process context
This is a followup of commit e20dfbad8a ("net: fix napi_consume_skb()
with alien skbs").

Now that the per-cpu napi_skb_cache is populated from the TX completion
path, we can make use of this cache, especially for cpus not used
by a driver NAPI poll (the primary user of napi_cache).

We can use the napi_skb_cache only if the current context is not hard irq.
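
A minimal sketch of that gating, assuming the @alloc parameter added earlier in this series and a slab cache named skbuff_cache as the fallback (both the call shape and the cache name are assumptions, not the literal upstream code):

```
/* Sketch only: use the per-cpu napi_skb_cache outside hard irq context,
 * otherwise (or when the cache is empty) fall back to the slab allocator. */
struct sk_buff *skb = NULL;

if (!in_hardirq())
	skb = napi_skb_cache_get(true);	/* may refill the per-cpu cache */
if (!skb)
	skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
```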

With this patch, I consistently reach 130 Mpps on my UDP tx stress test
and reduce SLUB spinlock contention to smaller values.

Note there is still some SLUB contention for skb->head allocations.

I had to tune /sys/kernel/slab/skbuff_small_head/cpu_partial
and /sys/kernel/slab/skbuff_small_head/min_partial depending
on the platform taxonomy.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Tested-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251116202717.1542829-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:25:47 -08:00
Eric Dumazet
294e638259 net: __alloc_skb() cleanup
This patch refactors __alloc_skb() to prepare the following one,
and does not change functionality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251116202717.1542829-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:25:47 -08:00
Eric Dumazet
dac0236075 net: add a new @alloc parameter to napi_skb_cache_get()
In the last patch of the series, we want to be able to get an skb from
the napi_skb_cache in process context, if one is available.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251116202717.1542829-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:25:47 -08:00
Alok Tiwari
eb74ae2f87 net: dsa: ks8995: Fix incorrect OF match table name
The driver declares an OF match table named ks8895_spi_of_match, even
though it describes compatible strings for the KS8995 and related Micrel
switches. This is a leftover typo; the correct name should match the
chip family handled by this driver (ks8995) and also match the variable
used in spi_driver.of_match_table.

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://patch.msgid.link/20251117095356.2099772-1-alok.a.tiwari@oracle.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:22:04 -08:00
Thorsten Blum
efb238160e kcm: Fix typo and add hyphen in Kconfig help text
s/connectons/connections/ and s/message based/message-based/

Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Link: https://patch.msgid.link/20251116135616.106079-2-thorsten.blum@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:21:04 -08:00
Kuniyuki Iwashima
932478bf9f tcp: Don't reinitialise tw->tw_transparent in tcp_time_wait().
tw->tw_transparent is initialised twice in inet_twsk_alloc()
and tcp_time_wait().

Let's remove the latter.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20251118000445.4091280-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 18:00:38 -08:00
Jakub Kicinski
6785aa9d20 Merge tag 'ipsec-next-2025-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next
Steffen Klassert says:

====================
pull request (net-next): ipsec-next 2025-11-18

1) Relax a lock contention bottleneck to improve IPsec crypto
   offload performance. From Jianbo Liu.

2) Deprecate pfkey, the interface will be removed in 2027.

3) Update xfrm documentation and move it under ipsec maintenance.
   From Bagas Sanjaya.

* tag 'ipsec-next-2025-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next:
  MAINTAINERS: Add entry for XFRM documentation
  net: Move XFRM documentation into its own subdirectory
  Documentation: xfrm_sync: Number the fifth section
  Documentation: xfrm_sysctl: Trim trailing colon in section heading
  Documentation: xfrm_sync: Trim excess section heading characters
  Documentation: xfrm_sync: Properly reindent list text
  Documentation: xfrm_device: Separate hardware offload sublists
  Documentation: xfrm_device: Use numbered list for offloading steps
  Documentation: xfrm_device: Wrap iproute2 snippets in literal code block
  pfkey: Deprecate pfkey
  xfrm: Skip redundant replay recheck for the hardware offload path
  xfrm: Refactor xfrm_input lock to reduce contention with RSS
====================

Link: https://patch.msgid.link/20251118092610.2223552-1-steffen.klassert@secunet.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 17:55:41 -08:00
Pierre-Henry Moussay
f4e3402f59 dt-bindings: net: cdns,macb: Add pic64gx compatibility
The pic64gx uses an integration of the macb IP identical to the one in mpfs.

Signed-off-by: Pierre-Henry Moussay <pierre-henry.moussay@microchip.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Acked-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://patch.msgid.link/20251117-easter-machine-37851f20aaf3@spud
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 17:37:41 -08:00
Donald Hunter
6770eaad75 tools: ynltool: ignore *.d deps files
Add *.d to gitignore for ynltool

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20251117143155.44806-1-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-18 17:35:52 -08:00
Alexei Starovoitov
d6ec0906d6 Merge branch 'replace-bpf-memory-allocator-with-kmalloc_nolock-in-local-storage'
Amery Hung says:

====================
Replace BPF memory allocator with kmalloc_nolock() in local storage

This patchset tries to simplify bpf_local_storage.c by adopting
kmalloc_nolock(). This removes memory preallocation and reduces the
dependency on smap in bpf_selem_free() and bpf_local_storage_free().
The latter will simplify a future refactor that replaces
local_storage->lock and b->lock [1].

RFC v1 tried to switch to kmalloc_nolock() unconditionally. However,
as there is substantial performance loss in socket local storage due to
1) defer_free() in kfree_nolock() and 2) no kfree_rcu() batching,
replacing kzalloc() is postponed until necessary improvements in mm
land.

Benchmark

./bench -p 1 local-storage-create --storage-type <socket,task> \
  --batch-size <16,32,64>

The benchmark is a microbenchmark stress-testing how fast local storage
can be created. For task local storage, switching from BPF memory
allocator to kmalloc_nolock() yields a small amount of improvement. For
socket local storage, it remains roughly the same as nothing has changed.

Socket local storage
memory alloc     batch  creation speed              creation speed diff
---------------  ----   ------------------                         ----
kzalloc           16    144.149 ± 0.642k/s  3.10 kmallocs/create
(before)          32    144.379 ± 1.070k/s  3.08 kmallocs/create
                  64    144.491 ± 0.818k/s  3.13 kmallocs/create

kzalloc           16    146.180 ± 1.403k/s  3.10 kmallocs/create  +1.4%
(not changed)     32    146.245 ± 1.272k/s  3.10 kmallocs/create  +1.3%
                  64    145.012 ± 1.545k/s  3.10 kmallocs/create  +0.4%

Task local storage
memory alloc     batch  creation speed              creation speed diff
---------------  ----   ------------------                         ----
BPF memory        16     24.668 ± 0.121k/s  2.54 kmallocs/create
allocator         32     22.899 ± 0.097k/s  2.67 kmallocs/create
(before)          64     22.559 ± 0.076k/s  2.56 kmallocs/create

kmalloc_nolock    16     25.796 ± 0.059k/s  2.52 kmallocs/create  +4.6%
(after)           32     23.412 ± 0.069k/s  2.50 kmallocs/create  +2.2%
                  64     23.717 ± 0.108k/s  2.60 kmallocs/create  +5.1%

[1] https://lore.kernel.org/bpf/20251002225356.1505480-1-ameryhung@gmail.com/

v1 -> v2
  - Only replace BPF memory allocator with kmalloc_nolock()
  Link: https://lore.kernel.org/bpf/20251112175939.2365295-1-ameryhung@gmail.com/
====================

Link: https://patch.msgid.link/20251114201329.3275875-1-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-18 16:20:35 -08:00
Amery Hung
f484f4a3e0 bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage
Replace bpf memory allocator with kmalloc_nolock() to reduce memory
wastage due to preallocation.

In bpf_selem_free(), an selem now needs to wait for a RCU grace period
before being freed when reuse_now == true. Therefore, rcu_barrier()
should always be called in bpf_local_storage_map_free().

In bpf_local_storage_free(), since smap->storage_ma is no longer needed
to return the memory, the function is now independent from smap.

Remove the outdated comment in bpf_local_storage_alloc(). We already
free selem after an RCU grace period in bpf_local_storage_update() when
bpf_local_storage_alloc() failed the cmpxchg since commit c0d63f3091
("bpf: Add bpf_selem_free()").

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20251114201329.3275875-5-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-18 16:20:25 -08:00
Amery Hung
39a460c425 bpf: Save memory allocation info in bpf_local_storage
Save the memory allocation method used for bpf_local_storage in the
struct explicitly so that we don't need to go through the hassle of
finding out that info. When a later patch replaces the BPF memory
allocator with kmalloc_nolock(), bpf_local_storage_free() will no longer
need smap->storage_ma to return the memory, which completely removes the
dependency on smap in bpf_local_storage_free().

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20251114201329.3275875-4-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-18 16:20:25 -08:00
Amery Hung
e76a33e1c7 bpf: Remove smap argument from bpf_selem_free()
Since selem already saves a pointer to smap, use it instead of an
additional argument in bpf_selem_free(). This requires moving the
SDATA(selem)->smap assignment from bpf_selem_link_map() to
bpf_selem_alloc() since bpf_selem_free() may be called without the
selem being linked to smap in bpf_local_storage_update().

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20251114201329.3275875-3-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-18 16:20:25 -08:00
Amery Hung
0e854e5535 bpf: Always charge/uncharge memory when allocating/unlinking storage elements
Since commit a96a44aba5 ("bpf: bpf_sk_storage: Fix invalid wait
context lockdep report"), {charge,uncharge}_mem are always true when
allocating a bpf_local_storage_elem or unlinking a bpf_local_storage_elem
from local storage, so drop these arguments. No functional change.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20251114201329.3275875-2-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-18 16:20:25 -08:00
Viacheslav Dubeyko
24e17a29cf hfsplus: fix volume corruption issue for generic/073
The xfstests' test-case generic/073 leaves HFS+ volume
in corrupted state:

sudo ./check generic/073
FSTYP -- hfsplus
PLATFORM -- Linux/x86_64 hfsplus-testing-0001 6.17.0-rc1+ #4 SMP PREEMPT_DYNAMIC Wed Oct 1 15:02:44 PDT 2025
MKFS_OPTIONS -- /dev/loop51
MOUNT_OPTIONS -- /dev/loop51 /mnt/scratch

generic/073 _check_generic_filesystem: filesystem on /dev/loop51 is inconsistent
(see XFSTESTS-2/xfstests-dev/results//generic/073.full for details)

Ran: generic/073
Failures: generic/073
Failed 1 of 1 tests

sudo fsck.hfsplus -d /dev/loop51
** /dev/loop51
Using cacheBlockSize=32K cacheTotalBlock=1024 cacheSize=32768K.
Executing fsck_hfs (version 540.1-Linux).
** Checking non-journaled HFS Plus Volume.
The volume name is untitled
** Checking extents overflow file.
** Checking catalog file.
** Checking multi-linked files.
** Checking catalog hierarchy.
Invalid directory item count
(It should be 1 instead of 0)
** Checking extended attributes file.
** Checking volume bitmap.
** Checking volume information.
Verify Status: VIStat = 0x0000, ABTStat = 0x0000 EBTStat = 0x0000
CBTStat = 0x0000 CatStat = 0x00004000
** Repairing volume.
** Rechecking volume.
** Checking non-journaled HFS Plus Volume.
The volume name is untitled
** Checking extents overflow file.
** Checking catalog file.
** Checking multi-linked files.
** Checking catalog hierarchy.
** Checking extended attributes file.
** Checking volume bitmap.
** Checking volume information.
** The volume untitled was repaired successfully.

The test is doing these steps on final phase:

mv $SCRATCH_MNT/testdir_1/bar $SCRATCH_MNT/testdir_2/bar
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir_1
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo

So, we move file bar from the testdir_1 folder into testdir_2. This means that
the HFS+ logic decrements the number of entries in testdir_1 and increments the
number of entries in testdir_2. Finally, we fsync only testdir_1 and foo but
not testdir_2. As a result, fsck.hfsplus detects the volume corruption
afterwards.

This patch fixes the issue by adding hfsplus_cat_write_inode() calls
for old_dir and new_dir in hfsplus_rename() after hfsplus_rename_cat()
completes successfully. hfsplus_rename() modifies the in-core inode
objects for old_dir and new_dir but does not save these modifications
in the Catalog File's entries; it was expected that
hfsplus_write_inode() would save them afterwards. However, because
generic/073 only fsyncs testdir_1 and foo, the testdir_2 modification
was never saved into its Catalog File entry, and the entry was flushed
without it, which is exactly what fsck.hfsplus detects. Now,
hfsplus_rename() stores all modified entries in the Catalog File, and
the correct state of the Catalog File is flushed during the
hfsplus_file_fsync() call. Finally, this makes fsck.hfsplus happy.

sudo ./check generic/073
FSTYP         -- hfsplus
PLATFORM      -- Linux/x86_64 hfsplus-testing-0001 6.18.0-rc3+ #93 SMP PREEMPT_DYNAMIC Wed Nov 12 14:37:49 PST 2025
MKFS_OPTIONS  -- /dev/loop51
MOUNT_OPTIONS -- /dev/loop51 /mnt/scratch

generic/073 32s ...  32s
Ran: generic/073
Passed all 1 tests

Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
cc: Yangtao Li <frank.li@vivo.com>
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20251112232522.814038-1-slava@dubeyko.com
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
2025-11-18 16:02:35 -08:00
Tetsuo Handa
005d4b0d33 hfsplus: Verify inode mode when loading from disk
syzbot is reporting that S_IFMT bits of inode->i_mode can become bogus when
the S_IFMT bits of the 16bits "mode" field loaded from disk are corrupted.

According to [1], the permissions field was treated as reserved in Mac OS
8 and 9. According to [2], the reserved field was explicitly initialized
with 0, and that field must remain 0 as long as reserved. Therefore, when
the "mode" field is not 0 (i.e. no longer reserved), the file must be
S_IFDIR if dir == 1, and the file must be one of S_IFREG/S_IFLNK/S_IFCHR/
S_IFBLK/S_IFIFO/S_IFSOCK if dir == 0.
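
A self-contained sketch of that validation rule (the helper name is hypothetical, not the one added by the patch):

```
/* Hypothetical helper: a non-zero on-disk mode must agree with the
 * catalog record type (directory vs. file). */
static bool hfsplus_mode_is_sane(umode_t mode, bool is_dir)
{
	if (mode == 0)
		return true;	/* field still treated as reserved */
	if (is_dir)
		return S_ISDIR(mode);
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFLNK:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
		return true;
	default:
		return false;
	}
}
```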

Reported-by: syzbot <syzbot+895c23f6917da440ed0d@syzkaller.appspotmail.com>
Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d
Link: https://developer.apple.com/library/archive/technotes/tn/tn1150.html#HFSPlusPermissions [1]
Link: https://developer.apple.com/library/archive/technotes/tn/tn1150.html#ReservedAndPadFields [2]
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Viacheslav Dubeyko <slava@dubeyko.com>
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
Link: https://lore.kernel.org/r/04ded9f9-73fb-496c-bfa5-89c4f5d1d7bb@I-love.SAKURA.ne.jp
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
2025-11-18 16:01:05 -08:00
Chengkaitao
8e1d91c258 block: remove the declaration of elevator_init_mq function
In commit 1e44bedbc9 ("block: unifying elevator change"), the
elevator_init_mq function was deleted, but its declaration in elevator.h
was overlooked. This patch fixes it.

Signed-off-by: Chengkaitao <chengkaitao@kylinos.cn>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 16:27:19 -07:00
Hoyeon Lee
ec12ab2cda selftests/bpf: Replace TCP CC string comparisons with bpf_strncmp
The connect4_prog and bpf_iter_setsockopt tests duplicate the same
open-coded TCP congestion control string comparison logic. Since
bpf_strncmp() provides the same functionality, use it instead to
avoid repeated open-coded loops.

This change applies only to functional BPF tests and does not affect
the verifier performance benchmarks (veristat.cfg). No functional
changes intended.

Reviewed-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Hoyeon Lee <hoyeon.lee@suse.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251115225550.1086693-5-hoyeon.lee@suse.com
2025-11-18 14:57:53 -08:00
Hoyeon Lee
f700b37314 selftests/bpf: Move common TCP helpers into bpf_tracing_net.h
Some BPF selftests contain identical copies of the min(), max(),
before(), and after() helpers. These repeated snippets are the same
across the tests and do not need to be defined separately.

Move these helpers into bpf_tracing_net.h so they can be shared by
TCP related BPF programs. This removes repeated code and keeps the
helpers in a single place.

Reviewed-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Hoyeon Lee <hoyeon.lee@suse.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251115225550.1086693-4-hoyeon.lee@suse.com
2025-11-18 14:57:45 -08:00
Jens Axboe
caebce24f6 Revert "block: consider discard merge last"
This reverts commit 2516c246d0.

Suspected issues with discard merging post this patch, hence revert
it for now.

Link: https://lore.kernel.org/linux-block/26acdfdf-de13-430b-8c73-f890c7689a84@kernel.dk/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 15:00:12 -07:00
Bart Van Assche
935a20d1be block: Remove queue freezing from several sysfs store callbacks
Freezing the request queue from inside sysfs store callbacks may cause a
deadlock in combination with the dm-multipath driver and the
queue_if_no_path option. Additionally, freezing the request queue slows
down system boot on systems where sysfs attributes are set synchronously.

Fix this by removing the blk_mq_freeze_queue() / blk_mq_unfreeze_queue()
calls from the store callbacks that do not strictly need these callbacks.
Add the __data_racy annotation to request_queue.rq_timeout to suppress
KCSAN data race reports about the rq_timeout reads.

This patch may cause a small delay in applying the new settings.

For all the attributes affected by this patch, I/O will complete
correctly whether the old or the new value of the attribute is used.

This patch affects the following sysfs attributes:
* io_poll_delay
* io_timeout
* nomerges
* read_ahead_kb
* rq_affinity

Here is an example of a deadlock triggered by running test srp/002
if this patch is not applied:

task:multipathd
Call Trace:
 <TASK>
 __schedule+0x8c1/0x1bf0
 schedule+0xdd/0x270
 schedule_preempt_disabled+0x1c/0x30
 __mutex_lock+0xb89/0x1650
 mutex_lock_nested+0x1f/0x30
 dm_table_set_restrictions+0x823/0xdf0
 __bind+0x166/0x590
 dm_swap_table+0x2a7/0x490
 do_resume+0x1b1/0x610
 dev_suspend+0x55/0x1a0
 ctl_ioctl+0x3a5/0x7e0
 dm_ctl_ioctl+0x12/0x20
 __x64_sys_ioctl+0x127/0x1a0
 x64_sys_call+0xe2b/0x17d0
 do_syscall_64+0x96/0x3a0
 entry_SYSCALL_64_after_hwframe+0x4b/0x53
 </TASK>
task:(udev-worker)
Call Trace:
 <TASK>
 __schedule+0x8c1/0x1bf0
 schedule+0xdd/0x270
 blk_mq_freeze_queue_wait+0xf2/0x140
 blk_mq_freeze_queue_nomemsave+0x23/0x30
 queue_ra_store+0x14e/0x290
 queue_attr_store+0x23e/0x2c0
 sysfs_kf_write+0xde/0x140
 kernfs_fop_write_iter+0x3b2/0x630
 vfs_write+0x4fd/0x1390
 ksys_write+0xfd/0x230
 __x64_sys_write+0x76/0xc0
 x64_sys_call+0x276/0x17d0
 do_syscall_64+0x96/0x3a0
 entry_SYSCALL_64_after_hwframe+0x4b/0x53
 </TASK>

Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Nilay Shroff <nilay@linux.ibm.com>
Cc: Martin Wilck <mwilck@suse.com>
Cc: Benjamin Marzinski <bmarzins@redhat.com>
Cc: stable@vger.kernel.org
Fixes: af28141498 ("block: freeze the queue in queue_attr_store")
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 15:00:11 -07:00
Bart Van Assche
42adb2d4ef fs: Add the __data_racy annotation to backing_dev_info.ra_pages
Some but not all .ra_pages changes happen while block layer I/O is paused
with blk_mq_freeze_queue(). Filesystems may read .ra_pages even while
block layer I/O is paused, e.g. from inside their .fadvise callback.
Annotating all .ra_pages reads with READ_ONCE() would be cumbersome.
Hence, add the __data_racy annotation to the .ra_pages member
variable.
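
For reference, the resulting annotation looks roughly like this (other backing_dev_info fields are omitted; the qualifier documents that concurrent plain accesses to this member are expected):

```
struct backing_dev_info {
	unsigned long __data_racy ra_pages;	/* max readahead in PAGE_SIZE units */
	/* ... remaining fields unchanged ... */
};
```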

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 15:00:11 -07:00
Xue He
152c331bcd block: plug attempts to batch allocate tags multiple times
This patch enables batch allocation of sufficient tags for a batched IO
submission using the plug mechanism, thereby avoiding frequent
individual tag allocations when the initial allocation is
insufficient.
-----------------------------------------------------------
HW:
16 CPUs/16 poll queues
Disk: Samsung PM9A3 Gen4 3.84T

CMD:
[global]
ioengine=io_uring
group_reporting=1
time_based=1
runtime=1m
refill_buffers=1
norandommap=1
randrepeat=0
fixedbufs=1
registerfiles=1
rw=randread
iodepth=128
iodepth_batch_submit=32
iodepth_batch_complete_min=32
iodepth_batch_complete_max=128
iodepth_low=32
bs=4k
numjobs=1
direct=1
hipri=1

[job1]
filename=/dev/nvme0n1
name=batch_test
------------------------------------------------------------
Perf:
base code: __blk_mq_alloc_requests() 1.47%
patch: __blk_mq_alloc_requests() 0.75%
------------------------------------------------------------

Signed-off-by: hexue <xue01.he@samsung.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 14:59:41 -07:00
Paolo Abeni
a057e8e4ac Merge branch 'gve-implement-xdp-hw-rx-timestamping-support-for-dq'
Tim Hostetler says:

====================
gve: Implement XDP HW RX Timestamping support for DQ

From: Tim Hostetler <thostet@google.com>

This patch series adds support for bpf_xdp_metadata_rx_timestamp from an
XDP program loaded into the driver on its own or bound to an XSK. This
is only supported for DQ.
====================

Link: https://patch.msgid.link/20251114211146.292068-1-joshwash@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-18 15:52:45 +01:00
Tim Hostetler
1b42e07af1 gve: Add Rx HWTS metadata to AF_XDP ZC mode
By overlaying the struct gve_xdp_buff on top of the struct xdp_buff_xsk
that AF_XDP utilizes, the driver records the 32 bit timestamp via the
completion descriptor and the cached 64 bit NIC timestamp via gve_priv.

The driver's implementation of xmo_rx_timestamp extends the timestamp to
the full and up to date 64 bit timestamp and returns it to the user.

gve_rx_xsk_dqo is modified to accept a pointer to the completion
descriptor and no longer takes a buf_len explicitly as it can be pulled
out of the descriptor.

With this patch gve now supports bpf_xdp_metadata_rx_timestamp.

Signed-off-by: Tim Hostetler <thostet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Link: https://patch.msgid.link/20251114211146.292068-5-joshwash@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-18 15:52:43 +01:00
Tim Hostetler
66adaf1021 gve: Prepare bpf_xdp_metadata_rx_timestamp support
Support populating XDP RX metadata with hardware RX timestamps. This
patch utilizes the same underlying logic to calculate hardware
timestamps as the regular RX path.

xdp_metadata_ops is registered with the net_device in a future patch.

gve_rx_calculate_hwtstamp was pulled out so as to not duplicate logic
between gve_xdp_rx_timestamp and gve_rx_hwtstamp.
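
As a reference for how such a hook is typically wired up (the callback name is taken from the text above; the rest is the generic pattern from include/net/xdp.h rather than gve's actual diff):

```
/* Sketch: expose the RX timestamp hook via xdp_metadata_ops and attach
 * it to the net_device when the ops are registered in a later patch. */
static const struct xdp_metadata_ops gve_xdp_metadata_ops = {
	.xmo_rx_timestamp = gve_xdp_rx_timestamp,
};

/* ... during netdev setup ... */
dev->xdp_metadata_ops = &gve_xdp_metadata_ops;
```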

Signed-off-by: Tim Hostetler <thostet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Link: https://patch.msgid.link/20251114211146.292068-4-joshwash@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-18 15:52:43 +01:00
Tim Hostetler
f356a66b87 gve: Wrap struct xdp_buff
RX timestamping will need to keep track of extra temporary information
per-packet. In preparation for this, introduce gve_xdp_buff to wrap the
xdp_buff. This is similar in function to stmmac_xdp_buff and
ice_xdp_buff.
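
The wrapper pattern referred to above generally looks like this (the fields beyond the embedded xdp_buff are assumptions for illustration only):

```
/* Sketch: per-packet driver context wrapped around the core xdp_buff.
 * Keeping the xdp_buff first allows a simple cast back from the pointer
 * handed to XDP metadata callbacks. */
struct gve_xdp_buff {
	struct xdp_buff xdp;
	const void *compl_desc;	/* assumed: completion descriptor */
	u64 nic_ts;		/* assumed: cached NIC timestamp */
};
```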

Signed-off-by: Tim Hostetler <thostet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Link: https://patch.msgid.link/20251114211146.292068-3-joshwash@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-18 15:52:42 +01:00
Tim Hostetler
46e7860ef9 gve: Move ptp_schedule_worker to gve_init_clock
Previously, gve only initialized the ptp aux work when hardware
timestamping was initialized through ndo_hwtstamp_set. As this
patch series introduces XDP hardware timestamp metadata which will
require the ptp aux work, the work can't be gated on the
kernel_hwtstamp_config being set and must be initialized elsewhere.

For simplicity, ptp_schedule_worker is invoked right after the ptp_clock
is registered with the kernel (which happens during gve_probe or
following reset). The worker is scheduled in GVE_NIC_TS_SYNC_INTERVAL_MS
as the synchronous call to gve_clock_nic_ts_read makes the worker
redundant if scheduled immediately.

If gve cannot read the device clock immediately, it errors out of
gve_init_clock.

Signed-off-by: Tim Hostetler <thostet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Link: https://patch.msgid.link/20251114211146.292068-2-joshwash@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-18 15:52:42 +01:00
Horatiu Vultur
19f1d6c723 net: phy: micrel: lan8814: Enable in-band auto-negotiation
The lan8814 supports two interfaces towards the host (QSGMII and QUSGMII).
Currently the lan8814 disables the auto-negotiation towards the host
side. So, extend this to allow to configure to use in-band
auto-negotiation.
I have tested this only with the QSGMII interface.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251114084224.3268928-1-horatiu.vultur@microchip.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-18 15:17:50 +01:00
Jens Axboe
f43fdeb9a3 Merge branch 'loop-aio-nowait' into for-6.19/block
Merge async IO IOCB_NOWAIT support from Ming:

"This patchset improves loop aio perf by using IOCB_NOWAIT for avoiding
 to queue aio command to workqueue context, meantime refactor
 lo_rw_aio() a bit.

 In my test VM, loop disk perf becomes very close to perf of the backing
 block device (nvme/mq virtio-scsi).

 And Mikulas verified that this way can improve 12-job sequential
 readwrite io by ~5X, and basically solves the reported problem together
 with the loop MQ change.

 https://lore.kernel.org/linux-block/a8e5c76a-231f-07d1-a394-847de930f638@redhat.com/

 Zhaoyang Huang also mentioned it may fix their performance issue in an
 Android use case.

 The loop MQ change will be posted as standalone patch, because it needs
 UAPI change."

Link: https://lore.kernel.org/linux-block/20251015110735.1361261-1-ming.lei@redhat.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* loop-aio-nowait:
  loop: add hint for handling aio via IOCB_NOWAIT
  loop: try to handle loop aio command via NOWAIT IO first
  loop: move command blkcg/memcg initialization into loop_queue_work
  loop: add lo_submit_rw_aio()
  loop: add helper lo_rw_aio_prep()
  loop: add helper lo_cmd_nr_bvec()
2025-11-18 06:49:59 -07:00
Ming Lei
837ed30396 loop: add hint for handling aio via IOCB_NOWAIT
Add a hint for using IOCB_NOWAIT to handle loop aio commands, to avoid
causing write (especially randwrite) perf regressions on sparse-backed files.

Try IOCB_NOWAIT in the following situations:

- backing file is block device

OR

- READ aio command

OR

- there aren't any queued blocking async WRITEs, because NOWAIT won't cause
contention with a blocking WRITE, which often implies an exclusive lock

With this simple policy, the randwrite/write perf regression on sparse
backing files is fixed.
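
A rough sketch of that decision, with hypothetical helper and field names (the real patch encodes the policy differently):

```
/* Sketch only: decide whether an aio command is worth trying with
 * IOCB_NOWAIT before falling back to the workqueue; the
 * blocking_async_writes counter is a hypothetical stand-in for however
 * queued blocking WRITEs are tracked. */
static bool lo_can_try_nowait(struct loop_device *lo, struct request *rq)
{
	struct inode *inode = file_inode(lo->lo_backing_file);

	if (S_ISBLK(inode->i_mode))
		return true;		/* backing file is a block device */
	if (req_op(rq) == REQ_OP_READ)
		return true;		/* reads don't contend with blocking writes */
	return !atomic_read(&lo->blocking_async_writes);
}
```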

Link: https://lore.kernel.org/dm-devel/7d6ae2c9-df8e-50d0-7ad6-b787cb3cfab4@redhat.com/
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 06:49:52 -07:00
Ming Lei
0ba93a906d loop: try to handle loop aio command via NOWAIT IO first
Try to handle the loop aio command via NOWAIT IO first, so we can avoid
queueing the aio command to the workqueue. This is usually a big win
when the FS block mapping is stable; Mikulas verified [1] that this
improves IO perf by close to 5X in a 12-job sequential read/write test,
in which the FS block mapping is indeed stable.

Fall back to the workqueue in case of -EAGAIN. This may add a small cost
for the first retry, but when running the following write test over
loop/sparse_file, the actual effect on randwrite is obvious:

```
truncate -s 4G 1.img    #1.img is created on XFS/virtio-scsi
losetup -f 1.img --direct-io=on
fio --direct=1 --bs=4k --runtime=40 --time_based --numjobs=1 --ioengine=libaio \
	--iodepth=16 --group_reporting=1 --filename=/dev/loop0 -name=job --rw=$RW
```

- RW=randwrite: obvious IOPS drop observed
- RW=write: a little drop (5% - 10%)

This perf drop on randwrite over sparse file will be addressed in the
following patch.

BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
.write_iter(), which might sleep even with NOWAIT; the only effect is
that the rcu read lock is replaced with an srcu read lock.
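
The submit-then-fallback flow boils down to something like this (helper names are the ones introduced in this series, but the exact call shape is an assumption):

```
/* Sketch: try the NOWAIT path first, punt to the workqueue on -EAGAIN. */
ret = lo_submit_rw_aio(lo, cmd, nr_bvec, true /* nowait */);
if (ret == -EAGAIN)
	loop_queue_work(lo, cmd);	/* blocking path, as before */
```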

Link: https://lore.kernel.org/linux-block/a8e5c76a-231f-07d1-a394-847de930f638@redhat.com/ [1]
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 06:49:52 -07:00
Ming Lei
f4788ae9d7 loop: move command blkcg/memcg initialization into loop_queue_work
Move loop command blkcg/memcg initialization into loop_queue_work,
and prepare for supporting to handle loop io command by IOCB_NOWAIT.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 06:49:52 -07:00
Ming Lei
c66e9708f9 loop: add lo_submit_rw_aio()
Refactor lo_rw_aio() by extracting the I/O submission logic into a new
helper function lo_submit_rw_aio(). This further improves code organization
by separating the I/O preparation, submission, and completion handling into
distinct phases.

Prepare for using NOWAIT to improve loop performance.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 06:49:51 -07:00
Ming Lei
fd858d1ca9 loop: add helper lo_rw_aio_prep()
Add helper lo_rw_aio_prep() to separate the preparation phase (setting up bio
vectors and initializing the iocb structure) from the actual I/O execution
in the loop block driver.

Prepare for using NOWAIT to improve loop performance.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 06:49:51 -07:00
Ming Lei
c3e6c11147 loop: add helper lo_cmd_nr_bvec()
Add lo_cmd_nr_bvec() and prepare for refactoring lo_rw_aio().

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 06:49:51 -07:00
Sukrut Heroorkar
2c6d792d4b drbd: turn bitmap I/O comments into regular block comments
W=1 build warns because the bitmap I/O comments use '/**', which
marks them as kernel-doc comments even though these functions do not
document an external API.

Convert these comments to regular block comments so kernel-doc no
longer parses them.

Signed-off-by: Sukrut Heroorkar <hsukrut3@gmail.com>
Acked-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-18 06:48:33 -07:00
Konstantin Komarov
bcbb8d0afd fs/ntfs3: change the default mount options for "acl" and "prealloc"
Switch the "acl" and "prealloc" mount parameters to fsparam_flag_no(),
making them enabled by default and allowing users to disable them with
"noacl" and "noprealloc".

Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-18 13:56:28 +01:00
Edward Adam Davis
ccc4e86d1c fs/ntfs3: Prevent memory leaks in add sub record
If an rb node with the same ino already exists in the rb tree, the newly
allocated mft_inode in ni_add_subrecord() will not have its memory cleaned
up, which leads to the memory leak reported by syzbot.

The best option to avoid this issue is to put the newly allocated mft node
when an rb node with the same ino already exists in the rb tree, and return
the rb node found in the tree to the parent layer.

syzbot reported:
BUG: memory leak
unreferenced object 0xffff888110bef280 (size 128):
  backtrace (crc 126a088f):
    ni_add_subrecord+0x31/0x180 fs/ntfs3/frecord.c:317
    ntfs_look_free_mft+0xf0/0x790 fs/ntfs3/fsntfs.c:715

BUG: memory leak
unreferenced object 0xffff888109093400 (size 1024):
  backtrace (crc 7197c55e):
    mi_init+0x2b/0x50 fs/ntfs3/record.c:105
    mi_format_new+0x40/0x220 fs/ntfs3/record.c:422

Fixes: 4342306f0f ("fs/ntfs3: Add file operations and implementation")
Reported-by: syzbot+3932ccb896e06f7414c9@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-18 13:56:27 +01:00
Edward Adam Davis
4d78d1173a fs/ntfs3: out1 also needs to put mi
After ntfs_look_free_mft() executes successfully, any subsequent code
path that fails must put mi.

Fixes: 4342306f0f ("fs/ntfs3: Add file operations and implementation")
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-18 13:56:12 +01:00
Sunday Adelodun
45a1cd8346 selftests: af_unix: Add tests for ECONNRESET and EOF semantics
Add selftests to verify and document Linux’s intended behaviour for
UNIX domain sockets (SOCK_STREAM and SOCK_DGRAM) when a peer closes.
The tests verify that:

 1. SOCK_STREAM returns EOF when the peer closes normally.
 2. SOCK_STREAM returns ECONNRESET if the peer closes with unread data.
 3. SOCK_SEQPACKET returns EOF when the peer closes normally.
 4. SOCK_SEQPACKET returns ECONNRESET if the peer closes with unread data.
 5. SOCK_DGRAM does not return ECONNRESET when the peer closes.

This follows up on review feedback suggesting a selftest to clarify
Linux’s semantics.
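
For example, case 2 above can be demonstrated with a few lines of standalone userspace C (an illustration, not the selftest itself):

```
#include <assert.h>
#include <errno.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int sk[2];
	char c = 'x', buf;

	assert(socketpair(AF_UNIX, SOCK_STREAM, 0, sk) == 0);
	assert(write(sk[0], &c, 1) == 1);	/* sk[1] now has unread data */
	close(sk[1]);				/* peer closes without reading it */
	assert(read(sk[0], &buf, 1) == -1 && errno == ECONNRESET);
	return 0;
}
```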

Suggested-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Sunday Adelodun <adelodunolaoluwa@yahoo.com>
Link: https://patch.msgid.link/20251113112802.44657-1-adelodunolaoluwa@yahoo.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-18 11:33:17 +01:00
Paolo Abeni
4e1b0afa61 Merge branch 'net-stmmac-disable-eee-rx-clock-stop-when-vlan-is-enabled'
Ovidiu Panait says:

====================
net: stmmac: Disable EEE RX clock stop when VLAN is enabled

This series fixes a couple of VLAN issues observed on the Renesas RZ/V2H
EVK platform (stmmac + Microchip KSZ9131RNXI PHY):

- The first patch fixes a bug where VLAN ID 0 would not be properly removed
due to how vlan_del_hw_rx_fltr() matched entries in the VLAN filter table.

- The second patch addresses RX clock gating issues that occur during VLAN
creation and deletion when EEE is enabled with RX clock-stop active (the
default configuration). For example:

    # ip link add link end1 name end1.5 type vlan id 5
    15c40000.ethernet end1: Timeout accessing MAC_VLAN_Tag_Filter
    RTNETLINK answers: Device or resource busy

The stmmac hardware requires the receive clock to be running when writing
certain registers, including VLAN registers. However, by default the driver
enables Energy Efficient Ethernet (EEE) and allows the PHY to stop the
receive clock when the link is idle. As a result, the RX clock might be
stopped when attempting to access these registers, leading to timeouts.

A more comprehensive overview of receive clock related issues in the
stmmac driver can be found here:
https://lore.kernel.org/all/Z9ySeo61VYTClIJJ@shell.armlinux.org.uk/

Most of the issues were resolved by commit dd557266cf ("net: stmmac:
block PHY RXC clock-stop"), which wraps register accesses with
phylink_rx_clk_stop_block()/unblock() calls. However, VLAN add/delete
operations are invoked with bottom halves disabled, where sleeping is
not permitted, so those helpers cannot be used.

To avoid these VLAN timeouts, the second patch disables the EEE RX
clock-stop feature when VLAN support is enabled. This ensures the receive
clock remains active, allowing VLAN operations to complete successfully.
====================

Link: https://patch.msgid.link/20251113112721.70500-1-ovidiu.panait.rb@renesas.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-18 10:43:45 +01:00
Ovidiu Panait
c171e679ee net: stmmac: Disable EEE RX clock stop when VLAN is enabled
On the Renesas RZ/V2H EVK platform, where the stmmac MAC is connected to a
Microchip KSZ9131RNXI PHY, creating or deleting VLAN interfaces may fail
with timeouts:

    # ip link add link end1 name end1.5 type vlan id 5
    15c40000.ethernet end1: Timeout accessing MAC_VLAN_Tag_Filter
    RTNETLINK answers: Device or resource busy

Disabling EEE at runtime avoids the problem:

    # ethtool --set-eee end1 eee off
    # ip link add link end1 name end1.5 type vlan id 5
    # ip link del end1.5

The stmmac hardware requires the receive clock to be running when writing
certain registers, such as those used for MAC address configuration or
VLAN filtering. However, by default the driver enables Energy Efficient
Ethernet (EEE) and allows the PHY to stop the receive clock when the link
is idle. As a result, the RX clock might be stopped when attempting to
access these registers, leading to timeouts and other issues.

Commit dd557266cf ("net: stmmac: block PHY RXC clock-stop")
addressed this issue for most register accesses by wrapping them in
phylink_rx_clk_stop_block()/phylink_rx_clk_stop_unblock() calls.
However, VLAN add/delete operations may be invoked with bottom halves
disabled, where sleeping is not allowed, so using these helpers is not
possible.

Therefore, to fix this, disable the RX clock stop feature in the phylink
configuration if VLAN features are set. This ensures the RX clock remains
active and register accesses succeed during VLAN operations.

Signed-off-by: Ovidiu Panait <ovidiu.panait.rb@renesas.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/20251113112721.70500-3-ovidiu.panait.rb@renesas.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-18 10:43:41 +01:00
Ovidiu Panait
d9db257236 net: stmmac: Fix VLAN 0 deletion in vlan_del_hw_rx_fltr()
When the "rx-vlan-filter" feature is enabled on a network device, the 8021q
module automatically adds a VLAN 0 hardware filter when the device is
brought administratively up.

For stmmac, this causes vlan_add_hw_rx_fltr() to create a new entry for
VID 0 in the mac_device_info->vlan_filter array, in the following format:

    VLAN_TAG_DATA_ETV | VLAN_TAG_DATA_VEN | vid

Here, VLAN_TAG_DATA_VEN indicates that the hardware filter is enabled for
that VID.

However, on the delete path, vlan_del_hw_rx_fltr() searches the vlan_filter
array by VID only, without verifying whether a VLAN entry is enabled. As a
result, when the 8021q module attempts to remove VLAN 0, the function may
mistakenly match a zero-initialized slot rather than the actual VLAN 0
entry, causing incorrect deletions and leaving stale entries in the
hardware table.

Fix this by verifying that the VLAN entry's enable bit (VLAN_TAG_DATA_VEN)
is set before matching and deleting by VID. This ensures only active VLAN
entries are removed and avoids leaving stale entries in the VLAN filter
table, particularly for VLAN ID 0.
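
Conceptually, the delete-path match becomes the following (the loop shape and the vid_mask name are assumptions; VLAN_TAG_DATA_VEN is the enable bit named above):

```
/* Sketch: only enabled entries may match the VID being removed, so a
 * zero-initialized slot can no longer be mistaken for VLAN 0. */
for (i = 0; i < hw->num_vlan; i++) {
	u32 val = hw->vlan_filter[i];

	if (!(val & VLAN_TAG_DATA_VEN))
		continue;
	if ((val & vid_mask) == vid) {
		/* clear the hardware filter slot and the cached entry */
	}
}
```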

Fixes: ed64639bc1 ("net: stmmac: Add support for VLAN Rx filtering")
Signed-off-by: Ovidiu Panait <ovidiu.panait.rb@renesas.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251113112721.70500-2-ovidiu.panait.rb@renesas.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-18 10:43:41 +01:00
Christoph Hellwig
9b0305968d xfs: remove the unused bv field in struct xfs_gc_bio
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-18 09:18:59 +01:00
Pu Lehui
7dc211c115 bpf: Fix invalid prog->stats access when update_effective_progs fails
Syzkaller triggers an invalid memory access issue following fault
injection in update_effective_progs. The issue can be described as
follows:

__cgroup_bpf_detach
  update_effective_progs
    compute_effective_progs
      bpf_prog_array_alloc <-- fault inject
  purge_effective_progs
    /* change to dummy_bpf_prog */
    array->items[index] = &dummy_bpf_prog.prog

---softirq start---
__do_softirq
  ...
    __cgroup_bpf_run_filter_skb
      __bpf_prog_run_save_cb
        bpf_prog_run
          stats = this_cpu_ptr(prog->stats)
          /* invalid memory access */
          flags = u64_stats_update_begin_irqsave(&stats->syncp)
---softirq end---

  static_branch_dec(&cgroup_bpf_enabled_key[atype])

The reason is that fault injection caused update_effective_progs to fail
and then changed the original prog into dummy_bpf_prog.prog in
purge_effective_progs. Then a softirq came, and accessing the members of
dummy_bpf_prog.prog in the softirq triggers invalid mem access.

To fix it, skip updating stats when stats is NULL.
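
The shape of the fix, sketched against the stats-update fast path (the surrounding code and the run_time_ns input are assumptions; the NULL check is the point):

```
/* Sketch: dummy_bpf_prog.prog has no per-cpu stats, so bail out before
 * dereferencing them. */
if (static_branch_unlikely(&bpf_stats_enabled_key) && prog->stats) {
	struct bpf_prog_stats *stats = this_cpu_ptr(prog->stats);
	unsigned long flags;

	flags = u64_stats_update_begin_irqsave(&stats->syncp);
	u64_stats_inc(&stats->cnt);
	u64_stats_add(&stats->nsecs, run_time_ns);	/* measured duration */
	u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
```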

Fixes: 492ecee892 ("bpf: enable program stats")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Link: https://lore.kernel.org/r/20251115102343.2200727-1-pulehui@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-17 22:35:51 -08:00
Jakub Kicinski
7c898b71e5 Merge branch 'dpll-zl3073x-refactor-state-management'
Ivan Vecera says:

====================
dpll: zl3073x: Refactor state management

This patch set is a refactoring of the zl3073x driver to clean up
state management, improve modularity, and significantly reduce
on-demand I/O.

The driver's dpll.c implementation previously performed on-demand
register reads and writes (wrapped in mailbox operations) to get
or set properties like frequency, phase, and embedded-sync settings.
This cluttered the DPLL logic with low-level I/O, duplicated locking,
and led to inefficient bus traffic.

This series addresses this by:
1. Splitting the monolithic 'core.c' into logical units ('ref.c',
   'out.c', 'synth.c').
2. Implementing a full read/write-back cache for 'zl3073x_ref' and
   'zl3073x_out' structures.

All state is now read once during '_state_fetch()' (and status updated
periodically). DPLL get callbacks read from this cache. Set callbacks
modify a copy of the state, which is then committed via a new
'..._state_set()' function. These '_state_set' functions compare
the new state to the cached state and write *only* the modified
register values back to the hardware, all within a single mailbox
sequence.

The result is a much cleaner 'dpll.c' that is almost entirely
free of direct register I/O, and all state logic is properly
encapsulated in its respective file.

The series is broken down as follows:

* Patch 1: Changes the state structs to store raw register values
  (e.g., 'config', 'ctrl') instead of parsed booleans, centralizing
  parsing logic into the helpers.
* Patch 2: Splits the logic from 'core.c' into new 'ref.c', 'out.c'
  and 'synth.c' files, creating a 'zl3073x_dev_...' abstraction layer.
* Patch 3: Introduces the caching concept by reading and caching
  the reference monitor status periodically, removing scattered
  reads from 'dpll.c'.
* Patch 4: Expands the 'zl3073x_ref' struct to cache *all* reference
  properties and adds 'zl3073x_ref_state_set()' to write back changes.
* Patch 5: Does the same for the 'zl3073x_out' struct, caching all
  output properties and adding 'zl3073x_out_state_set()'.
* Patch 6: A final cleanup that removes the 'zl3073x_dev_...' wrapper
  functions that became redundant after the refactoring.
====================

Link: https://patch.msgid.link/20251113074105.141379-1-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 20:23:39 -08:00
Ivan Vecera
01e0e8b6a2 dpll: zl3073x: Remove unused dev wrappers
Remove several zl3073x_dev_... inline wrapper functions from core.h
as they are no longer used by any callers.

Removed functions:
* zl3073x_dev_ref_ffo_get
* zl3073x_dev_ref_is_enabled
* zl3073x_dev_synth_dpll_get
* zl3073x_dev_synth_is_enabled
* zl3073x_dev_out_signal_format_get

This is a cleanup after recent refactoring, as the remaining callers
now fetch the state object and use the base helpers directly.

Reviewed-by: Petr Oros <poros@redhat.com>
Tested-by: Prathosh Satish <Prathosh.Satish@microchip.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Link: https://patch.msgid.link/20251113074105.141379-7-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 20:23:38 -08:00
Ivan Vecera
5fb9b0d411 dpll: zl3073x: Cache all output properties in zl3073x_out
Expand the zl3073x_out structure to cache all output-related
hardware registers, including divisors, widths, embedded-sync
parameters and phase compensation.

Modify zl3073x_out_state_fetch() to read and populate all these
new fields at once, including zero-divisor checks. Refactor all
dpll "getter" functions in dpll.c to read from this new
cached state instead of performing direct register access.

Introduce a new function, zl3073x_out_state_set(), to handle
writing changes back to the hardware. This function compares the
provided state with the current cached state and writes *only* the
modified register values via a single mailbox sequence before
updating the local cache.

Refactor all dpll "setter" functions to modify a local copy of
the output state and then call zl3073x_out_state_set() to
commit the changes.

This change centralizes all output-related register I/O into
out.c, significantly reduces bus traffic, and simplifies the logic
in dpll.c.
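
The compare-and-write-back idea can be pictured like this (register macros, field names and the I/O helper are placeholders, not the driver's real symbols):

```
/* Sketch: commit a modified output state, touching only registers whose
 * cached value actually changed, within one mailbox sequence. */
static int zl3073x_out_state_set_sketch(struct zl3073x_dev *zldev, u8 idx,
					const struct zl3073x_out *new)
{
	struct zl3073x_out *cur = &zldev->out[idx];
	int rc = 0;

	/* begin mailbox sequence (placeholder) */
	if (!rc && new->ctrl != cur->ctrl)
		rc = zl3073x_write_reg(zldev, OUT_CTRL_REG(idx), new->ctrl);
	if (!rc && new->width != cur->width)
		rc = zl3073x_write_reg(zldev, OUT_WIDTH_REG(idx), new->width);
	/* end mailbox sequence (placeholder) */

	if (!rc)
		*cur = *new;	/* refresh the cache only on success */
	return rc;
}
```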

Reviewed-by: Petr Oros <poros@redhat.com>
Tested-by: Prathosh Satish <Prathosh.Satish@microchip.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Link: https://patch.msgid.link/20251113074105.141379-6-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 20:23:37 -08:00
Ivan Vecera
5bc02b190a dpll: zl3073x: Cache all reference properties in zl3073x_ref
Expand the zl3073x_ref structure to cache all reference-related
hardware registers, including frequency components, embedded-sync
settings  and phase compensation. Previously, these registers were
read on-demand from various functions in dpll.c leading to frequent
mailbox operations.

Modify zl3073x_ref_state_fetch() to read and populate all these new
fields at once. Refactor all "getter" functions in dpll.c to read
from this new cached state instead of performing direct register
access.

Remove the standalone zl3073x_dpll_input_ref_frequency_get() helper,
as its functionality is now replaced by zl3073x_ref_freq_get() which
operates on the cached state and add a corresponding zl3073x_dev_...
wrapper.

Introduce a new function, zl3073x_ref_state_set(), to handle
writing changes back to the hardware. This function compares the
provided state with the current cached state and writes *only* the
modified register values to the device via a single mailbox sequence
before updating the local cache.

Refactor all dpll "setter" functions to modify a local copy of the
ref state and then call zl3073x_ref_state_set() to commit the changes.

As a cleanup, update callers in dpll.c that already have
a struct zl3073x_ref * to use the direct helpers instead of the
zl3073x_dev_... wrappers.

This change centralizes all reference-related register I/O into ref.c,
significantly reduces bus traffic, and simplifies the logic in dpll.c.

Reviewed-by: Petr Oros <poros@redhat.com>
Tested-by: Prathosh Satish <Prathosh.Satish@microchip.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Link: https://patch.msgid.link/20251113074105.141379-5-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 20:23:37 -08:00
Ivan Vecera
5534a8202d dpll: zl3073x: Cache reference monitor status
Instead of reading the ZL_REG_REF_MON_STATUS register every time
the reference status is needed, cache this value in the zl3073x_ref
struct.

This is achieved by:
* Adding a mon_status field to struct zl3073x_ref
* Introducing zl3073x_dev_ref_status_update() to read the status for
  all references into this new cache field
* Calling this update function from the periodic work handler
* Adding zl3073x_ref_is_status_ok() and zl3073x_dev_ref_is_status_ok()
  helpers to check the cached value
* Refactoring all callers in dpll.c to use the new
  zl3073x_dev_ref_is_status_ok() helper, removing direct register reads

This change consolidates all status register reads into a single periodic
function and reduces I/O bus traffic in dpll callbacks.

Reviewed-by: Petr Oros <poros@redhat.com>
Tested-by: Prathosh Satish <Prathosh.Satish@microchip.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Link: https://patch.msgid.link/20251113074105.141379-4-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 20:23:37 -08:00
Ivan Vecera
607f2c00c6 dpll: zl3073x: Split ref, out, and synth logic from core
Refactor the zl3073x driver by splitting the logic for input
references, outputs and synthesizers out of the monolithic
core.[ch] files.

Move the logic for each functional block into its own dedicated files:
ref.[ch], out.[ch] and synth.[ch].

Specifically:
- Move state structures (zl3073x_ref, zl3073x_out, zl3073x_synth)
  from core.h into their respective new headers
- Move state-fetching functions (..._state_fetch) from core.c to their
  new .c files
- Move the zl3073x_ref_freq_factorize helper from core.c to ref.c
- Introduce a new helper layer to decouple the core device logic from
  the state-parsing logic:
  1. Move the original inline helpers (e.g., zl3073x_ref_is_enabled)
     to the new headers (ref.h, etc.) and make them operate on a
     const struct ... * pointer.
  2. Create new zl3073x_dev_... prefixed functions in core.h
     (e.g., zl3073x_dev_ref_is_enabled) and implement these _dev_ functions
     to fetch state using a new ..._state_get() helper and then call
     the non-prefixed helper.
  3. Update all driver-internal callers (in dpll.c, prop.c, etc.) to use
     the new zl3073x_dev_... functions.

Reviewed-by: Petr Oros <poros@redhat.com>
Tested-by: Prathosh Satish <Prathosh.Satish@microchip.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Link: https://patch.msgid.link/20251113074105.141379-3-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 20:23:37 -08:00
Ivan Vecera
58fb88d30b dpll: zl3073x: Store raw register values instead of parsed state
The zl3073x_ref, zl3073x_out and zl3073x_synth structures
previously stored state that was parsed from register reads. This
included values like boolean 'enabled' flags, synthesizer selections,
and pre-calculated frequencies.

This commit refactors the state management to store the raw register
values directly in these structures. The various inline helper functions
are updated to parse these raw values on-demand using FIELD_GET.

Reviewed-by: Petr Oros <poros@redhat.com>
Tested-by: Prathosh Satish <Prathosh.Satish@microchip.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Link: https://patch.msgid.link/20251113074105.141379-2-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 20:23:37 -08:00
Jakub Kicinski
5b8b343c5c Merge branch 's390-qeth-improve-handling-of-osa-rcs'
Aswin Karuvally says:

====================
s390/qeth: Improve handling of OSA RCs

This two patch series aims to improve how return codes from OSA Express
are handled in the qeth driver.

OSA defines a number of return codes whose meaning is determined by the
issuing command, i.e. they are ambiguous. The first patch moves
definitions of all return codes including the ambiguous ones to a single
enum block to aid readability and maintainability.

The second patch implements a mechanism to interpret return codes based
on the issuing command to ensure accurate debug messages. While at it,
remove extern keyword and fix indentation for function declarations to
be in line with Linux kernel coding style.
====================

Link: https://patch.msgid.link/20251113144209.2140061-1-aswin@linux.ibm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 20:02:05 -08:00
Aswin Karuvally
53e58437b4 s390/qeth: Handle ambiguous OSA RCs in s390dbf
OSA Express defines a number of return codes whose meaning is determined
by the issuing command, making them ambiguous. The important ones are
reported as debug messages through the s390 debug feature.

The qeth driver currently does not take the issuing command into account
when interpreting the return code which sometimes leads to incorrect
debug messages.

Implement a mechanism to interpret and report these return codes
properly. While at it, remove extern keyword and fix indentation for
function declarations to be in line with Linux kernel coding style.

Suggested-by: Alexandra Winter <wintera@linux.ibm.com>
Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
Signed-off-by: Aswin Karuvally <aswin@linux.ibm.com>
Link: https://patch.msgid.link/20251113144209.2140061-3-aswin@linux.ibm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 20:02:02 -08:00
Aswin Karuvally
eef1f5ae73 s390/qeth: Move all OSA RCs to single enum
OSA Express defines a number of return codes whose meaning is
determined by the issuing command, making them ambiguous. Move
definitions of all return codes including the ambiguous ones to a single
enum block to aid readability and maintainability.

Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
Signed-off-by: Aswin Karuvally <aswin@linux.ibm.com>
Link: https://patch.msgid.link/20251113144209.2140061-2-aswin@linux.ibm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 20:02:01 -08:00
Heiner Kallweit
28c0074fd4 r8169: bail out from probe if fiber mode is detected on RTL8127AF
It was reported that on a card with RTL8127AF (SFP + DAC) link-up isn't
detected. Realtek hides the SFP behind the internal PHY, which no longer
behaves fully compliant with clause 22 in fiber mode.
Due to not having access to chip documentation, there isn't much I can
do for now. Instead of silently failing to detect link-up in fiber mode,
inform the user that fiber mode isn't supported and bail out.

The logic to detect fiber mode is borrowed from Realtek's r8127 driver.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/fab6605a-54e2-4f54-b194-11c2b9caaaa9@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:59:00 -08:00
Jakub Kicinski
e8c84b989f Merge branch 'net-stmmac-dwmac-sophgo-add-phy-interface-filter'
Inochi Amaoto says:

====================
net: stmmac: dwmac-sophgo: Add phy interface filter

As the SG2042 has an internal rx delay, the delay should be removed
when initializing the MAC, otherwise the PHY will be misconfigured.

Since this delay fix is common to other MACs, add a common helper
for it and use it to fix the SG2042.
====================

Link: https://patch.msgid.link/20251114003805.494387-1-inochiama@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:57:41 -08:00
Inochi Amaoto
db37c6e510 net: stmmac: dwmac-sophgo: Add phy interface filter
As the SG2042 has an internal rx delay, the delay should be removed
when initializing the MAC, otherwise the PHY will be misconfigured.

Fixes: 543009e2d4 ("net: stmmac: dwmac-sophgo: Add support for Sophgo SG2042 SoC")
Signed-off-by: Inochi Amaoto <inochiama@gmail.com>
Tested-by: Han Gao <rabenda.cn@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251114003805.494387-4-inochiama@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:57:38 -08:00
Inochi Amaoto
24afd7827e net: phy: Add helper for fixing RGMII PHY mode based on internal mac delay
The devicetree "phy-mode" property now indicates whether the PCB adds a
delay, which means the MAC needs to modify the PHY mode based on
whether there is an internal delay in the MAC.

This modification is similar for many ethernet drivers. To simplify the
code, define the helper phy_fix_phy_mode_for_mac_delays(speed, mac_txid,
mac_rxid) to fix the PHY mode based on whether the MAC adds internal delays.
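
As a rough sketch of the idea only (the real helper's name and signature
are as referenced above; the function and mapping below are illustrative,
not the actual implementation):

static phy_interface_t fix_rgmii_mode_for_mac_delays(phy_interface_t mode,
						     bool mac_txid,
						     bool mac_rxid)
{
	bool phy_txid = mode == PHY_INTERFACE_MODE_RGMII_ID ||
			mode == PHY_INTERFACE_MODE_RGMII_TXID;
	bool phy_rxid = mode == PHY_INTERFACE_MODE_RGMII_ID ||
			mode == PHY_INTERFACE_MODE_RGMII_RXID;

	/* if the MAC already inserts a delay, the PHY must not add it again */
	if (mac_txid)
		phy_txid = false;
	if (mac_rxid)
		phy_rxid = false;

	if (phy_txid && phy_rxid)
		return PHY_INTERFACE_MODE_RGMII_ID;
	if (phy_txid)
		return PHY_INTERFACE_MODE_RGMII_TXID;
	if (phy_rxid)
		return PHY_INTERFACE_MODE_RGMII_RXID;
	return PHY_INTERFACE_MODE_RGMII;
}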

Suggested-by: Russell King (Oracle) <linux@armlinux.org.uk>
Signed-off-by: Inochi Amaoto <inochiama@gmail.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251114003805.494387-3-inochiama@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:57:38 -08:00
Inochi Amaoto
6b1aa3c87f dt-bindings: net: sophgo,sg2044-dwmac: add phy mode restriction
As the ethernet controllers of the SG2044 and SG2042 only support
RGMII PHYs, add a phy-mode property to restrict the allowed values.

Also, since the SG2042 has an internal rx delay in its MAC, make
only "rgmii-txid" and "rgmii-id" valid for phy-mode.

Fixes: e281c48a73 ("dt-bindings: net: sophgo,sg2044-dwmac: Add support for Sophgo SG2042 dwmac")
Signed-off-by: Inochi Amaoto <inochiama@gmail.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20251114003805.494387-2-inochiama@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:57:38 -08:00
Jakub Kicinski
6e90c767d5 Merge branch 'net-mana-refactor-gf-stats-handling-and-add-rx_missed_errors-counter'
Erni Sri Satya Vennela says:

====================
net: mana: Refactor GF stats handling and add rx_missed_errors counter

Restructure mana_query_gf_stats() to operate on the per-VF mana_context,
instead of per-port statistics. Introduce mana_ethtool_hc_stats to
isolate hardware counter statistics and update the
"ethtool -S <interface>" output to expose all relevant counters while
preserving backward compatibility.

Add support for the standard rx_missed_errors counter by mapping it
to the hardware's hc_rx_discards_no_wqe metric. Refresh statistics
every 2 seconds.
====================

Link: https://patch.msgid.link/1763120599-6331-1-git-send-email-ernis@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:55:37 -08:00
Erni Sri Satya Vennela
be4f1d67ec net: mana: Add standard counter rx_missed_errors
Report standard counter stats->rx_missed_errors
using hc_rx_discards_no_wqe from the hardware.

Add a global workqueue to periodically run
mana_query_gf_stats every 2 seconds to get the latest
info in eth_stats and define a driver capability flag
to notify hardware of the periodic queries.

To avoid repeated failures and log flooding, the workqueue
is not rescheduled if mana_query_gf_stats fails with an HWC timeout
error, and the stats are reset to 0. Other errors are transient
and do not need a VF reset for recovery.
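
A minimal sketch of the periodic query pattern described above (the field
names, struct usage, and the assumption that mana_query_gf_stats() returns
an errno are illustrative only):

static void gf_stats_work_fn(struct work_struct *work)
{
	struct mana_context *ac = container_of(to_delayed_work(work),
					       struct mana_context,
					       gf_stats_work); /* hypothetical field */

	if (mana_query_gf_stats(ac) == -ETIMEDOUT) {
		/* HWC timeout: stop rescheduling to avoid log flooding */
		return;
	}

	/* refresh the cached hardware counters again in 2 seconds */
	schedule_delayed_work(&ac->gf_stats_work, 2 * HZ);
}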

Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1763120599-6331-3-git-send-email-ernis@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:52:30 -08:00
Erni Sri Satya Vennela
e275d9091c net: mana: Move hardware counter stats from per-port to per-VF context
Move hardware counter (HC) statistics from mana_port_context to
mana_context to enable sharing stats across multiple network ports
on the same MANA VF. Previously, each network port queried
hardware counters independently using MANA_QUERY_GF_STAT command
(GF = Generic Function stats from GDMA hardware), resulting in
redundant queries when multiple ports existed on the same device.

Isolate hardware counter stats by introducing mana_ethtool_hc_stats
in mana_context and update the code to ensure all stats are properly
reported via ethtool -S <interface>, maintaining consistency with
previous behavior.

Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1763120599-6331-2-git-send-email-ernis@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:52:30 -08:00
Jakub Kicinski
2f2dee1696 Merge branch 'net-stmmac-clean-up-plat_dat-allocation-initialisation'
Russell King says:

====================
net: stmmac: clean up plat_dat allocation/initialisation

This series cleans up the plat_dat allocation and initialisation,
moving common themes into the allocator.

This results in a nice saving:
  7 files changed, 53 insertions(+), 148 deletions(-)
====================

Link: https://patch.msgid.link/aRdKVMPHXlIn457m@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:48:01 -08:00
Russell King (Oracle)
6409249ccc net: stmmac: remove unnecessary .pkt_route queue initialisation
PCI drivers explicitly set .pkt_route to zero. However, as the struct
is allocated using devm_kzalloc(), all members default to zero unless
explicitly initialised. Thus, explicitly setting these to zero is
unnecessary. Remove these. This leaves only stmmac_platform.c where
this is explicitly initialised depending on DT properties.

$ grep '\.pkt_route =' *.c
dwmac-intel.c: plat->rx_queues_cfg[0].pkt_route = 0x0;
dwmac-intel.c: plat->rx_queues_cfg[i].pkt_route = 0x0;
dwmac-loongson.c: plat->rx_queues_cfg[0].pkt_route = 0x0;
stmmac_main.c: if (priv->plat->rx_queues_cfg[queue].pkt_route == 0x0)
stmmac_pci.c: plat->rx_queues_cfg[0].pkt_route = 0x0;
stmmac_pci.c: plat->rx_queues_cfg[i].pkt_route = 0x0;
stmmac_platform.c: plat->rx_queues_cfg[queue].pkt_route = PACKET_AVCPQ;
stmmac_platform.c: plat->rx_queues_cfg[queue].pkt_route = PACKET_PTPQ;
stmmac_platform.c: plat->rx_queues_cfg[queue].pkt_route = PACKET_DCBCPQ;
stmmac_platform.c: plat->rx_queues_cfg[queue].pkt_route = PACKET_UPQ;
stmmac_platform.c: plat->rx_queues_cfg[queue].pkt_route = PACKET_MCBCQ;
stmmac_platform.c: plat->rx_queues_cfg[queue].pkt_route = 0x0;

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJvjf-0000000EVkO-1ZaO@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:47:58 -08:00
Russell King (Oracle)
0a20999ed4 net: stmmac: remove unnecessary .prio queue initialisation
stmmac_platform.c explicitly sets .prio to zero if the snps,priority
property is not present in DT for the queue. However, as the struct
is allocated using devm_kzalloc(), all members default to zero unless
explicitly initialised, and of_property_read_u32() will not write to
its argument if the property is not found. Thus, explicitly setting
these to zero is unnecessary. Remove these.

$ grep '\.prio =' *.c
stmmac_platform.c: plat->rx_queues_cfg[queue].prio = 0;
stmmac_platform.c: plat->tx_queues_cfg[queue].prio = 0;

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJvja-0000000EVkI-0zUH@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:47:58 -08:00
Russell King (Oracle)
c03101cb1b net: stmmac: remove unnecessary .use_prio queue initialisation
Several drivers (see below) explicitly set the queue .use_prio
configuration to false. However, as this structure is allocated using
devm_kzalloc(), all members default to zero unless otherwise explicitly
initialised. .use_prio isn't, so defaults to false. Remove these
unnecessary initialisations, leaving stmmac_platform.c as the only
file where .use_prio is set to true.

$ grep 'use_prio =' *.c
dwmac-intel.c: plat->tx_queues_cfg[0].use_prio = false;
dwmac-intel.c: plat->rx_queues_cfg[0].use_prio = false;
dwmac-intel.c: plat->rx_queues_cfg[i].use_prio = false;
dwmac-intel.c: plat->tx_queues_cfg[i].use_prio = false;
dwmac-loongson.c: plat->tx_queues_cfg[0].use_prio = false;
dwmac-loongson.c: plat->rx_queues_cfg[0].use_prio = false;
stmmac_pci.c: plat->tx_queues_cfg[0].use_prio = false;
stmmac_pci.c: plat->rx_queues_cfg[0].use_prio = false;
stmmac_pci.c: plat->tx_queues_cfg[i].use_prio = false;
stmmac_pci.c: plat->rx_queues_cfg[i].use_prio = false;
stmmac_platform.c: plat->rx_queues_cfg[queue].use_prio = false;
stmmac_platform.c: plat->rx_queues_cfg[queue].use_prio = true;
stmmac_platform.c: plat->tx_queues_cfg[queue].use_prio = false;
stmmac_platform.c: plat->tx_queues_cfg[queue].use_prio = true;

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJvjV-0000000EVkC-0WAV@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:47:58 -08:00
Russell King (Oracle)
b6d013b326 net: stmmac: setup default RX channel map in stmmac_plat_dat_alloc()
Setup the default 1:1 RX channel map in stmmac_plat_dat_alloc() and
remove 1:1 initialisations from platform glue drivers.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJvjQ-0000000EVk6-05z7@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:47:58 -08:00
Russell King (Oracle)
d5e788e86f net: stmmac: move initialisation of queues_to_use to stmmac_plat_dat_alloc()
Move the default initialisation of plat_dat->tx_queues_to_use and
plat_dat->rx_queues_to_use (to 1) into stmmac_plat_dat_alloc(). This means
platform glue only needs to override this if a different value is required.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJvjK-0000000EVk0-3qb2@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:47:58 -08:00
Russell King (Oracle)
bcb145c696 net: stmmac: move initialisation of unicast_filter_entries to stmmac_plat_dat_alloc()
Move the default initialisation of plat_dat->unicast_filter_entries
(to 1) into stmmac_plat_dat_alloc(). This means platform glue only needs
to override this if a different value is required.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJvjF-0000000EVju-3LfS@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:47:58 -08:00
Russell King (Oracle)
07cedb9eed net: stmmac: move initialisation of multicast_filter_bins to stmmac_plat_dat_alloc()
Move the default initialisation of plat_dat->multicast_filter_bins
(to HASH_TABLE_SIZE) into stmmac_plat_dat_alloc(). This means platform
glue only needs to override this if a different value is required.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJvjA-0000000EVjo-2qVn@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:47:57 -08:00
Russell King (Oracle)
528478a746 net: stmmac: move initialisation of maxmtu to stmmac_plat_dat_alloc()
Move the default initialisation of plat_dat->maxmtu (to JUMBO_LEN) into
stmmac_plat_dat_alloc().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJvj5-0000000EVji-2EYA@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:47:57 -08:00
Russell King (Oracle)
ae4f29712b net: stmmac: move initialisation of clk_csr to stmmac_plat_dat_alloc()
Move the default initialisation of plat_dat->clk_csr to
stmmac_plat_dat_alloc().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJvj0-0000000EVjb-1jDh@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:47:57 -08:00
Russell King (Oracle)
99e6ddaabd net: stmmac: move initialisation of phy_addr to stmmac_plat_dat_alloc()
Move the default initialisation of plat_dat->phy_addr to
stmmac_plat_dat_alloc().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJviv-0000000EVjV-1CLF@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:47:57 -08:00
Russell King (Oracle)
511171e47f net: stmmac: add stmmac_plat_dat_alloc()
Add a function to allocate and initialise the plat_stmmacenet_data
structure with default values.
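
A sketch of what such an allocator looks like once the follow-up patches in
this series move the defaults into it (argument list and error handling are
simplified and not the exact upstream code):

static struct plat_stmmacenet_data *plat_dat_alloc_sketch(struct device *dev)
{
	struct plat_stmmacenet_data *plat;

	plat = devm_kzalloc(dev, sizeof(*plat), GFP_KERNEL);
	if (!plat)
		return NULL;

	/* defaults that platform glue previously set individually */
	plat->maxmtu = JUMBO_LEN;
	plat->multicast_filter_bins = HASH_TABLE_SIZE;
	plat->unicast_filter_entries = 1;
	plat->tx_queues_to_use = 1;
	plat->rx_queues_to_use = 1;
	/* phy_addr, clk_csr and the 1:1 RX channel map get defaults here too */

	return plat;
}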

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJviq-0000000EVjP-0c0l@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:47:57 -08:00
Randy Dunlap
7cf3ac8a9c NFC: mei_phy: fix kernel-doc warnings
Fix kernel-doc warnings in mei_phy.h to avoid build warnings and to
improve the documentation:

mei_phy.h:15: warning: missing initial short description on line:
 * struct nfc_mei_phy
mei_phy.h:19: warning: bad line:

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://patch.msgid.link/20251116070959.85055-1-rdunlap@infradead.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:32:32 -08:00
Jakub Kicinski
95236dfce3 Merge branch 'convert-drivers-to-use-ndo_hwtstamp-callbacks-part-4'
Vadim Fedorenko says:

====================
convert drivers to use ndo_hwtstamp callbacks part 4

This patchset is a subset of the part 3 patchset to convert the bnx2x and
qede drivers to use ndo callbacks instead of the ioctl to configure and
report time stamping. These drivers implemented only the SIOCSHWTSTAMP
command, but are converted to also report the configuration back to users.
Some logic is changed to avoid reporting a configuration which is not in
sync with the HW in case an error happened.
====================

Link: https://patch.msgid.link/20251116094610.3932005-1-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:30:17 -08:00
Vadim Fedorenko
89ae72f21b qede: convert to use ndo_hwtstamp callbacks
The driver implemented only the SIOCSHWTSTAMP ioctl cmd, but it stores the
configuration in a private structure, so it can be reported back to users.
Implement both ndo_hwtstamp_set and ndo_hwtstamp_get callbacks.
ndo_hwtstamp_set implements a check for unsupported 1-step timestamping,
and qede_ptp_cfg_filters() becomes void as it cannot fail anymore.
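
The general shape of such a conversion looks roughly like the sketch below
(driver names and fields are placeholders; the callback signatures follow
the generic ndo_hwtstamp ones, not the exact qede code):

static int foo_hwtstamp_get(struct net_device *dev,
			    struct kernel_hwtstamp_config *cfg)
{
	struct foo_priv *priv = netdev_priv(dev);

	*cfg = priv->tstamp_config;	/* report the stored configuration */
	return 0;
}

static int foo_hwtstamp_set(struct net_device *dev,
			    struct kernel_hwtstamp_config *cfg,
			    struct netlink_ext_ack *extack)
{
	struct foo_priv *priv = netdev_priv(dev);

	if (cfg->tx_type == HWTSTAMP_TX_ONESTEP_SYNC) {
		NL_SET_ERR_MSG_MOD(extack, "1-step timestamping not supported");
		return -EOPNOTSUPP;
	}

	/* ... program the hardware filters here ... */

	priv->tstamp_config = *cfg;	/* remember only after HW succeeded */
	return 0;
}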

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251116094610.3932005-3-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:30:12 -08:00
Vadim Fedorenko
889e6af877 bnx2x: convert to use ndo_hwtstamp callbacks
The driver implemented only the SIOCSHWTSTAMP ioctl command, but at the
same time it has the configuration stored in a private structure. Implement
both ndo_hwtstamp_set and ndo_hwtstamp_get callbacks using the stored info.
The ndo_hwtstamp_set callback implements a check for unsupported 1-step
timestamping. The same check is removed from the bnx2x_configure_ptp_filters
function as it's not needed anymore. Another call site of
bnx2x_configure_ptp_filters has the hwtstamp_ioctl_called guard.

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251116094610.3932005-2-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:30:12 -08:00
Jakub Kicinski
ef3b682aca Merge branch 'selftests-mptcp-counter-cache-stats-before-timeout'
Matthieu Baerts says:

====================
selftests: mptcp: counter cache & stats before timeout

Here are a bunch of small improvements to the MPTCP selftests:

- Patch 1: move code to mptcp_lib.sh to prepare the new features.

- Patch 2: simplify mptcp_lib_pr_err_stats helper use.

- Patch 3: remove unused last column from nstat output.

- Patch 4: improve stats dump in mptcp_join.sh.

- Patch 5: get counters from nstat history and simplify mptcp_connect.sh.

- Patch 6: avoid taking the same packet trace twice.

- Patch 7: wait for an event instead of a fixed time.

- Patch 8: instead of using 'timeout' and printing the stats afterwards,
  another internal timeout is used: if it fires, it will print the stats,
  then stop everything. This avoids confusion around stats in case of a
  timeout.
====================

Link: https://patch.msgid.link/20251114-net-next-mptcp-sft-count-cache-stats-timeout-v1-0-863cb04e1b7b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:28:04 -08:00
Matthieu Baerts (NGI0)
eea2f44870 selftests: mptcp: get stats just before timing out
Recently, some debugging happened around a test that was timing out. The
stats were showing connections being closed which was confusing because
the closing state was caused by the timeout stopping the transfer.

To avoid such confusion, the timeout is no longer done per mptcp_connect
process, but separately. In case of timeout, the stats are now printed,
then the apps are killed.

The stats will still be printed after the kill, but that's fine, and
this might even be useful, just in case. Timeout should be exceptional.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251114-net-next-mptcp-sft-count-cache-stats-timeout-v1-8-863cb04e1b7b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:27:47 -08:00
Matthieu Baerts (NGI0)
39348f5f2f selftests: mptcp: wait for port instead of sleep
After having started mptcp_connect in listening mode,
'mptcp_lib_wait_local_port_listen' can be used to wait for the listening
socket to be ready.

This is better than using the 'sleep' command: instead of pausing for a
fixed amount of time, we wait for an event. This helper is used in all
other MPTCP selftests, but not in these two.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251114-net-next-mptcp-sft-count-cache-stats-timeout-v1-7-863cb04e1b7b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:27:47 -08:00
Matthieu Baerts (NGI0)
8c1fe0a500 selftests: mptcp: connect: avoid double packet traces
When the same netns is used for the listener and the connector, there is
no need to take exactly the same packet trace twice; one is enough.

This avoids confusion when the traces are the same, and avoids wasting
resources on traces which might not help reproducing an issue.

While at it, avoid long lines and double spaces now that these lines are
no longer aligned.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251114-net-next-mptcp-sft-count-cache-stats-timeout-v1-6-863cb04e1b7b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:27:47 -08:00
Matthieu Baerts (NGI0)
71388a9f33 selftests: mptcp: lib: get counters from nstat history
Before, 'nstat' was used to retrieve each individual counter: this means
querying 4 different sources from /proc/net and iterating over 100+
counters each time. Instead, the stats could be retrieved once, and the
output file could be parsed for each counter. Even better, such a file is
already present: the nstat history file.

To be able to get this working, the nstat history file also needs to
contain zero counters too, so it is still possible to know if a counter
is missing or set to 0.

This also simplifies mptcp_connect.sh: instead of checking multiple
counters before and after a test to compute the difference, the stats
history files can be reset before each test, and nstat can display only
the difference.

mptcp_lib_get_counter() continues to work when no history file is
available: by fetching nstat directly, like before. This is the case in
diag.sh and userspace_pm.sh where there is no need to save the history
file. This is also the case in mptcp_join.sh, when 'run_tests' is
executed in the background: easier to continue fetching counters than
updating the history each time it is needed.

Note: 'nstat' is called with '-s' in mptcp_lib_nstat_get(), so this
helper can be called multiple times during the test if needed.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251114-net-next-mptcp-sft-count-cache-stats-timeout-v1-5-863cb04e1b7b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:27:47 -08:00
Matthieu Baerts (NGI0)
658e531417 selftests: mptcp: join: dump stats from history
In case of errors, dump the stats from history instead of using nstat.

There are multiple advantages to that:

- The same filters from pr_err_stats are used, e.g. the unused 'rate'
  column is not displayed.

- The counters are closer to the ones from when the test stopped.

- While at it, the errors can be better presented: error colours, a
  small indentation to distinguish the different parts, extra new lines.

Even if it should only happen in rare cases -- internal errors, or netns
issues -- if no history is available, 'nstat' is used like before, just
in case.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251114-net-next-mptcp-sft-count-cache-stats-timeout-v1-4-863cb04e1b7b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:27:46 -08:00
Matthieu Baerts (NGI0)
2e6daf6b9b selftests: mptcp: lib: stats: remove nstat rate columns
With the MPTCP selftests, the nstat daemon is not used. This means that
the last column (the rate) is always 0.0, which is not something
interesting to display.

This last column can therefore be filtered out.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251114-net-next-mptcp-sft-count-cache-stats-timeout-v1-3-863cb04e1b7b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:27:46 -08:00
Matthieu Baerts (NGI0)
a89fc262b6 selftests: mptcp: lib: remove stats files args
Now that these files are written from MPTCP lib helpers, the stats file
paths are unified. There is then no need to specify them from each
selftest.

No behavioural changes intended.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251114-net-next-mptcp-sft-count-cache-stats-timeout-v1-2-863cb04e1b7b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:27:46 -08:00
Matthieu Baerts (NGI0)
d3305c016a selftests: mptcp: lib: introduce 'nstat_{init,get}'
These new helpers are easier to read than the long, multi-line
commands. They will also ease the addition of related new features
in the next commits.

No behavioural changes intended.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251114-net-next-mptcp-sft-count-cache-stats-timeout-v1-1-863cb04e1b7b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 19:27:46 -08:00
Zenm Chen
c453d9c5bb wifi: rtw89: Add default ID 0bda:b831 for RTL8831BU
Add 0bda:b831 for RTL8831BU-based adapters that use this default ID.

Tested with TOTOLINK X900USM in station mode very briefly.

Signed-off-by: Zenm Chen <zenmchen@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251116234403.8803-1-zenmchen@gmail.com
2025-11-18 10:33:11 +08:00
Abdun Nihaal
9b5b9c042b wifi: rtl818x: Fix potential memory leaks in rtl8180_init_rx_ring()
In rtl8180_init_rx_ring(), memory is allocated for skb packets and DMA
allocations in a loop. When an allocation fails, the previously
successful allocations are not freed on exit.

Fix that by jumping to err_free_rings label on error, which calls
rtl8180_free_rx_ring() to free the allocations. Remove the free of
rx_ring in rtl8180_init_rx_ring() error path, and set the freed
priv->rx_buf entry to null, to avoid double free.

Fixes: f653211197 ("Add rtl8180 wireless driver")
Signed-off-by: Abdun Nihaal <nihaal@cse.iitm.ac.in>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114094527.79842-1-nihaal@cse.iitm.ac.in
2025-11-18 10:29:49 +08:00
Ping-Ke Shih
c8d212fa81 wifi: rtw89: use separated function to set RX filter
Normally rtw89_write32_mask() is used with a consecutive bit mask, but the
RX filter mask B_AX_RX_FLTR_CFG_MASK covers bits 31-22 and 15-0, excluding
the bits of B_AX_RX_MPDU_MAX_LEN_MASK (bits 21-16).

Though the original logic sets the RX filter correctly, change it to a
separate function to avoid the tricky design.

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://patch.msgid.link/20251113053459.34995-1-pkshih@realtek.com
2025-11-18 09:57:40 +08:00
Chih-Kang Chang
09afd209a8 wifi: rtw89: correct user macid mask of RX info for RTL8922D
The user MAC ID mask of RX info in the MAC PPDU differs between RTL8922A
and RTL8922D; correct it accordingly.

Signed-off-by: Chih-Kang Chang <gary.chang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-15-pkshih@realtek.com
2025-11-18 09:30:20 +08:00
Chih-Kang Chang
9dab26b9fa wifi: rtw89: update format of addr cam H2C command
The addr cam H2C command tells the firmware the address-related info.
For RTL8922D and RTL8922A after firmware version 0.35.84.0, the addr cam
must be updated with update mode to avoid clearing previously set
fields. Update it accordingly.

Signed-off-by: Chih-Kang Chang <gary.chang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-14-pkshih@realtek.com
2025-11-18 09:28:42 +08:00
Ping-Ke Shih
fc2e8c873f wifi: rtw89: add addr cam H2C command v1
The coming RTL8922D uses a different format for the address CAM command,
so add the new format accordingly.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-13-pkshih@realtek.com
2025-11-18 09:27:03 +08:00
Ping-Ke Shih
239dd70d77 wifi: rtw89: fill addr cam H2C command by struct
The addr cam is used to tell the firmware the MAC address and BSSID
associated with connected stations. Use a struct instead of macros with
pointer arithmetic to fill the data.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-12-pkshih@realtek.com
2025-11-18 09:26:40 +08:00
Ping-Ke Shih
aa8d1a8805 wifi: rtw89: align RA H2C format v1 for RTL8922A
For RTL8922A, bits [16:31] of word 3 of the v1 format aren't the same as
on WiFi 6 chips. Instead, it only needs to fill the additional band type
and partial bandwidth ER.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-11-pkshih@realtek.com
2025-11-18 09:25:00 +08:00
Ping-Ke Shih
1c2ada0962 wifi: rtw89: fw: print band and port where beacon update on
The BCN_UPD_DONE C2H event notifies the driver that the firmware has
changed the beacon content on the band/port requested by the driver.
Print the notification for debugging purposes for now.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-10-pkshih@realtek.com
2025-11-18 09:23:49 +08:00
Ping-Ke Shih
5607a141bf wifi: rtw89: phy: ignore DCFO if not defined in chip_info
For WiFi 7 chips, the DCFO (digital carrier frequency offset) feature
isn't supported, so the corresponding registers aren't defined in
chip_info. Check for this and ignore the feature accordingly.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-9-pkshih@realtek.com
2025-11-18 09:23:36 +08:00
Ping-Ke Shih
9df55e8d41 wifi: rtw89: phy: consider type 15 in BB gain table
The BB gain table configures the gain for certain channels. Consider the
newly added type 15 and write the registers accordingly.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-8-pkshih@realtek.com
2025-11-18 09:23:26 +08:00
Ping-Ke Shih
7fefea1100 wifi: rtw89: mac: update wcpu_on to download firmware for RTL8922D
The RTL8922D needs more settings before downloading firmware, so add them
accordingly. Also, update the settings that were missed for RTL8922A.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-7-pkshih@realtek.com
2025-11-18 09:21:53 +08:00
Ping-Ke Shih
d375434cbc wifi: rtw89: mac: remove undefined bit B_BE_PPDU_MAC_INFO
The bit was defined during test chip development, but production chips
don't have it. Remove it.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-6-pkshih@realtek.com
2025-11-18 09:20:25 +08:00
Ping-Ke Shih
12e84effcb wifi: rtw89: phy: calling BB pre-init by chips with/without BB MCU
The existing flow does BB pre-init before downloading the BB MCU firmware,
because the existing chip RTL8922A has a BB MCU. However, the coming chips
don't have one, and the registers configured by BB pre-init can affect
downloading the WiFi-CPU firmware. Therefore, call BB pre-init afterward
for coming chips without a BB MCU.

Existing WiFi 6 chips have no BB pre-init. For RTL8922A, don't change
the logic.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-5-pkshih@realtek.com
2025-11-18 09:18:51 +08:00
Ping-Ke Shih
a2a64fe234 wifi: rtw89: mac: separate pre-init code before downloading firmware
The driver needs to initialize registers before downloading firmware, so
move the pre-init part (power on) out of the original rtw89_mac_init().
Subsequent patches will add more pre-init code before downloading
firmware.

Since rtw89_phy_init_bb_afe() is used by the coming RTL8922D, don't change
the logic at all for existing chips.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-4-pkshih@realtek.com
2025-11-18 09:17:39 +08:00
Ping-Ke Shih
92db331de6 wifi: rtw89: fw: part size to download firmware by header info
The part size is the unit used to download firmware piece by piece. Old
chips use 2020 bytes per piece, but on new chips the part size is defined
in the firmware header. Change the code to use the value dynamically.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-3-pkshih@realtek.com
2025-11-18 09:16:25 +08:00
Chih-Kang Chang
3a12581fc2 wifi: rtw89: flush TX queue before deleting key
In the wpa_supplicant rekey flow, it sends an EAPOL packet 4/4 through
nl80211_tx_control_port() and triggers wake_tx_queue() in the driver.
Then, it sends nl80211_new_key() to configure a new key in mac80211.
However, in wake_tx_queue(), a workqueue is used to process the TX packet,
which might cause the driver to process the EAPOL packet later than
nl80211_new_key(). This results in the EAPOL packet 4/4 being transmitted
with the new key and IV, causing it to be dropped by the AP. Therefore,
the TX queue needs to be flushed before deleting the old key to ensure
that the EAPOL 4/4 packet is transmitted using the old key.

Signed-off-by: Chih-Kang Chang <gary.chang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251114060128.35363-2-pkshih@realtek.com
2025-11-18 09:14:41 +08:00
Eric Dumazet
ca412f25d6 tcp: reduce tcp_comp_sack_slack_ns default value to 10 usec
net.ipv4.tcp_comp_sack_slack_ns current default value is too high.

When a flow has many drops (1 % or more), and small RTT, adding 100 usec
before sending SACK stalls the sender relying on getting SACK
fast enough to keep the pipe busy.

Decrease the default to 10 usec.

This is orthogonal to Congestion Control heuristics to determine
if drops are caused by congestion or not.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Link: https://patch.msgid.link/20251114135141.3810964-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 17:02:43 -08:00
Heiner Kallweit
5860bb1ce0 net: phy: fixed_phy: remove setting supported/advertised modes from fixed_phy_register
This code was added with 34b31da486 ("phy: fixed_phy: Set supported
speed in phydev") 10 years ago. The commit message of this change
mentions a use case involving callback adjust_link of struct
dsa_switch_driver. This struct doesn't exist any longer, and in general
usage of the legacy fixed PHY has been removed from DSA with the switch
to phylink.

Note: Supported and advertised modes are now set by phy_probe() when
the fixed PHY is attached to the netdev and bound to the genphy driver.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/3abaa3c5-fbb9-4052-9346-6cb096a25878@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 16:59:09 -08:00
Jakub Kicinski
40ea40853d tools: ynltool: remove -lmnl from link flags
The libmnl dependency has been removed from libynl back in
commit 73395b4381 ("tools: ynl: remove the libmnl dependency")
Remove it from the ynltool Makefile.

Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20251115225508.1000072-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 16:57:33 -08:00
Mohsin Bashir
0135333914 eth: fbnic: Configure RDE settings for pause frame
fbnic supports pause frames. When pause frames are enabled, the user
presumably expects lossless operation from the NIC. Make sure we configure
the RDE (Rx DMA Engine) to DROP_NEVER mode to avoid discards due to delays
in fetching Rx descriptors from the host.

While at it, enable DROP_NEVER when the NIC only has a single queue
configured. In this case the NIC acts as a FIFO so there's no risk
of head-of-line blocking other queues by making RDE wait. If pause
is disabled this just moves the packet loss from the DMA engine to
the Rx buffer.

Remove redundant call to fbnic_config_drop_mode_rcq(), introduced by
commit 0cb4c0a137 ("eth: fbnic: Implement Rx queue
alloc/start/stop/free"). This call does not add value as
fbnic_enable_rcq(), which is called immediately afterward, already
handles this.

Although we do not support autoneg at this time, preserve tx_pause in
.mac_link_up instead of fbnic_phylink_get_pauseparam().

Signed-off-by: Mohsin Bashir <mohsin.bashr@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251113232610.1151712-1-mohsin.bashr@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 16:57:32 -08:00
Jakub Kicinski
122ac16b8c Merge branch 'net-mlx-migrate-to-new-get_rx_ring_count-ethtool-api'
Breno Leitao says:

====================
net: mlx: migrate to new get_rx_ring_count ethtool API

This series migrates the mlx4 and mlx5 drivers to use the new
.get_rx_ring_count() callback introduced in commit 84eaf4359c ("net:
ethtool: add get_rx_ring_count callback to optimize RX ring queries").

Previously, these drivers handled ETHTOOL_GRXRINGS within the
.get_rxnfc() callback. With the dedicated .get_rx_ring_count() API, this
handling can be extracted and simplified.

For mlx5, this affects both the ethernet and IPoIB drivers. The
ETHTOOL_GRXRINGS handling was previously kept in .get_rxnfc() to support
"ethtool -x" when CONFIG_MLX5_EN_RXNFC=n, but this is no longer
necessary with the new dedicated callback.

Note: The mlx4 changes are compile-tested only, while mlx5 changes were
properly tested.
====================

Link: https://patch.msgid.link/20251113-mlx_grxrings-v1-0-0017f2af7dd0@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 16:57:32 -08:00
Breno Leitao
945499665f mlx5: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count() for both the mlx5 ethernet and IPoIB drivers.

The ETHTOOL_GRXRINGS handling was previously kept in .get_rxnfc() to
support "ethtool -x" when CONFIG_MLX5_EN_RXNFC=n. With the new
dedicated .get_rx_ring_count() callback, this is no longer necessary.

This simplifies the RX ring count retrieval and aligns mlx5 with the new
ethtool API for querying RX ring parameters.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20251113-mlx_grxrings-v1-2-0017f2af7dd0@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 16:57:32 -08:00
Breno Leitao
467c3f008d mlx4: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns mlx4 with the new
ethtool API for querying RX ring parameters. This is compile-tested
only.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/20251113-mlx_grxrings-v1-1-0017f2af7dd0@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-17 16:57:32 -08:00
Li Chen
3179a5f7f8 block: rate-limit capacity change info log
Loop devices under the heavy stress-ng loop stressor can trigger many
capacity change events in a short time. Each event prints an info
message from set_capacity_and_notify(), flooding the console and
contributing to soft lockups on slow consoles.

Switch the printk in set_capacity_and_notify() to
pr_info_ratelimited() so frequent capacity changes do not spam
the log while still reporting occasional changes.
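
The change itself is essentially a one-liner; roughly (message text
approximated, not copied from the source):

pr_info_ratelimited("%s: detected capacity change from %lld to %lld\n",
		    disk->disk_name, capacity, size);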

Cc: stable@vger.kernel.org
Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 11:30:24 -07:00
Damien Le Moal
ade260ca85 Documentation: admin-guide: blockdev: update zloop parameters
In Documentation/admin-guide/blockdev/zoned_loop.rst, add the
description of the zone_append and ordered_zone_append configuration
arguments of zloop "add" command (device creation).

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 09:40:09 -07:00
Damien Le Moal
fcc6eaa3a0 zloop: introduce the ordered_zone_append configuration parameter
The zone append operation processing for zloop devices is similar to any
other command, that is, the operation is processed as a command work
item, without any special serialization between the work items (besides
the zone mutex for mutually exclusive code sections).

This processing is fine and gives excellent performance. However, it has
a side effect: zone append operation are very often reordered and
processed in a sequence that is very different from their issuing order
by the user. This effect is very visible using an XFS file system on top
of a zloop device. A simple file write leads to many file extents as the
data writes using zone append are reordered and so result in the
physical order being different than the file logical order.
E.g. executing:

$ dd if=/dev/zero of=/mnt/test bs=1M count=10 && sync
$ xfs_bmap /mnt/test
/mnt/test:
	0: [0..4095]: 2162688..2166783
	1: [4096..6143]: 2168832..2170879
	2: [6144..8191]: 2166784..2168831
	3: [8192..10239]: 2170880..2172927
	4: [10240..12287]: 2174976..2177023
	5: [12288..14335]: 2172928..2174975
	6: [14336..20479]: 2177024..2183167

For 10 IOs, 6 extents are created.

This is fine and actually allows exercising XFS zone garbage collection
very well. However, this also makes debugging/working on XFS data
placement harder as the underlying device will most of the time reorder
IOs, resulting in many file extents.

Allow a user to mitigate this with the new ordered_zone_append
configuration parameter. For a zloop device created with this parameter
specified, the sector of a zone append command is set early, when the
command is submitted by the block layer with the zloop_queue_rq()
function, instead of in the zloop_rw() function which is executed later
in the command work item context. This change ensures that more often
than not, zone append operations data end up being written in the same
order as the command submission by the user.

In the case of XFS, this leads to far less file data extents. E.g., for
the previous example, we get a single file data extent for the written
file.

$ dd if=/dev/zero of=/mnt/test bs=1M count=10 && sync
$ xfs_bmap /mnt/test
/mnt/test:
	0: [0..20479]: 2162688..2183167

Since we cannot use a mutex in the context of the zloop_queue_rq()
function to atomically set a zone append operation sector to the target
zone write pointer location and increment the write pointer, a new
per-zone spinlock is introduced to protect a zone write pointer access
and modifications. To check a zone write pointer location and set a zone
append operation target sector to that value, the function
zloop_set_zone_append_sector() is introduced and called from
zloop_queue_rq().
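
A rough sketch of that idea, with hypothetical field and type names (the
actual zloop structures differ):

struct zone_sketch {
	spinlock_t	wp_lock;	/* protects the write pointer */
	sector_t	wp;
};

static void set_zone_append_sector_sketch(struct zone_sketch *zone,
					  sector_t *cmd_sector,
					  unsigned int nr_sectors)
{
	spin_lock(&zone->wp_lock);
	*cmd_sector = zone->wp;		/* target sector fixed at submission time */
	zone->wp += nr_sectors;
	spin_unlock(&zone->wp_lock);
}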

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 09:40:09 -07:00
Damien Le Moal
9236c5fdd5 zloop: introduce the zone_append configuration parameter
A zloop zoned block device declares to the block layer that it supports
zone append operations. That is, a zloop device resembles an NVMe ZNS
device supporting zone append.

This native support is fine but it does not allow exercising the block
layer zone write plugging emulation of zone append, as is done with SCSI
or ATA SMR HDDs.

Introduce the zone_append configuration parameter to allow creating a
zloop device without native support for zone append, thus relying on the
block layer zone append emulation. If not specified, zone append support
is enabled by default. Otherwise, a value of 0 disables native zone
append and a value of 1 enables it.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 09:40:09 -07:00
Damien Le Moal
e3a96ca904 zloop: simplify checks for writes to sequential zones
The function zloop_rw() already checks early that a request is fully
contained within the target zone. So this check does not need to be done
again for regular writes to sequential zones. Furthermore, since zone
append operations are always directed to the zone write pointer
location, we do not need to check for their alignment to that value
after setting it. So turn the "if" checking the write pointer alignment
into an "else if".

While at it, improve the comment describing the write pointer
modification and how this value is corrected in case of error.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 09:40:09 -07:00
Damien Le Moal
cf28f6f923 zloop: fail zone append operations that are targeting full zones
zloop_rw() will fail any regular write operation that targets a full
sequential zone. The check for this is indirect and achieved by checking
the write pointer alignment of the write operation. But this check is
ineffective for zone append operations since these are always
automatically directed at a zone write pointer.

Prevent zone append operations from being executed in a full zone with
an explicit check of the zone condition.

Fixes: eb0570c7df ("block: new zoned loop block device driver")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 09:40:09 -07:00
Damien Le Moal
866d65745b zloop: make the write pointer of full zones invalid
The write pointer of zones that are in the full condition is always
invalid. Reflect that fact by setting the write pointer of full zones
to ULLONG_MAX.

Fixes: eb0570c7df ("block: new zoned loop block device driver")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 09:40:09 -07:00
Guenter Roeck
6483faa393 block/blk-throttle: Remove throtl_slice from struct throtl_data
throtl_slice is now a constant. Remove the variable and use the constant
directly where needed.

Cc: Yu Kuai <yukuai@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 09:39:48 -07:00
Guenter Roeck
20d0b359c7 block/blk-throttle: drop unneeded blk_stat_enable_accounting
After the removal of CONFIG_BLK_DEV_THROTTLING_LOW, it is no longer
necessary to enable block accounting, so remove the call to
blk_stat_enable_accounting(). With that, the track_bio_latency variable
is no longer used and can be deleted from struct throtl_data. Also,
including blk-stat.h is no longer necessary.

Fixes: bf20ab538c ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW")
Cc: Yu Kuai <yukuai@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 09:39:48 -07:00
Guenter Roeck
f76581f9f1 block/blk-throttle: Fix throttle slice time for SSDs
Commit d61fcfa4bb ("blk-throttle: choose a small throtl_slice for SSD")
introduced device type specific throttle slices if BLK_DEV_THROTTLING_LOW
was enabled. Commit bf20ab538c ("blk-throttle: remove
CONFIG_BLK_DEV_THROTTLING_LOW") removed support for BLK_DEV_THROTTLING_LOW,
but left the device type specific throttle slices in place. This
effectively changed throttling behavior on systems with SSD which now use
a different and non-configurable slice time compared to non-SSD devices.
Practical impact is that throughput tests with low configured throttle
values (65536 bps) experience less than expected throughput on SSDs,
presumably due to rounding errors associated with the small throttle slice
time used for those devices. The same tests pass when setting the throttle
values to 65536 * 4 = 262144 bps.

The original code sets the throttle slice time to DFL_THROTL_SLICE_HD if
CONFIG_BLK_DEV_THROTTLING_LOW is disabled. Restore that code to fix the
problem. With that, DFL_THROTL_SLICE_SSD is no longer necessary. Revert to
the original code and re-introduce DFL_THROTL_SLICE to replace both
DFL_THROTL_SLICE_HD and DFL_THROTL_SLICE_SSD. This effectively reverts
commit d61fcfa4bb ("blk-throttle: choose a small throtl_slice for SSD").

While at it, also remove MAX_THROTL_SLICE since it is not used anymore.

Fixes: bf20ab538c ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW")
Cc: Yu Kuai <yukuai@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 09:39:48 -07:00
Keith Busch
2516c246d0 block: consider discard merge last
If the next discard range is contiguous with the current range being
considered, it's cheaper to expand the current range than to append an
additional bio.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 09:39:36 -07:00
Rene Rebe
82d2048102 floppy: fix for PAGE_SIZE != 4KB
For years I wondered why the floppy driver does not just work on
sparc64, e.g:

root@SUNW_375_0066:# disktype /dev/fd0
disktype: Can't open /dev/fd0: No such device or address

[  525.341906] disktype: attempt to access beyond end of device
fd0: rw=0, sector=0, nr_sectors = 16 limit=8
[  525.341991] floppy: error 10 while reading block 0

It turns out floppy.c's __floppy_read_block_0() tries to read one page for
the first test read to determine the disk size and thus fails if the page
size is greater than 4k. Adjust the minimum MAX_DISK_SIZE to PAGE_SIZE to
fix the floppy driver on sparc64 and likely all other PAGE_SIZE != 4KB
configs.

Cc: stable@vger.kernel.org
Signed-off-by: René Rebe <rene@exactco.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-17 08:22:00 -07:00
Colin Ian King
2469f2e78d fs/ntfs3: Fix spelling mistake "recommened" -> "recommended"
There is a spelling mistake in an ntfs_info message. Fix it.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-17 09:09:15 +01:00
Konstantin Komarov
266ab6d02a fs/ntfs3: update mode in xattr when ACL can be reduced to mode
If a file's ACL can be reduced to standard mode bits, update mode
accordingly, persist the change, and update the cached ACL. This keeps
mode and ACL consistent and avoids redundant xattrs.

Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-17 09:09:14 +01:00
Konstantin Komarov
d8e1e0d33d fs/ntfs3: check minimum alignment for direct I/O
Add a check for minimum alignment when performing direct I/O reads. If the
file offset or user buffer is not aligned to the device's logical block
size, fall back to buffered I/O instead of continuing with unaligned direct I/O.
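
A minimal sketch of such an alignment check (the helper name is made up;
the actual ntfs3 code may structure this differently):

static bool dio_read_is_aligned(struct inode *inode, loff_t pos,
				struct iov_iter *iter)
{
	unsigned int bsize = bdev_logical_block_size(inode->i_sb->s_bdev);

	/* both the file offset and the user buffer must be block aligned */
	return IS_ALIGNED(pos, bsize) &&
	       iov_iter_is_aligned(iter, bsize - 1, bsize - 1);
}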

Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-17 09:09:13 +01:00
Konstantin Komarov
ae91dfe389 fs/ntfs3: implement NTFS3_IOC_SHUTDOWN ioctl
Add support for the NTFS3_IOC_SHUTDOWN ioctl, allowing userspace to
request a filesystem shutdown. The ioctl number is shared with other
filesystems such as ext4, exfat, and f2fs.

Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-17 09:09:13 +01:00
Konstantin Komarov
2109b08024 fs/ntfs3: correct attr_collapse_range when file is too fragmented
Fix incorrect VCN adjustments in attr_collapse_range() that caused
filesystem errors or corruption on very fragmented NTFS files when
performing collapse-range operations.

Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-17 09:08:49 +01:00
Jakub Kicinski
c9dfb92de0 Merge branch 'mlx5-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux
Tariq Toukan says:

====================
mlx5-next updates 2025-11-13

The following pull-request contains common mlx5 updates

* 'mlx5-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux:
  net/mlx5: Expose definition for 1600Gbps link mode
  net/mlx5: fs, set non default device per namespace
  net/mlx5: fs, Add other_eswitch support for steering tables
  net/mlx5: Add OTHER_ESWITCH HW capabilities
  net/mlx5: Add direct ST mode support for RDMA
  PCI/TPH: Expose pcie_tph_get_st_table_loc()
  {rdma,net}/mlx5: Query vports mac address from device
====================

Link: https://patch.msgid.link/1763027252-1168760-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 18:55:38 -08:00
Altgelt, Max (Nextron)
4722981cca bpf: don't skip other information if xlated_prog_insns is skipped
If xlated_prog_insns should not be exposed, other information
(such as func_info) still can and should be filled in.
Therefore, instead of directly terminating in this case,
continue with the normal flow.

Signed-off-by: Max Altgelt <max.altgelt@nextron-systems.com>
Link: https://lore.kernel.org/r/efd00fcec5e3e247af551632726e2a90c105fbd8.camel@nextron-systems.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-14 18:55:06 -08:00
Martin KaFai Lau
6cc73f3540 selftests/bpf: Test bpf_skb_check_mtu(BPF_MTU_CHK_SEGS) when transport_header is not set
Add a test to check that bpf_skb_check_mtu(BPF_MTU_CHK_SEGS) is
rejected (-EINVAL) if skb->transport_header is not set. The test
needs to lower the MTU of the loopback device. Thus, take this
opportunity to run the test in a netns by adding "ns_" to the test
name. The "serial_" prefix can then be removed.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20251112232331.1566074-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-14 18:49:18 -08:00
Martin KaFai Lau
d946f3c983 bpf: Check skb->transport_header is set in bpf_skb_check_mtu
The bpf_skb_check_mtu helper needs to use skb->transport_header when
the BPF_MTU_CHK_SEGS flag is used:

	bpf_skb_check_mtu(skb, ifindex, &mtu_len, 0, BPF_MTU_CHK_SEGS)

The transport_header is not always set. There is a WARN_ON_ONCE
report when CONFIG_DEBUG_NET is enabled + skb->gso_size is set +
bpf_prog_test_run is used:

WARNING: CPU: 1 PID: 2216 at ./include/linux/skbuff.h:3071
 skb_gso_validate_network_len
 bpf_skb_check_mtu
 bpf_prog_3920e25740a41171_tc_chk_segs_flag # A test in the next patch
 bpf_test_run
 bpf_prog_test_run_skb

For a normal ingress skb (not test_run), skb_reset_transport_header
is performed, but there is a plan to avoid setting it as described in
commit 2170a1f091 ("net: no longer reset transport_header in __netif_receive_skb_core()").

This patch fixes the bpf helper by checking
skb_transport_header_was_set(). The check is done just before
skb->transport_header is used, to avoid breaking the existing bpf prog.
The WARN_ON_ONCE is limited to bpf_prog_test_run, so targeting bpf-next.
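
A simplified sketch of the guard being described (not the exact helper
code; the constants are the existing BPF_MTU_CHK_RET_* values):

static int mtu_chk_segs_sketch(struct sk_buff *skb, unsigned int mtu)
{
	if (!skb_transport_header_was_set(skb))
		return -EINVAL;		/* segmented length cannot be computed */

	if (!skb_gso_validate_network_len(skb, mtu))
		return BPF_MTU_CHK_RET_SEGS_TOOBIG;

	return BPF_MTU_CHK_RET_SUCCESS;
}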

Fixes: 34b2021cc6 ("bpf: Add BPF-helper for MTU checking")
Cc: Jesper Dangaard Brouer <hawk@kernel.org>
Reported-by: Kaiyan Mei <M202472210@hust.edu.cn>
Reported-by: Yinhao Hu <dddddd@hust.edu.cn>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20251112232331.1566074-1-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-14 18:49:18 -08:00
Jakub Kicinski
c7b6dd2a8a Merge branch 'net-stmmac-rk-use-phy_intf_sel_x'
Russell King says:

====================
net: stmmac: rk: use PHY_INTF_SEL_x

This series is a minimal conversion of the huge dwmac-rk driver to use
PHY_INTF_SEL_x constants.

Patch 2 appears to reorder the output functions, making diffing the
generated code impossible.
====================

Link: https://patch.msgid.link/aRYZaKTIvfYoV3wE@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 18:33:59 -08:00
Russell King (Oracle)
1188741cb5 net: stmmac: rk: use PHY_INTF_SEL_x in functions
Rather than defining one xxx_GMAC_PHY_INTF_SEL_xxx() for each mode,
define xxx_GMAC_PHY_INTF_SEL() which takes the phy_intf_sel value.
Pass the appropriate value into these new macros in the set_to_xxx()
methods.

No change to produced code on aarch64.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vJbPG-0000000EBqb-2cF2@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 18:33:57 -08:00
Russell King (Oracle)
5e37047f74 net: stmmac: rk: use PHY_INTF_SEL_x constants
The values used in the xxx_GMAC_PHY_INTF_SEL_xxx() macros are the
phy_intf_sel values used for the dwmac core. Use these to define these
constants.

No change to produced code on aarch64.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vJbPB-0000000EBqV-27GS@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 18:33:57 -08:00
Russell King (Oracle)
764ebe423e net: stmmac: rk: convert all bitfields to GRF_FIELD*()
Convert all bitfields to GRF_FIELD() or GRF_FIELD_CONST(), which makes
the bitfield values more readable, and also allows the aarch64 compiler
to produce better code.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vJbP6-0000000EBqP-1cmm@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 18:33:57 -08:00
Russell King (Oracle)
ebb07edf97 net: stmmac: rk: replace HIWORD_UPDATE() with GRF_FIELD()
Provide GRF_FIELD() which takes the high/low bit numbers of the field
and field value, generates the mask and passes it to FIELD_PREP_WM16.
Replace all HIWORD_UPDATE() instances with this.
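
A hedged sketch of the helper shape being described (the exact definition
lives in the dwmac-rk driver; treat this as an approximation):

/* Build the field mask from the high/low bit numbers and let
 * FIELD_PREP_WM16() encode both the value and the matching
 * write-enable bits in the upper 16 bits of the GRF register.
 */
#define GRF_FIELD(high, low, val) \
	FIELD_PREP_WM16(GENMASK((high), (low)), (val))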

No change to produced code on aarch64.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vJbP1-0000000EBqJ-1AjR@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 18:33:57 -08:00
Breno Leitao
04ca7a69a3 net: bnx2x: convert to use get_rx_ring_count
Convert the bnx2x driver to use the new .get_rx_ring_count ethtool
operation instead of implementing .get_rxnfc solely for handling
ETHTOOL_GRXRINGS command. This simplifies the code by replacing the
switch statement with a direct return of the queue count.

The new callback provides the same functionality in a more direct way,
following the ongoing ethtool API modernization.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251112-bnx_grxrings-v1-1-1c2cb73979e2@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 18:04:21 -08:00
Breno Leitao
f455d3f02d net: ixgbe: convert to use .get_rx_ring_count
Convert the ixgbe driver to use the new .get_rx_ring_count ethtool
operation for handling ETHTOOL_GRXRINGS command. This simplifies the
code by extracting the ring count logic into a dedicated callback.

The new callback provides the same functionality in a more direct way,
following the ongoing ethtool API modernization.

This was compile-tested only.

Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20251113-ixgbe_gxrings-v2-1-0ecf57808a78@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 18:02:22 -08:00
Yue Haibing
06ac470658 sctp: Remove unused declaration sctp_auth_init_hmacs()
Commit bf40785fa4 ("sctp: Use HMAC-SHA1 and HMAC-SHA256 library for chunk
authentication") removed the implementation but leave declaration.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Eric Biggers <ebiggers@kernel.org>
Link: https://patch.msgid.link/20251113114501.32905-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 18:00:34 -08:00
Eric Dumazet
6d650ae928 tcp: gro: inline tcp_gro_pull_header()
tcp_gro_pull_header() is used in the GRO fast path; inline it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251113140358.58242-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 18:00:08 -08:00
Puranjay Mohan
4f7bc83b98 bpf: verifier: Move desc->imm setup to sort_kfunc_descs_by_imm_off()
Metadata about a kfunc call is added to the kfunc_tab in
add_kfunc_call() but the call instruction itself could get removed by
opt_remove_dead_code() later if it is not reachable.

If the call instruction is removed, specialize_kfunc() is never called
for it and the desc->imm in the kfunc_tab is never initialized for this
kfunc call. In this case, the sort_kfunc_descs_by_imm_off(env->prog) call in
do_misc_fixups() doesn't sort the table correctly.
This is a problem for s390 as its JIT uses this table to find the
addresses for kfuncs, and if this table is not sorted properly, JIT may
fail to find addresses for valid kfunc calls.

This was exposed by:

commit d869d56ca8 ("bpf: verifier: refactor kfunc specialization")

as before this commit, desc->imm was initialised in add_kfunc_call()
which happens before dead code elimination.

Move desc->imm setup down to sort_kfunc_descs_by_imm_off(), this fixes
the problem and also saves us from having the same logic in
add_kfunc_call() and specialize_kfunc().

Suggested-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251114154023.12801-1-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-14 17:55:18 -08:00
Mykyta Yatsenko
a4d31f451d selftests/bpf: Align kfuncs renamed in bpf tree
bpf_task_work_schedule_resume() and bpf_task_work_schedule_signal() have
been renamed in bpf tree to bpf_task_work_schedule_resume_impl() and
bpf_task_work_schedule_signal_impl() accordingly.
There are a few uses of these kfuncs in selftests that are not in the bpf
tree, so when we port [1] into bpf-next, those BPF programs will not
compile.
This patch aligns those remaining callsites with the kfunc renaming.
It should go on top of [1] when applying on bpf-next.

1: https://lore.kernel.org/all/20251104-implv2-v3-0-4772b9ae0e06@meta.com/

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Link: https://lore.kernel.org/r/20251105132105.597344-1-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-14 17:45:43 -08:00
Jakub Kicinski
eca8b8fc74 selftests: drv-net: xdp: make the XDP qstats tests less flaky
The XDP qstats tests send 2k packets over a single socket.
Looks like when netdev CI is busy running those tests in QEMU
occasionally flakes. The target doesn't get to run at all
before all 2000 packets are sent.

Lower the number of packets to 1000 and reopen the socket
every 50 packets, to give RSS a chance to spread the packets
to multiple queues.

For the netdev CI testing either lowering the count or using
multiple sockets is enough, but let's do both for extra resiliency.

Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20251113152703.3819756-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 17:45:38 -08:00
Dimitri Daskalakis
e1215d1d38 selftests: drv-net: xdp: Fix register spill error with clang 20
On clang 20.1.8 the XDP program fails to load with a register spill error.
Since hdr_len is a __u32, the compiler decided it only needed the lower
32-bits of ctx->data, which later triggers the register spill verifier
error.

Suggested-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Dimitri Daskalakis <dimitri.daskalakis1@gmail.com>
Link: https://patch.msgid.link/20251113043102.4062150-1-dimitri.daskalakis1@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 17:45:07 -08:00
Jakub Kicinski
c7dc5b5228 ipv6: clean up routes when manually removing address with a lifetime
When an IPv6 address with a finite lifetime (configured with valid_lft
and preferred_lft) is manually deleted, the kernel does not clean up the
associated prefix route. This results in orphaned routes (marked "proto
kernel") remaining in the routing table even after their corresponding
address has been deleted.

This is particularly problematic on networks using a combination of SLAAC
and bridges.

1. Machine comes up and performs RA on eth0.
2. User creates a bridge
   - does an ip -6 addr flush dev eth0;
   - adds the eth0 under the bridge.
3. SLAAC happens on br0.

Even though the address has "moved" to br0, there will still be a route
pointing to eth0, but eth0 is no longer usable for IP.

Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20251113031700.3736285-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-14 17:44:47 -08:00
Alexei Starovoitov
e47b68bda4 Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf after 6.18-rc5+
Cross-merge BPF and other fixes after downstream PR.

Minor conflict in kernel/bpf/helpers.c

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-14 17:43:41 -08:00
Andrii Nakryiko
ef2c0b2e56 Merge branch 'libbpf-fix-btf-dedup-to-support-recursive-typedef'
Paul Houssel says:

====================
libbpf: fix BTF dedup to support recursive typedef

Pahole fails to encode BTF for some Go projects (e.g. Kubernetes and
Podman) due to recursive type definitions that create reference loops
not representable in C. These recursive typedefs trigger a failure in
the BTF deduplication algorithm.

This patch extends btf_dedup_struct_types() to properly handle potential
recursion for BTF_KIND_TYPEDEF, similar to how recursion is already
handled for BTF_KIND_STRUCT. This allows pahole to successfully
generate BTF for Go binaries using recursive types without impacting
existing C-based workflows.

Changes in v4: fix typo found by Claude-based CI

Changes in v3:
  1. Patch 1: Adjusted the comment of btf_dedup_ref_type() to refer to
  typedef as well.
  2. Patch 2: Update of the "dedup: recursive typedef" test to include a
  duplicated version of the types to make sure deduplication still happens
  in this case.

Changes in v2:
  1. Patch 1: Refactored code to prevent copying existing logic. Instead of
  adding a new function we modify the existing btf_dedup_struct_type()
  function to handle the BTF_KIND_TYPEDEF case. Calls to btf_hash_struct()
  and btf_shallow_equal_struct() are replaced with calls to functions that
  select btf_hash_struct() / btf_hash_typedef() based on the type.
  2. Patch 2: Added tests

v3: https://lore.kernel.org/lkml/cover.1763024337.git.paul.houssel@orange.com/

v2: https://lore.kernel.org/lkml/cover.1762956564.git.paul.houssel@orange.com/

v1: https://lore.kernel.org/lkml/20251107153408.159342-1-paulhoussel2@gmail.com/
====================

Link: https://patch.msgid.link/cover.1763037045.git.paul.houssel@orange.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-11-14 17:07:21 -08:00
Paul Houssel
a69e09823e selftests/bpf: Add BTF dedup tests for recursive typedef definitions
Add several ./test_progs tests:
    1.  btf/dedup:recursive typedef ensures that deduplication no
	longer fails on recursive typedefs.
    2.  btf/dedup:typedef ensures that typedefs are deduplicated correctly
	just as they were before this patch.

Signed-off-by: Paul Houssel <paul.houssel@orange.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/9fac2f744089f6090257d4c881914b79f6cd6c6a.1763037045.git.paul.houssel@orange.com
2025-11-14 17:07:20 -08:00
Paul Houssel
3781413465 libbpf: Fix BTF dedup to support recursive typedef definitions
Handle recursive typedefs in BTF deduplication

Pahole fails to encode BTF for some Go projects (e.g. Kubernetes and
Podman) due to recursive type definitions that create reference loops
not representable in C. These recursive typedefs trigger a failure in
the BTF deduplication algorithm.

This patch extends btf_dedup_ref_type() to properly handle potential
recursion for BTF_KIND_TYPEDEF, similar to how recursion is already
handled for BTF_KIND_STRUCT. This allows pahole to successfully
generate BTF for Go binaries using recursive types without impacting
existing C-based workflows.

Suggested-by: Tristan d'Audibert <tristan.daudibert@gmail.com>
Co-developed-by: Martin Horth <martin.horth@telecom-sudparis.eu>
Co-developed-by: Ouail Derghal <ouail.derghal@imt-atlantique.fr>
Co-developed-by: Guilhem Jazeron <guilhem.jazeron@inria.fr>
Co-developed-by: Ludovic Paillat <ludovic.paillat@inria.fr>
Co-developed-by: Robin Theveniaut <robin.theveniaut@irit.fr>
Signed-off-by: Martin Horth <martin.horth@telecom-sudparis.eu>
Signed-off-by: Ouail Derghal <ouail.derghal@imt-atlantique.fr>
Signed-off-by: Guilhem Jazeron <guilhem.jazeron@inria.fr>
Signed-off-by: Ludovic Paillat <ludovic.paillat@inria.fr>
Signed-off-by: Robin Theveniaut <robin.theveniaut@irit.fr>
Signed-off-by: Paul Houssel <paul.houssel@orange.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/bf00857b1e06f282aac12f6834de7396a7547ba6.1763037045.git.paul.houssel@orange.com
2025-11-14 17:07:20 -08:00
Alexei Starovoitov
c133390398 selftests/bpf: Fix failure paths in send_signal test
When test_send_signal_kern__open_and_load() fails, the parent closes the
pipe, which causes ASSERT_EQ(read(pipe_p2c...)) to fail, but the child
continues and enters an infinite loop while the parent is stuck in wait(NULL).
Other error paths have a similar issue, so kill the child before waiting on it.

The bug was discovered while compiling all of selftests with -O1 instead of -O2
which caused progs/test_send_signal_kern.c to fail to load.

Fixes: ab8b7f0cb3 ("tools/bpf: Add self tests for bpf_send_signal_thread()")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251113171153.2583-1-alexei.starovoitov@gmail.com
2025-11-14 17:02:25 -08:00
Alexei Starovoitov
63066b7a8e selftests/bpf: Convert glob_match() to bpf arena
Increase arena test coverage.
Convert glob_match() to bpf arena in two steps:
1.
Copy paste lib/glob.c into bpf_arena_strsearch.h
Copy paste lib/globtests.c into progs/arena_strsearch.c

2.
Add __arena to pointers
Add __arg_arena to global functions that accept arena pointers
Add cond_break to loops

The test also serves as a good example of what's possible
with bpf arena and how existing algorithms can be converted.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251111032931.21430-1-alexei.starovoitov@gmail.com
2025-11-14 13:57:28 -08:00
Menglong Dong
fea3f5e83c bpf: Handle return value of ftrace_set_filter_ip in register_fentry
The error returned by ftrace_set_filter_ip() in register_fentry() is
not handled properly. Just fix it.

Fixes: 00963a2e75 ("bpf: Support bpf_trampoline on functions with IPMODIFY (e.g. livepatch)")
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20251110120705.1553694-1-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-14 13:31:30 -08:00
Eduard Zingerman
e5d2e34e72 bpf: Add missing checks to avoid verbose verifier log
There are a few places where log level is not checked before calling
"verbose()". This forces programs working only at
BPF_LOG_LEVEL_STATS (e.g. veristat) to allocate unnecessarily large
log buffers. Add missing checks.

Reported-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251114200542.912386-1-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-14 13:29:12 -08:00
Sahil Chandna
c1da3df719 bpf: Prevent nesting overflow in bpf_try_get_buffers
bpf_try_get_buffers() returns one of multiple per-CPU buffers based on a
per-CPU nesting counter. This mechanism expects that buffers are not
endlessly acquired before being returned. migrate_disable() ensures that a
task remains on the same CPU, but it does not prevent the task from being
preempted by another task on that CPU.

Without disabled preemption, a task may be preempted while holding a
buffer, allowing another task to run on the same CPU and acquire an
additional buffer. Several such preemptions can cause the per-CPU
nest counter to exceed MAX_BPRINTF_NEST_LEVEL and trigger the warning in
bpf_try_get_buffers(). Adding preempt_disable()/preempt_enable() around
buffer acquisition and release prevents this task preemption and
preserves the intended bounded nesting behavior.
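
A minimal sketch of the resulting pattern (try_get_buffer()/put_buffer() are
simplified placeholders for the real acquisition and release paths, not the
exact kernel functions):

	void *buf;

	preempt_disable();		/* pin the task: nothing else can run on
					 * this CPU and grow the nest level */
	buf = try_get_buffer();		/* bumps this CPU's nesting counter */
	if (buf) {
		/* ... format the bprintf output ... */
		put_buffer(buf);	/* drops the nesting counter */
	}
	preempt_enable();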

Reported-by: syzbot+b0cff308140f79a9c4cb@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/68f6a4c8.050a0220.1be48.0011.GAE@google.com/
Fixes: 4223bf833c ("bpf: Remove preempt_disable in bpf_try_get_buffers")
Suggested-by: Yonghong Song <yonghong.song@linux.dev>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Sahil Chandna <chandna.sahil@gmail.com>
Link: https://lore.kernel.org/r/20251114064922.11650-1-chandna.sahil@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-14 13:06:47 -08:00
Rene Rebe
79bd8c9814 ps3disk: use memcpy_{from,to}_bvec index
With 6e0a48552b ("ps3disk: use memcpy_{from,to}_bvec") converting
ps3disk to the new bvec helpers, incrementing the offset was accidentally
lost, corrupting consecutive buffers. Restore the index so consecutive
data transfers are no longer corrupted.
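
A hedged sketch of the restored pattern (buffer and variable names are
assumptions, not the literal driver code):

	unsigned int offset = 0;

	rq_for_each_segment(bvec, req, iter) {
		if (gather)
			memcpy_from_bvec(dev->bounce_buf + offset, &bvec);
		else
			memcpy_to_bvec(&bvec, dev->bounce_buf + offset);
		offset += bvec.bv_len;	/* this increment is what got lost */
	}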

Fixes: 6e0a48552b (ps3disk: use memcpy_{from,to}_bvec)
Signed-off-by: René Rebe <rene@exactco.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-14 09:10:16 -07:00
Jens Axboe
fa0d2dc69e Merge branch 'p2pdma-mmio-6.19.v5' into for-6.19/block
Merge MMIO P2P DMA series from Leon:

"This patch series improves block layer and NVMe driver support for MMIO
 memory regions, particularly for peer-to-peer (P2P) DMA transfers that
 go through the host bridge.

 The series addresses a critical gap where P2P transfers through the
 host bridge (PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) were not properly marked
 as MMIO memory, leading to potential issues with:

 - Inappropriate CPU cache synchronization operations on MMIO regions
 - Incorrect DMA mapping/unmapping that doesn't respect MMIO semantics
 - Missing IOMMU configuration for MMIO memory handling

 This work is extracted from the larger DMA physical API improvement
 series [1] and focuses specifically on block layer and NVMe
 requirements for MMIO memory support.

 [1] https://lore.kernel.org/all/cover.1757423202.git.leonro@nvidia.com/"

Link: https://lore.kernel.org/linux-block/20251114-block-with-mmio-v5-0-69d00f73d766@nvidia.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* p2pdma-mmio-6.19.v5:
  block-dma: properly take MMIO path
  nvme-pci: migrate to dma_map_phys instead of map_page
2025-11-14 05:10:09 -07:00
Leon Romanovsky
37f0c7a8df block-dma: properly take MMIO path
In commit eadaa8b255 ("dma-mapping: introduce new DMA attribute to
indicate MMIO memory"), DMA_ATTR_MMIO attribute was added to describe
MMIO addresses, which require to avoid any memory cache flushing, as
an outcome of the discussion pointed in Link tag below.

In the case of a PCI_P2PDMA_MAP_THRU_HOST_BRIDGE transfer, the blk-mq-dma
logic treated this as a regular page and relied on the "struct page" DMA flow.
That flow performs CPU cache flushing, which shouldn't be done here,
and doesn't set IOMMU_MMIO flag in DMA-IOMMU case.

As a solution, let's encode peer-to-peer transaction type in NVMe IOD
flags variable and provide it to blk-mq-dma API.

Link: https://lore.kernel.org/all/f912c446-1ae9-4390-9c11-00dce7bf0fd3@arm.com/
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-14 05:09:56 -07:00
Leon Romanovsky
61d43b1731 nvme-pci: migrate to dma_map_phys instead of map_page
After the introduction of dma_map_phys(), there is no need to convert
from a physical address to a struct page in order to map the page. So let's
use it directly.

Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-14 05:09:56 -07:00
Jakub Kicinski
df58ee7d8f Merge branch 'net-phy-mscc-add-support-for-phy-led-control'
Lad Prabhakar says:

====================
net: phy: mscc: Add support for PHY LED control

This patch series adds support for controlling the PHY LEDs on the
VSC85xx family of PHYs from Microsemi (now part of Renesas).
The first two patches simplify and consolidate the existing probe code;
the third patch introduces the LED control functionality.
The LED control feature allows users to configure the LED behavior
based on link activity, speed, and other criteria.
====================

Link: https://patch.msgid.link/20251112135715.1017117-1-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:48:46 -08:00
Lad Prabhakar
df39794319 net: phy: mscc: Handle devm_phy_package_join() failure in vsc85xx_probe_common()
devm_phy_package_join() may fail and return a negative error code.
Update vsc85xx_probe_common() to properly handle this failure by
checking the return value and propagating the error to the caller.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251112135715.1017117-5-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:48:43 -08:00
Lad Prabhakar
eb47c5c488 net: phy: mscc: Add support for PHY LED control
Add support for the PHY LED controller in the MSCC VSC85xx driver. The
implementation provides LED brightness and hardware control through the
LED subsystem and integrates with the standard 'netdev' trigger.

Introduce new register definitions for the LED behavior register
(MSCC_PHY_LED_BEHAVIOR = 30) and the LED combine disable bits, which
control whether LEDs indicate link-only or combined link and activity
status. Implement a helper, vsc8541_led_combine_disable_set(), to update
these bits safely using phy_modify().
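
A minimal sketch of that helper, assuming a one-bit-per-LED combine-disable
layout in MSCC_PHY_LED_BEHAVIOR (the real bit positions and arguments are
defined in the driver, so this is only an approximation):

static int vsc8541_led_combine_disable_set(struct phy_device *phydev,
					   u8 led, bool disable)
{
	u16 mask = BIT(led);	/* assumed combine-disable bit for this LED */

	return phy_modify(phydev, MSCC_PHY_LED_BEHAVIOR, mask,
			  disable ? mask : 0);
}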

Add support for LED brightness control and hardware mode configuration.
The new callbacks implement the standard LED class operations, allowing
user control through sysfs. The brightness control maps to PHY LED force
on/off modes. The hardware control get and set functions translate
between the PHY-specific LED mode encodings and the LED subsystem
TRIGGER_NETDEV_* rules.

The combine feature is managed automatically based on the selected
rules. When both RX and TX activity are disabled, the combine feature is
turned off, causing LEDs to indicate link-only status. When either RX or
TX activity is enabled, the combine feature remains active and LEDs
indicate combined link and activity.

Register the LED callbacks for all VSC85xx PHY variants so that the LED
subsystem can manage their indicators consistently. Existing device tree
LED configuration and default behavior are preserved.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251112135715.1017117-4-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:48:43 -08:00
Lad Prabhakar
217eb2d60f net: phy: mscc: Consolidate probe functions into a common helper
Unify the probe implementations of the VSC85xx PHY family into a single
vsc85xx_probe_common() helper. The existing probe functions for the
vsc85xx, vsc8514, vsc8574, and vsc8584 variants contained almost
identical initialization logic, differing only in configuration
parameters such as the number of LEDs, supported LED modes, hardware
statistics, and PTP support.

Introduce a vsc85xx_probe_config structure to describe the per-variant
parameters, and move all common setup code into the shared helper. Each
variant's probe function now defines a constant configuration instance
and calls vsc85xx_probe_common().

Also mark the default LED mode array parameter as const to match its
usage.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251112135715.1017117-3-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:48:43 -08:00
Lad Prabhakar
c31783c2b5 net: phy: mscc: Simplify LED mode update using phy_modify()
The vsc85xx_led_cntl_set() function currently performs a manual
read-modify-write sequence protected by the PHY lock to update the
LED mode register (MSCC_PHY_LED_MODE_SEL).

Replace this sequence with a call to phy_modify(), which already
handles read-modify-write operations with proper locking inside
the PHY core.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/20251112135715.1017117-2-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:48:43 -08:00
Oliver Neukum
de9c41624c net: usb: usbnet: adhere to style
This satisfies the coding style.

Signed-off-by: Oliver Neukum <oneukum@suse.com>
Link: https://patch.msgid.link/20251112102610.281565-1-oneukum@suse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:47:38 -08:00
Felix Maurer
c294432be1 netlink: specs: rt-link: Add attributes for hsr
YNL wasn't able to decode the linkinfo from hsr interfaces. Add the
linkinfo attribute definitions for hsr interfaces. Example output now
looks like this:

$ ynl --spec Documentation/netlink/specs/rt-link.yaml --do getlink \
    --json '{"ifname": "hsr0"}' --output-json | jq .linkinfo
{
  "kind": "hsr",
  "data": {
    "slave1": 15,
    "slave2": 13,
    "supervision-addr": "01:15:4e:00:01:00",
    "seq-nr": 64511,
    "version": 1,
    "protocol": 0
  }
}

Signed-off-by: Felix Maurer <fmaurer@redhat.com>
Link: https://patch.msgid.link/926077a70de614f1539c905d06515e258905255e.1762968225.git.fmaurer@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:46:31 -08:00
Vladimir Oltean
55f943c6af net: pcs: xpcs-plat: fix MODULE_AUTHOR
This field needs to hold just Serge's name.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251112211118.700875-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:41:06 -08:00
Heiner Kallweit
4aa73c6051 net: dsa: remove definition of struct dsa_switch_driver
Since 93e86b3bc8 ("net: dsa: Remove legacy probing support")
this struct no longer has any users.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://patch.msgid.link/4053a98f-052f-4dc1-a3d4-ed9b3d3cc7cb@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:40:22 -08:00
Kriish Sharma
992b7d5fd8 dpll: zl3073x: fix kernel-doc name and missing parameter in fw.c
Documentation build reported:

  Warning: drivers/dpll/zl3073x/fw.c:365 function parameter 'comp' not described in 'zl3073x_fw_component_flash'
  Warning: drivers/dpll/zl3073x/fw.c:365 expecting prototype for zl3073x_flash_bundle_flash(). Prototype was for zl3073x_fw_component_flash() instead
  Warning: drivers/dpll/zl3073x/fw.c:365 No description found for return value of 'zl3073x_fw_component_flash'

The kernel-doc comment above `zl3073x_fw_component_flash()` used
the wrong function name (`zl3073x_flash_bundle_flash`) and omitted
the `@comp` parameter. Update the comment to correctly document
the `zl3073x_fw_component_flash()` function and its arguments.
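
A sketch of the corrected kernel-doc shape (the description text and the name
of the device parameter are assumptions; only the function name and @comp come
from the warnings above):

/**
 * zl3073x_fw_component_flash - flash a single firmware component
 * @zldev: zl3073x device (assumed parameter name)
 * @comp: firmware component to flash
 *
 * Return: 0 on success, a negative error code otherwise.
 */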

Signed-off-by: Kriish Sharma <kriish.sharma2006@gmail.com>
Reviewed-by: Ivan Vecera <ivecera@redhat.com>
Link: https://patch.msgid.link/20251112055642.2597450-1-kriish.sharma2006@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:26:23 -08:00
Chen Ni
205305c028 net/sched: act_ife: convert comma to semicolon
Replace comma between expressions with semicolons.

Using a ',' in place of a ';' can have unintended side effects.
Although that is not the case here, it seems best to use ';'
unless ',' is intended.
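
An illustration of the change (generic statements, not the driver's actual
code):

	/* before: the comma operator chains the assignments */
	opt.len = len, opt.type = type;

	/* after: two separate statements */
	opt.len = len;
	opt.type = type;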

Found by inspection.
No functional change intended.
Compile tested only.

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251112072709.73755-1-nichen@iscas.ac.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:22:00 -08:00
Yue Haibing
c3716126cf vxlan: Remove unused declarations eth_vni_hash() and fdb_head_index()
Commit 1f763fa808 ("vxlan: Convert FDB table to rhashtable") removed the
implementations but left the declarations.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20251112092055.3546703-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 17:20:54 -08:00
Viacheslav Dubeyko
ed490f36f4 hfsplus: fix volume corruption issue for generic/070
The xfstests' test-case generic/070 leaves HFS+ volume
in corrupted state:

sudo ./check generic/070
FSTYP -- hfsplus
PLATFORM -- Linux/x86_64 hfsplus-testing-0001 6.17.0-rc1+ #4 SMP PREEMPT_DYNAMIC Wed Oct 1 15:02:44 PDT 2025
MKFS_OPTIONS -- /dev/loop51
MOUNT_OPTIONS -- /dev/loop51 /mnt/scratch

generic/070 _check_generic_filesystem: filesystem on /dev/loop50 is inconsistent
(see xfstests-dev/results//generic/070.full for details)

Ran: generic/070
Failures: generic/070
Failed 1 of 1 tests

sudo fsck.hfsplus -d /dev/loop50
** /dev/loop50
Using cacheBlockSize=32K cacheTotalBlock=1024 cacheSize=32768K.
Executing fsck_hfs (version 540.1-Linux).
** Checking non-journaled HFS Plus Volume.
The volume name is test
** Checking extents overflow file.
Unused node is not erased (node = 1)
** Checking catalog file.
** Checking multi-linked files.
** Checking catalog hierarchy.
** Checking extended attributes file.
** Checking volume bitmap.
** Checking volume information.
Verify Status: VIStat = 0x0000, ABTStat = 0x0000 EBTStat = 0x0004
CBTStat = 0x0000 CatStat = 0x00000000
** Repairing volume.
** Rechecking volume.
** Checking non-journaled HFS Plus Volume.
The volume name is test
** Checking extents overflow file.
** Checking catalog file.
** Checking multi-linked files.
** Checking catalog hierarchy.
** Checking extended attributes file.
** Checking volume bitmap.
** Checking volume information.
** The volume test was repaired successfully.

It is possible to see that fsck.hfsplus detected an unused node that was
not erased in the extents overflow file. The HFS+ logic has a special
method that decides whether a node should be erased:

bool hfs_bnode_need_zeroout(struct hfs_btree *tree)
{
	struct super_block *sb = tree->inode->i_sb;
	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
	const u32 volume_attr = be32_to_cpu(sbi->s_vhdr->attributes);

	return tree->cnid == HFSPLUS_CAT_CNID &&
		volume_attr & HFSPLUS_VOL_UNUSED_NODE_FIX;
}

However, this method works only for the case of the catalog file, while
debugging of the issue has shown that the HFSPLUS_VOL_UNUSED_NODE_FIX
attribute has been requested for the extents overflow file too:

catalog file
kernel: hfsplus: node 4, num_recs 0, flags 0x10
kernel: hfsplus: tree->cnid 4, volume_attr 0x80000800

extents overflow file
kernel: hfsplus: node 1, num_recs 0, flags 0x10
kernel: hfsplus: tree->cnid 3, volume_attr 0x80000800

This patch modifies hfs_bnode_need_zeroout() to check only volume_attr
and not the b-tree ID, because node zeroing can be requested for all
HFS+ b-tree types.
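
With that change, the method quoted above reduces to checking the volume
attribute alone (a sketch of the described result, not the literal patch):

bool hfs_bnode_need_zeroout(struct hfs_btree *tree)
{
	struct super_block *sb = tree->inode->i_sb;
	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
	const u32 volume_attr = be32_to_cpu(sbi->s_vhdr->attributes);

	return volume_attr & HFSPLUS_VOL_UNUSED_NODE_FIX;
}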

sudo ./check generic/070
FSTYP         -- hfsplus
PLATFORM      -- Linux/x86_64 hfsplus-testing-0001 6.18.0-rc3+ #79 SMP PREEMPT_DYNAMIC Fri Oct 31 16:07:42 PDT 2025
MKFS_OPTIONS  -- /dev/loop51
MOUNT_OPTIONS -- /dev/loop51 /mnt/scratch

generic/070 33s ...  34s
Ran: generic/070
Passed all 1 tests

Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
cc: Yangtao Li <frank.li@vivo.com>
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20251101001229.247432-1-slava@dubeyko.com
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
2025-11-13 15:04:52 -08:00
Viacheslav Dubeyko
00c14a09a7 hfs/hfsplus: prevent getting negative values of offset/length
The syzbot reported KASAN out-of-bounds issue in
hfs_bnode_move():

[   45.588165][ T9821] hfs: dst 14, src 65536, len -65536
[   45.588895][ T9821] ==================================================================
[   45.590114][ T9821] BUG: KASAN: out-of-bounds in hfs_bnode_move+0xfd/0x140
[   45.591127][ T9821] Read of size 18446744073709486080 at addr ffff888035935400 by task repro/9821
[   45.592207][ T9821]
[   45.592420][ T9821] CPU: 0 UID: 0 PID: 9821 Comm: repro Not tainted 6.16.0-rc7-dirty #42 PREEMPT(full)
[   45.592428][ T9821] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[   45.592431][ T9821] Call Trace:
[   45.592434][ T9821]  <TASK>
[   45.592437][ T9821]  dump_stack_lvl+0x1c1/0x2a0
[   45.592446][ T9821]  ? __virt_addr_valid+0x1c8/0x5c0
[   45.592454][ T9821]  ? __pfx_dump_stack_lvl+0x10/0x10
[   45.592461][ T9821]  ? rcu_is_watching+0x15/0xb0
[   45.592469][ T9821]  ? lock_release+0x4b/0x3e0
[   45.592476][ T9821]  ? __virt_addr_valid+0x1c8/0x5c0
[   45.592483][ T9821]  ? __virt_addr_valid+0x4a5/0x5c0
[   45.592491][ T9821]  print_report+0x17e/0x7c0
[   45.592497][ T9821]  ? __virt_addr_valid+0x1c8/0x5c0
[   45.592504][ T9821]  ? __virt_addr_valid+0x4a5/0x5c0
[   45.592511][ T9821]  ? __phys_addr+0xd3/0x180
[   45.592519][ T9821]  ? hfs_bnode_move+0xfd/0x140
[   45.592526][ T9821]  kasan_report+0x147/0x180
[   45.592531][ T9821]  ? _printk+0xcf/0x120
[   45.592537][ T9821]  ? hfs_bnode_move+0xfd/0x140
[   45.592544][ T9821]  ? hfs_bnode_move+0xfd/0x140
[   45.592552][ T9821]  kasan_check_range+0x2b0/0x2c0
[   45.592557][ T9821]  ? hfs_bnode_move+0xfd/0x140
[   45.592565][ T9821]  __asan_memmove+0x29/0x70
[   45.592572][ T9821]  hfs_bnode_move+0xfd/0x140
[   45.592580][ T9821]  hfs_brec_remove+0x473/0x560
[   45.592589][ T9821]  hfs_cat_move+0x6fb/0x960
[   45.592598][ T9821]  ? __pfx_hfs_cat_move+0x10/0x10
[   45.592607][ T9821]  ? seqcount_lockdep_reader_access+0x122/0x1c0
[   45.592614][ T9821]  ? lockdep_hardirqs_on+0x9c/0x150
[   45.592631][ T9821]  ? __lock_acquire+0xaec/0xd80
[   45.592641][ T9821]  hfs_rename+0x1dc/0x2d0
[   45.592649][ T9821]  ? __pfx_hfs_rename+0x10/0x10
[   45.592657][ T9821]  vfs_rename+0xac6/0xed0
[   45.592664][ T9821]  ? __pfx_vfs_rename+0x10/0x10
[   45.592670][ T9821]  ? d_alloc+0x144/0x190
[   45.592677][ T9821]  ? bpf_lsm_path_rename+0x9/0x20
[   45.592683][ T9821]  ? security_path_rename+0x17d/0x490
[   45.592691][ T9821]  do_renameat2+0x890/0xc50
[   45.592699][ T9821]  ? __pfx_do_renameat2+0x10/0x10
[   45.592707][ T9821]  ? getname_flags+0x1e5/0x540
[   45.592714][ T9821]  __x64_sys_rename+0x82/0x90
[   45.592720][ T9821]  ? entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   45.592725][ T9821]  do_syscall_64+0xf3/0x3a0
[   45.592741][ T9821]  ? exc_page_fault+0x9f/0xf0
[   45.592748][ T9821]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   45.592754][ T9821] RIP: 0033:0x7f7f73fe3fc9
[   45.592760][ T9821] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 48
[   45.592765][ T9821] RSP: 002b:00007ffc7e116cf8 EFLAGS: 00000283 ORIG_RAX: 0000000000000052
[   45.592772][ T9821] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f7f73fe3fc9
[   45.592776][ T9821] RDX: 0000200000000871 RSI: 0000200000000780 RDI: 00002000000003c0
[   45.592781][ T9821] RBP: 00007ffc7e116d00 R08: 0000000000000000 R09: 00007ffc7e116d30
[   45.592784][ T9821] R10: fffffffffffffff0 R11: 0000000000000283 R12: 00005557e81f8250
[   45.592788][ T9821] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[   45.592795][ T9821]  </TASK>
[   45.592797][ T9821]
[   45.619721][ T9821] The buggy address belongs to the physical page:
[   45.620300][ T9821] page: refcount:1 mapcount:1 mapping:0000000000000000 index:0x559a88174 pfn:0x35935
[   45.621150][ T9821] memcg:ffff88810a1d5b00
[   45.621531][ T9821] anon flags: 0xfff60000020838(uptodate|dirty|lru|owner_2|swapbacked|node=0|zone=1|lastcpupid=0x7ff)
[   45.622496][ T9821] raw: 00fff60000020838 ffffea0000d64d88 ffff888021753e10 ffff888029da0771
[   45.623260][ T9821] raw: 0000000559a88174 0000000000000000 0000000100000000 ffff88810a1d5b00
[   45.624030][ T9821] page dumped because: kasan: bad access detected
[   45.624602][ T9821] page_owner tracks the page as allocated
[   45.625115][ T9821] page last allocated via order 0, migratetype Movable, gfp_mask 0x140dca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO0
[   45.626685][ T9821]  post_alloc_hook+0x240/0x2a0
[   45.627127][ T9821]  get_page_from_freelist+0x2101/0x21e0
[   45.627628][ T9821]  __alloc_frozen_pages_noprof+0x274/0x380
[   45.628154][ T9821]  alloc_pages_mpol+0x241/0x4b0
[   45.628593][ T9821]  vma_alloc_folio_noprof+0xe4/0x210
[   45.629066][ T9821]  folio_prealloc+0x30/0x180
[   45.629487][ T9821]  __handle_mm_fault+0x34bd/0x5640
[   45.629957][ T9821]  handle_mm_fault+0x40e/0x8e0
[   45.630392][ T9821]  do_user_addr_fault+0xa81/0x1390
[   45.630862][ T9821]  exc_page_fault+0x76/0xf0
[   45.631273][ T9821]  asm_exc_page_fault+0x26/0x30
[   45.631712][ T9821] page last free pid 5269 tgid 5269 stack trace:
[   45.632281][ T9821]  free_unref_folios+0xc73/0x14c0
[   45.632740][ T9821]  folios_put_refs+0x55b/0x640
[   45.633177][ T9821]  free_pages_and_swap_cache+0x26d/0x510
[   45.633685][ T9821]  tlb_flush_mmu+0x3a0/0x680
[   45.634105][ T9821]  tlb_finish_mmu+0xd4/0x200
[   45.634525][ T9821]  exit_mmap+0x44c/0xb70
[   45.634914][ T9821]  __mmput+0x118/0x420
[   45.635286][ T9821]  exit_mm+0x1da/0x2c0
[   45.635659][ T9821]  do_exit+0x652/0x2330
[   45.636039][ T9821]  do_group_exit+0x21c/0x2d0
[   45.636457][ T9821]  __x64_sys_exit_group+0x3f/0x40
[   45.636915][ T9821]  x64_sys_call+0x21ba/0x21c0
[   45.637342][ T9821]  do_syscall_64+0xf3/0x3a0
[   45.637756][ T9821]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   45.638290][ T9821] page has been migrated, last migrate reason: numa_misplaced
[   45.638956][ T9821]
[   45.639173][ T9821] Memory state around the buggy address:
[   45.639677][ T9821]  ffff888035935300: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   45.640397][ T9821]  ffff888035935380: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   45.641117][ T9821] >ffff888035935400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   45.641837][ T9821]                    ^
[   45.642207][ T9821]  ffff888035935480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   45.642929][ T9821]  ffff888035935500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   45.643650][ T9821] ==================================================================

This commit [1] fixes the issue when an offset inside the b-tree node or
the length of the request is bigger than the b-tree node. However, that
fix still does not handle negative values of the offset or length.
Moreover, negative values of the offset or length make no sense for
b-tree operations: we could end up accessing memory before the start of
the memory page, the logic becomes complicated and unpredictable, and we
could access the wrong item(s) in the b-tree node.

This patch changes the b-tree interface by converting the signed integer
offset and length arguments to the u32 type. The goal of this conversion
is to prevent negative values from being used, unintentionally or by
mistake, in b-tree operations.
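
For illustration, the kind of prototype change this implies
(hfs_bnode_move() is shown as one example; the full patch covers the other
b-tree accessors as well):

	/* before: signed offsets and length can go negative */
	void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len);

	/* after: unsigned arguments rule out negative values by construction */
	void hfs_bnode_move(struct hfs_bnode *node, u32 dst, u32 src, u32 len);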

[1] 'commit a431930c9b ("hfs: fix slab-out-of-bounds in hfs_bnode_read()")'

Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
cc: Yangtao Li <frank.li@vivo.com>
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20251002200020.2578311-1-slava@dubeyko.com
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
2025-11-13 15:02:52 -08:00
Yang Chenzhi
152af11428 hfsplus: fix missing hfs_bnode_get() in __hfs_bnode_create
When sync() and link() are called concurrently, both threads may
enter hfs_bnode_find() without finding the node in the hash table
and proceed to create it.

Thread A:
  hfsplus_write_inode()
    -> hfsplus_write_system_inode()
      -> hfs_btree_write()
        -> hfs_bnode_find(tree, 0)
          -> __hfs_bnode_create(tree, 0)

Thread B:
  hfsplus_create_cat()
    -> hfs_brec_insert()
      -> hfs_bnode_split()
        -> hfs_bmap_alloc()
          -> hfs_bnode_find(tree, 0)
            -> __hfs_bnode_create(tree, 0)

In this case, thread A creates the bnode, sets refcnt=1, and hashes it.
Thread B also tries to create the same bnode, notices it has already
been inserted, drops its own instance, and uses the hashed one without
getting the node.

```

	node2 = hfs_bnode_findhash(tree, cnid);
	if (!node2) {                                 <- Thread A
		hash = hfs_bnode_hash(cnid);
		node->next_hash = tree->node_hash[hash];
		tree->node_hash[hash] = node;
		tree->node_hash_cnt++;
	} else {                                      <- Thread B
		spin_unlock(&tree->hash_lock);
		kfree(node);
		wait_event(node2->lock_wq,
			!test_bit(HFS_BNODE_NEW, &node2->flags));
		return node2;
	}
```

However, hfs_bnode_find() requires each call to take a reference.
Here both threads end up setting refcnt=1. When they later put the node,
this triggers:

BUG_ON(!atomic_read(&node->refcnt))

In this scenario, Thread B in fact finds the node in the hash table
rather than creating a new one, and thus must take a reference.

Fix this by calling hfs_bnode_get() when reusing a bnode newly created by
another thread to ensure the refcount is updated correctly.
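
A sketch of the fixed branch from the snippet above (mirroring the earlier
HFS fix referenced below):

```
	} else {
		spin_unlock(&tree->hash_lock);
		kfree(node);
		/* take the reference that hfs_bnode_find() callers expect */
		hfs_bnode_get(node2);
		wait_event(node2->lock_wq,
			!test_bit(HFS_BNODE_NEW, &node2->flags));
		return node2;
	}
```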

A similar bug was fixed in HFS long ago in commit
a9dc087fd3 ("fix missing hfs_bnode_get() in __hfs_bnode_create")
but the same issue remained in HFS+ until now.

Reported-by: syzbot+005d2a9ecd9fbf525f6a@syzkaller.appspotmail.com
Signed-off-by: Yang Chenzhi <yang.chenzhi@vivo.com>
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
Link: https://lore.kernel.org/r/20250829093912.611853-1-yang.chenzhi@vivo.com
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
2025-11-13 14:59:46 -08:00
Matt Bobrowski
93ce3bee31 selftests/bpf: retry bpf_map_update_elem() when E2BIG is returned
Executing the test_maps binary on platforms with extremely high core
counts may cause intermittent assertion failures in
test_update_delete() (called via test_map_parallel()). This can occur
because bpf_map_update_elem() under some circumstances (specifically
in this case while performing bpf_map_update_elem() with BPF_NOEXIST
on a BPF_MAP_TYPE_HASH with its map_flags set to BPF_F_NO_PREALLOC)
can return an E2BIG error code i.e.

error -7 7 tools/testing/selftests/bpf/test_maps.c:#: void
test_update_delete(unsigned int, void *): Assertion `err == 0' failed.
tools/testing/selftests/bpf/test_maps.c:#: void
__run_parallel(unsigned int, void (*)(unsigned int, void *), void *):
Assertion `status == 0' failed.

As it turns out, is_map_full(), which is called from alloc_htab_elem(),
can take a conservative approach when htab->use_percpu_counter is
true (which is the case here because the percpu_counter is used when a
BPF_MAP_TYPE_HASH is created with its map_flags set to
BPF_F_NO_PREALLOC). This conservative approach prioritizes preventing
over-allocation and potential issues that could arise from possibly
exceeding htab->map.max_entries in highly concurrent environments,
even if it means slightly under-utilizing the htab map's capacity.

Given that bpf_map_update_elem() from test_update_delete() can return
E2BIG, update can_retry() such that it also accounts for the E2BIG
error code (specifically only when running with map_flags being set to
BPF_F_NO_PREALLOC). The retry loop will allow the global count
belonging to the percpu_counter to become synchronized and better
reflect the current htab map's capacity.

Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20251113092519.2632079-1-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-13 14:36:28 -08:00
Jakub Kicinski
c99ebb6132 Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.18-rc6).

No conflicts, adjacent changes in:

drivers/net/phy/micrel.c
  96a9178a29 ("net: phy: micrel: lan8814 fix reset of the QSGMII interface")
  61b7ade9ba ("net: phy: micrel: Add support for non PTP SKUs for lan8814")

and a trivial one in tools/testing/selftests/drivers/net/Makefile.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-13 12:35:38 -08:00
Jens Axboe
f8f115baae Merge branch 'zcrx-updates-6.19' into for-6.19/io_uring
Merge zcrx updates from Pavel:

"Zcrx updates for 6.19. It includes a bunch of small patches,
 IORING_REGISTER_ZCRX_CTRL and RQ flushing (Patches 4-5) and David's
 work on sharing zcrx b/w multiple io_uring instances."

Link: https://lore.kernel.org/io-uring/cover.1763029704.git.asml.silence@gmail.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* zcrx-updates-6.19:
  io_uring/zcrx: share an ifq between rings
  io_uring/zcrx: add io_fill_zcrx_offsets()
  io_uring/zcrx: export zcrx via a file
  io_uring/zcrx: move io_zcrx_scrub() and dependencies up
  io_uring/zcrx: count zcrx users
  io_uring/zcrx: add sync refill queue flushing
  io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL
  io_uring/zcrx: elide passing msg flags
  io_uring/zcrx: use folio_nr_pages() instead of shift operation
  io_uring/zcrx: convert to use netmem_desc
2025-11-13 11:20:19 -07:00
David Wei
00d9148127 io_uring/zcrx: share an ifq between rings
Add a way to share an ifq from a src ring that is real (i.e. bound to a
HW RX queue) with other rings. This is done by passing a new flag
IORING_ZCRX_IFQ_REG_IMPORT in the registration struct
io_uring_zcrx_ifq_reg, alongside the fd of an exported zcrx ifq.

Signed-off-by: David Wei <dw@davidwei.uk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:19:37 -07:00
David Wei
0926f94ab3 io_uring/zcrx: add io_fill_zcrx_offsets()
Add a helper io_fill_zcrx_offsets() that sets the constant offsets in
struct io_uring_zcrx_offsets returned to userspace.

Signed-off-by: David Wei <dw@davidwei.uk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:19:37 -07:00
Pavel Begunkov
d7af80b213 io_uring/zcrx: export zcrx via a file
Add an option to wrap a zcrx instance into a file and expose it to the
user space. Currently, users can't do anything meaningful with the file,
but it'll be used in a next patch to import it into another io_uring
instance. It's implemented as a new op called ZCRX_CTRL_EXPORT for the
IORING_REGISTER_ZCRX_CTRL registration opcode.

Signed-off-by: David Wei <dw@davidwei.uk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:19:37 -07:00
David Wei
742cb2e14e io_uring/zcrx: move io_zcrx_scrub() and dependencies up
In preparation for adding zcrx ifq exporting and importing, move
io_zcrx_scrub() and its dependencies up the file to be closer to
io_close_queue().

Signed-off-by: David Wei <dw@davidwei.uk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:19:37 -07:00
Pavel Begunkov
39c9676f78 io_uring/zcrx: count zcrx users
zcrx tries to detach ifq / terminate page pools when the io_uring ctx
owning it is being destroyed. There will be multiple io_uring instances
attached to it in the future, so add a separate counter to track the
users. Note, refs can't be reused for this purpose as it is only used to
prevent zcrx and ring destruction, and is also used by page pools to keep
it alive.

Signed-off-by: David Wei <dw@davidwei.uk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:19:37 -07:00
Pavel Begunkov
475eb39b00 io_uring/zcrx: add sync refill queue flushing
Add an zcrx interface via IORING_REGISTER_ZCRX_CTRL that forces the
kernel to flush / consume entries from the refill queue. Just as with
the IORING_REGISTER_ZCRX_REFILL attempt, the motivation is to address
cases where the refill queue becomes full, and the user can't return
buffers and needs to stash them. It's still a slow path, and the user
should size refill queue appropriately, but it should be helpful for
handling temporary traffic spikes and other unpredictable conditions.

The interface is simpler compared to ZCRX_REFILL as it doesn't need
temporary refill entry arrays and gives natural batching, whereas
ZCRX_REFILL requires even more user logic to be somewhat efficient.

Also, add a structure for the operation. It's not currently used but
can serve for future improvements like limiting the number of buffers to
process, etc.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:19:37 -07:00
Pavel Begunkov
d663976dad io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL
It would be annoying, and take a fair amount of boilerplate code, to
implement every new zcrx feature as a separate io_uring register opcode.
Introduce IORING_REGISTER_ZCRX_CTRL, which will multiplex such calls to zcrx.
Note, there are no real users of the opcode in this patch.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:19:37 -07:00
Pavel Begunkov
1b8b5d0316 io_uring/zcrx: elide passing msg flags
zcrx sqe->msg_flags has never been defined and checked to be zero. It
doesn't need to be a MSG_* bitmask. Keep them undefined, don't mix
with MSG_DONTWAIT, and don't pass into io_zcrx_recv() as it's ignored
anyway.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:19:37 -07:00
Pedro Demarchi Gomes
a0169c3a62 io_uring/zcrx: use folio_nr_pages() instead of shift operation
folio_nr_pages() is a faster helper function to get the number of pages when
NR_PAGES_IN_LARGE_FOLIO is enabled.

Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:19:37 -07:00
Pavel Begunkov
f0243d2b86 io_uring/zcrx: convert to use netmem_desc
Convert zcrx to struct netmem_desc, and use struct net_iov::desc to
access its fields instead of the struct net_iov inner union aliases.
zcrx only directly reads niov->pp, so with this patch it doesn't depend
on the union anymore.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Byungchul Park <byungchul@sk.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:19:37 -07:00
Jens Axboe
5bd38e18d5 Merge branch 'zcrx-query-6.19' into for-6.19/io_uring
Merge zcrx SQ/CQ query changes from Pavel:

"Introduce zcrx and SQ/CQ layout queries. The former returns what zcrx
 features are available. And both return the ring size information to
 help with allocation size calculation for user provided rings like
 IORING_SETUP_NO_MMAP and IORING_MEM_REGION_TYPE_USER."

Link: https://lore.kernel.org/io-uring/cover.1763030298.git.asml.silence@gmail.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* zcrx-query-6.19:
  io_uring/query: introduce rings info query
  io_uring/query: introduce zcrx query
2025-11-13 11:18:19 -07:00
Pavel Begunkov
4aaa9bc4d5 io_uring/query: introduce rings info query
Same problem as with zcrx in the previous patch: the user needs to know
the SQ/CQ header sizes to allocate memory before setup and use it for user
provided rings, i.e. IORING_SETUP_NO_MMAP, however that information is
only returned after registration, hence the user is left guessing kernel
implementation details.

Return the header size and alignment; the split follows the same
motivation, letting the user know the real structure size without
alignment in case more flexible placement schemes appear in the
future.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:17:36 -07:00
Pavel Begunkov
2647e2ecc0 io_uring/query: introduce zcrx query
Add a new query type IO_URING_QUERY_ZCRX returning the user some basic
information about the interface, which includes allowed flags for areas
and registration and supported IORING_REGISTER_ZCRX_CTRL subcodes.

There is also a chicken-and-egg problem with user provided refill queue
memory: the offsets and size information is returned after registration,
but to properly allocate memory you need to know it beforehand, which is
why userspace currently has to guess the RQ header size and severely
overestimate it. Return the size information.
It's split into "size" and "alignment" fields because for default
placement modes the user is interested in the aligned size, however if
it gets support for more flexible placement, it'll need to only know the
actual header size.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 11:17:36 -07:00
Alexei Starovoitov
f1d8c6580b Merge branch 'percpu_hash-maps'
Leon Hwang says:

====================
In the discussion thread
"[PATCH bpf-next v9 0/7] bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags for percpu maps"[1],
it was pointed out that missing calls to bpf_obj_free_fields() could
lead to memory leaks.

A selftest was added to confirm that this is indeed a real issue - the
refcount of BPF_KPTR_REF field is not decremented when
bpf_obj_free_fields() is missing after copy_map_value[,_long]().

Further inspection of copy_map_value[,_long]() call sites revealed two
locations affected by this issue:

1. pcpu_copy_value()
2. htab_map_update_elem() when used with BPF_F_LOCK

Similar case happens when update local storage maps with BPF_F_LOCK.

This series fixes the cases where BPF_F_LOCK is not involved by
properly calling bpf_obj_free_fields() after copy_map_value[,_long](),
and adds a selftest to verify the fix.

The remaining cases involving BPF_F_LOCK will be addressed in a
separate patch set after the series
"bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags for percpu maps"
is applied.

Changes:
v5 -> v6:
* Update the test name to include "refcounted_kptr".
* Update some local variables' name in the test (per Alexei).
* v5: https://lore.kernel.org/bpf/20251104142714.99878-1-leon.hwang@linux.dev/

v4 -> v5:
* Use a local variable to store the this_cpu_ptr()/per_cpu_ptr() result,
  and reuse it between copy_map_value[,_long]() and
  bpf_obj_free_fields() in patch #1 (per Andrii).
* Drop patch #2 and #3, because the combination of BPF_F_LOCK with other
  special fields (except for BPF_SPIN_LOCK) will be disallowed on the
  UAPI side in the future (per Alexei).
* v4: https://lore.kernel.org/bpf/20251030152451.62778-1-leon.hwang@linux.dev/

v3 -> v4:
* Target bpf-next tree.
* Address comments from Amery:
  * Drop 'bpf_obj_free_fields()' in the path of updating local storage
    maps without BPF_F_LOCK.
  * Drop the corresponding self test.
  * Respin the other test of local storage maps using syscall BPF
    programs.
* v3: https://lore.kernel.org/bpf/20251026154000.34151-1-leon.hwang@linux.dev/

v2 -> v3:
* Free special fields when update local storage maps without BPF_F_LOCK.
* Add test to verify decrementing refcount when update cgroup local
  storage maps without BPF_F_LOCK.
* Address review from AI bot:
  * Slow path with BPF_F_LOCK (around line 642-646) in
    'bpf_local_storage.c'.
* v2: https://lore.kernel.org/bpf/20251020164608.20536-1-leon.hwang@linux.dev/

v1 -> v2:
* Add test to verify decrementing refcount when update cgroup local
  storage maps with BPF_F_LOCK.
* Address review from AI bot:
  * Fast path without bucket lock (around line 610) in
    'bpf_local_storage.c'.
* v1: https://lore.kernel.org/bpf/20251016145801.47552-1-leon.hwang@linux.dev/
====================

Link: https://patch.msgid.link/20251105151407.12723-1-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-13 09:16:20 -08:00
Leon Hwang
c1cbf0d21c selftests/bpf: Add test to verify freeing the special fields in pcpu maps
Add test to verify that updating [lru_,]percpu_hash maps decrements
refcount when BPF_KPTR_REF objects are involved.

The tests perform the following steps:
. Call update_elem() to insert an initial value.
. Use bpf_refcount_acquire() to increment the refcount.
. Store the node pointer in the map value.
. Add the node to a linked list.
. Probe-read the refcount and verify it is *2*.
. Call update_elem() again to trigger refcount decrement.
. Probe-read the refcount and verify it is *1*.

Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20251105151407.12723-3-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-13 09:15:33 -08:00
Leon Hwang
6af6e49a76 bpf: Free special fields when update [lru_,]percpu_hash maps
As [lru_,]percpu_hash maps support BPF_KPTR_{REF,PERCPU}, missing
calls to 'bpf_obj_free_fields()' in 'pcpu_copy_value()' could cause the
memory referenced by BPF_KPTR_{REF,PERCPU} fields to be held until the
map gets freed.

Fix this by calling 'bpf_obj_free_fields()' after
'copy_map_value[,_long]()' in 'pcpu_copy_value()'.
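
A minimal sketch of the resulting pattern (illustrative only; pptr_base
stands in for the map's per-CPU value area, and the real pcpu_copy_value()
also handles the onallcpus case):

    void *pptr = this_cpu_ptr(pptr_base);

    copy_map_value(&htab->map, pptr, value);
    /* release kptrs etc. that the old per-CPU value may still own */
    bpf_obj_free_fields(htab->map.record, pptr);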

Fixes: 65334e64a4 ("bpf: Support kptrs in percpu hashmap and percpu LRU hashmap")
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20251105151407.12723-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-13 09:14:15 -08:00
Jens Axboe
8e1bf774ab Merge branch 'elevator-switch-6.19' into for-6.19/block
Merge elevator switching improvements from Nilay:

"This patchset reorganizes the elevator switch path used during both
 nr_hw_queues update and elv_iosched_store() operations to address a
 recently reported lockdep splat [1].

 The warning highlights a locking dependency between ->freeze_lock and
 ->elevator_lock on pcpu_alloc_mutex, triggered when the Kyber scheduler
 dynamically allocates its private scheduling data. The fix is to ensure
 that such allocations occur outside the locked sections, thus
 eliminating the dependency chain.

 While working on this, it also became evident that the nr_hw_queue
 update code maintains two disjoint xarrays—one for elevator tags and
 another for elevator type—both serving the same purpose. Unifying these
 into a single elv_change_ctx structure improves clarity and
 maintainability.

 This series therefore implements five patches:
 The first preparatory patch unifies the elevator tags and type xarrays. It
 combines both xarrays into a single struct elv_change_ctx, simplifying
 per-queue elevator state management.

 The second patch aims to group together all elevator-related resources
 that share the same lifetime; as a first step we move the elevator tags
 pointer from struct elv_change_ctx into the newly introduced struct
 elevator_resources. The subsequent patch extends struct
 elevator_resources to include other elevator-related data.

 The third patch introduces ->alloc_sched_data and ->free_sched_data
 elevator ops which can then be used to safely allocate and free
 scheduler data.

 The fourth patch builds upon the previous one and starts using the
 newly introduced alloc/free sched data methods during elevator switch
 and nr_hw_queue update. While doing so, it ensures that sched data
 allocation and freeing happen before we acquire ->freeze_lock and
 ->elevator_lock, thus preventing the dependency on pcpu_alloc_mutex.

 The last patch of this series converts the Kyber scheduler to use the
 new methods introduced in the previous patch. It moves Kyber's scheduler
 data allocation and teardown logic from ->init_sched and ->exit_sched
 into the new methods, ensuring memory operations are performed outside
 locked sections.

 Together, these changes simplify the elevator switch logic and prevent
 the reported lockdep splat."

Link: https://lore.kernel.org/linux-block/20251113090619.2030737-1-nilay@linux.ibm.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* elevator-switch-6.19:
  block: define alloc_sched_data and free_sched_data methods for kyber
  block: use {alloc|free}_sched data methods
  block: introduce alloc_sched_data and free_sched_data elevator methods
  block: move elevator tags into struct elevator_resources
  block: unify elevator tags and type xarrays into struct elv_change_ctx
2025-11-13 09:28:00 -07:00
Nilay Shroff
d4c3ef56a1 block: define alloc_sched_data and free_sched_data methods for kyber
Currently, the Kyber elevator allocates its private data dynamically in
->init_sched and frees it in ->exit_sched. However, since ->init_sched
is invoked during elevator switch after acquiring both ->freeze_lock and
->elevator_lock, it may trigger the lockdep splat [1] due to dependency
on pcpu_alloc_mutex.

To resolve this, move the elevator data allocation and deallocation
logic from ->init_sched and ->exit_sched into the newly introduced
->alloc_sched_data and ->free_sched_data methods. These callbacks are
invoked before acquiring ->freeze_lock and ->elevator_lock, ensuring
that memory allocation happens safely without introducing additional
locking dependencies.

This change breaks the dependency chain involving pcpu_alloc_mutex and
prevents the reported lockdep warning.
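
A sketch of where the allocation now sits relative to the locks (ordering
only; kqd_alloc() is an illustrative stand-in for Kyber's data allocation,
not the real function name):

    /* called with no queue locks held, so pcpu allocations are safe */
    data = kqd_alloc(q);                /* via ->alloc_sched_data */

    /*
     * Only then: freeze the queue (->freeze_lock), take ->elevator_lock,
     * switch the elevator and hand 'data' over to Kyber's ->init_sched.
     */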

[1] https://lore.kernel.org/all/CAGVVp+VNW4M-5DZMNoADp6o2VKFhi7KxWpTDkcnVyjO0=-D5+A@mail.gmail.com/

Reported-by: Changhui Zhong <czhong@redhat.com>
Reported-by: Yi Zhang <yi.zhang@redhat.com>
Closes: https://lore.kernel.org/all/CAGVVp+VNW4M-5DZMNoADp6o2VKFhi7KxWpTDkcnVyjO0=-D5+A@mail.gmail.com/
Tested-by: Yi Zhang <yi.zhang@redhat.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:27:49 -07:00
Nilay Shroff
0315476e78 block: use {alloc|free}_sched data methods
The previous patch introduced ->alloc_sched_data and
->free_sched_data methods. This patch builds upon that
by now using these methods during elevator switch and
nr_hw_queue update.

It's also ensured that scheduler-specific data is
allocated and freed through the new callbacks outside
of the ->freeze_lock and ->elevator_lock locking contexts,
thereby preventing any dependency on pcpu_alloc_mutex.

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:27:49 -07:00
Nilay Shroff
61019afdf6 block: introduce alloc_sched_data and free_sched_data elevator methods
The recent lockdep splat [1] highlights a potential deadlock risk
involving ->elevator_lock and ->freeze_lock dependencies on pcpu_alloc_mutex.
The trace shows that the issue occurs when the Kyber scheduler
allocates dynamic memory for its elevator data during initialization.

To address this, introduce two new elevator operation callbacks:
->alloc_sched_data and ->free_sched_data. A subsequent patch will
build upon these newly introduced methods to suppress the lockdep splat [1].
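
A sketch of the shape of the new callbacks (signatures illustrative, not
the exact merged definitions):

    struct elevator_mq_ops {
        /* ... existing ops ... */
        void *(*alloc_sched_data)(struct request_queue *q, gfp_t gfp);
        void (*free_sched_data)(void *sched_data);
    };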

[1] https://lore.kernel.org/all/CAGVVp+VNW4M-5DZMNoADp6o2VKFhi7KxWpTDkcnVyjO0=-D5+A@mail.gmail.com/

Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:27:49 -07:00
Nilay Shroff
04728ce909 block: move elevator tags into struct elevator_resources
This patch introduces a new structure, struct elevator_resources, to
group together all elevator-related resources that share the same
lifetime. As a first step, this change moves the elevator tag pointer
from struct elv_change_ctx into the new struct elevator_resources.

Additionally, rename blk_mq_alloc_sched_tags_batch() and
blk_mq_free_sched_tags_batch() to blk_mq_alloc_sched_res_batch() and
blk_mq_free_sched_res_batch(), respectively. Introduce two new wrapper
helpers, blk_mq_alloc_sched_res() and blk_mq_free_sched_res(), around
blk_mq_alloc_sched_tags() and blk_mq_free_sched_tags().

These changes pave the way for consolidating the allocation and freeing
of elevator-specific resources into common helper functions. This
refactoring improves encapsulation and prepares the code for future
extensions, allowing additional elevator-specific data to be added to
struct elevator_resources without cluttering struct elv_change_ctx.

Subsequent patches will extend struct elevator_resources to include
other elevator-related data.

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:27:49 -07:00
Nilay Shroff
232143b605 block: unify elevator tags and type xarrays into struct elv_change_ctx
Currently, the nr_hw_queues update path manages two disjoint xarrays —
one for elevator tags and another for elevator type — both used during
elevator switching. Maintaining these two parallel structures for the
same purpose adds unnecessary complexity and potential for mismatched
state.

This patch unifies both xarrays into a single structure, struct
elv_change_ctx, which holds all per-queue elevator change context. A
single xarray, named elv_tbl, now maps each queue (q->id) in a tagset
to its corresponding elv_change_ctx entry, encapsulating the elevator
tags, type and name references.
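
A sketch of the shape of the per-queue context (field names illustrative,
not the exact merged definition):

    struct elv_change_ctx {
        const char *name;            /* requested elevator name   */
        struct elevator_type *type;  /* resolved elevator type    */
        struct elevator_tags *et;    /* pre-allocated sched tags  */
    };

    /*
     * A single xarray (elv_tbl), indexed by q->id, replaces the two
     * parallel xarrays.
     */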

This unification simplifies the code, improves maintainability, and
clarifies ownership of per-queue elevator state.

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:27:49 -07:00
Jens Axboe
c3f42a6de7 Merge branch 'bcache-updates-6.19' into for-6.19/block
Merge bcache updates from Coly for 6.19:

"The major change is from me, which is to remove useless discard
 interface and code for cache device (not the backing device). And the
 last patch about gc latency is a cooperative result from Robert Pang
 (Google), Mingzhe Zou (Easystack) and me, by inspired from their
 previous works, I compose the final version and Robert prvides positive
 benchmark result.

 Marco contributes 2 patches to improve the usage of  per-cpu system
 work queue. Gustavo contributes a patch to fix the not-at-end
 flexible-array member warning by gcc14. And Qianfeng contributes a code
 cleanup patch to remove redundant __GFP_NOWARN."

Link: https://lore.kernel.org/linux-block/20251113053630.54218-1-colyli@fnnas.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* bcache-updates-6.19:
  bcache: Avoid -Wflex-array-member-not-at-end warning
  bcache: WQ_PERCPU added to alloc_workqueue users
  bcache: replace use of system_wq with system_percpu_wq
  bcache: remove redundant __GFP_NOWARN
  bcache: reduce gc latency by processing less nodes and sleep less time
  bcache: remove discard sysfs interface document
  bcache: drop discard sysfs interface
  bcache: remove discard code from alloc.c
  bcache: get rid of discard code from journal
2025-11-13 09:18:19 -07:00
Gustavo A. R. Silva
699122b590 bcache: Avoid -Wflex-array-member-not-at-end warning
-Wflex-array-member-not-at-end was introduced in GCC-14, and we are
getting ready to enable it, globally.

Use the new TRAILING_OVERLAP() helper to fix the following warning:

drivers/md/bcache/bset.h:330:27: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]

This helper creates a union between a flexible-array member (FAM) and a
set of MEMBERS that would otherwise follow it.

This overlays the trailing MEMBER struct btree_iter_set stack_data[MAX_BSETS];
onto the FAM struct btree_iter::data[], while keeping the FAM and the start
of MEMBER aligned.

The static_assert() ensures this alignment remains, and it's
intentionally placed immediately after the corresponding structures -- no
blank line in between.
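
Roughly, the helper expands to a union of this shape (simplified; see the
kernel's TRAILING_OVERLAP() definition for the real macro):

    union {
        struct btree_iter iter;      /* ends in the FAM: data[] */
        struct {
            u8 __pad[offsetof(struct btree_iter, data)];
            struct btree_iter_set stack_data[MAX_BSETS];
        };
    };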

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Coly Li <colyli@fnnas.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:18:06 -07:00
Marco Crivellari
c0c8082142 bcache: WQ_PERCPU added to alloc_workqueue users
Currently, if a user enqueues a work item using schedule_delayed_work(), the
wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work(), which uses system_wq, and queue_work(), which again makes
use of WORK_CPU_UNBOUND.
This lack of consistency cannot be addressed without refactoring the API.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND.

This default is suboptimal: most workloads benefit from unbound queues,
allowing the scheduler to place worker threads where they’re needed and
reducing noise when CPUs are isolated.

This patch continues the effort to refactor workqueue APIs, which began
with the change introducing new workqueues and a new alloc_workqueue flag:

commit 128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

This change adds a new WQ_PERCPU flag to explicitly request
alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU.

Once migration is complete, WQ_UNBOUND can be removed and unbound will
become the implicit default.
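
For bcache this means calls of the following shape (a sketch, not the
exact hunks):

    - wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
    + wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM | WQ_PERCPU, 0);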

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Signed-off-by: Coly Li <colyli@fnnas.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:18:06 -07:00
Marco Crivellari
fd82071814 bcache: replace use of system_wq with system_percpu_wq
Currently, if a user enqueues a work item using schedule_delayed_work(), the
wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work(), which uses system_wq, and queue_work(), which again makes
use of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the API.

This patch continues the effort to refactor workqueue APIs, which began
with the change introducing new workqueues and a new alloc_workqueue flag:

commit 128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

system_wq should be the per-cpu workqueue, yet in this name nothing makes
that clear, so replace system_wq with system_percpu_wq.

The old wq (system_wq) will be kept for a few release cycles.
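
In practice the conversion is of the form (sketch):

    - queue_work(system_wq, work);
    + queue_work(system_percpu_wq, work);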

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Signed-off-by: Coly Li <colyli@fnnas.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:18:06 -07:00
Qianfeng Rong
21194c44b6 bcache: remove redundant __GFP_NOWARN
GFP_NOWAIT already includes __GFP_NOWARN, so let's remove the redundant
__GFP_NOWARN.
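
I.e. changes of the form (sketch):

    - w = kzalloc(sizeof(*w), GFP_NOWAIT | __GFP_NOWARN);
    + w = kzalloc(sizeof(*w), GFP_NOWAIT);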

Signed-off-by: Qianfeng Rong <rongqianfeng@vivo.com>
Acked-by: Coly Li <colyli@fnnas.com>
Acked-by: Coly Li <colyli@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:18:06 -07:00
Coly Li
70bc173ce0 bcache: reduce gc latency by processing less nodes and sleep less time
When the bcache device is busy with high I/O load, there are two methods
to reduce garbage collection latency:
- Process fewer nodes in each loop of incremental garbage collection in
  btree_gc_recurse().
- Sleep for less time between two full garbage collections in
  bch_btree_gc().

This patch introduces two helper routines to provide different garbage
collection node counts and sleep interval times (a rough sketch of their
shape is included below).
- btree_gc_min_nodes()
  If there is no front-end I/O, return 128 nodes to process in each
  incremental loop, otherwise only 10 nodes are returned. Then front-end
  I/O is able to access the btree earlier.
- btree_gc_sleep_ms()
  If there is no synchronized wait for bucket allocation, sleep 100 ms
  between two incremental GC loops. Otherwise only sleep 10 ms before the
  next incremental GC loop. Then a faster GC may provide available buckets
  earlier, to avoid most of the bcache worker threads being starved by
  bucket allocation.

The idea is inspired by previous work from Mingzhe Zou and Robert Pang,
but is much simpler and the expected behavior is more predictable.
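
A rough sketch of the shape of the two helpers (the predicates here are
placeholders for however the code actually detects front-end I/O and
allocator waiters):

    static unsigned int btree_gc_min_nodes(struct cache_set *c)
    {
        /* process fewer nodes per loop when front-end I/O is waiting */
        return front_end_io_busy(c) ? 10 : 128;
    }

    static unsigned int btree_gc_sleep_ms(struct cache_set *c)
    {
        /* sleep less between loops when bucket allocation is starved */
        return bucket_alloc_waiting(c) ? 10 : 100;
    }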

Signed-off-by: Coly Li <colyli@fnnas.com>
Signed-off-by: Robert Pang <robertpang@google.com>
Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:18:06 -07:00
Coly Li
7bf90cd740 bcache: remove discard sysfs interface document
This patch removes the documentation of the bcache discard sysfs
interface; it drops the discard-related sections from:
- Documentation/ABI/testing/sysfs-block-bcache
- Documentation/admin-guide/bcache.rst

Signed-off-by: Coly Li <colyli@fnnas.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:18:06 -07:00
Coly Li
73a004f83c bcache: drop discard sysfs interface
Since the discard code is removed, the sysfs interface to enable discard
is now useless. This patch removes the corresponding sysfs entry, and
removes the bool variable 'discard' from struct cache as well.

Signed-off-by: Coly Li <colyli@fnnas.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:18:06 -07:00
Coly Li
b4056afbd4 bcache: remove discard code from alloc.c
The bcache allocator initially has no free space to allocate. First it
does a garbage collection, triggered by a cache device write, which
fills free space into the ca->free[] lists. The discard happens after a
free bucket has been handled by garbage collection and added into one of
the ca->free[] lists. But normally this bucket will be allocated out to a
requester very soon and filled with data. The discard hint on this
bucket's LBA range doesn't help the SSD controller improve internal
erasure performance, and it wastes extra CPU cycles to issue discard bios.

This patch removes the almost-useless discard code from alloc.c.

Signed-off-by: Coly Li <colyli@fnnas.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:18:06 -07:00
Coly Li
0c72e9fcc1 bcache: get rid of discard code from journal
The bcache journal has discard functionality, but it is almost useless
in reality, because discard happens after a journal bucket is reclaimed
and the reclaimed bucket is immediately allocated for new journaling.
There is no time for the underlying SSD to use the discard hint for
internal data management.

The discard code in the bcache journal doesn't bring any performance
optimization and wastes CPU cycles issuing discard bios. Therefore this
patch gets rid of it from journal.c and journal.h.

Signed-off-by: Coly Li <colyli@fnnas.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:18:06 -07:00
Damien Le Moal
7b2038b1b1 dm: fix zone reset all operation processing
dm_zone_get_reset_bitmap() is used to generate a bitmap of the zones of
a zoned device target when a REQ_OP_ZONE_RESET_ALL request is being
processed. This bitmap is built by executing a zone report with a report
callback set to the function dm_zone_need_reset_cb() in struct
dm_report_zones_args. However, the cb callback pointer is no longer
the same as the callback specified by callers of the
blkdev_report_zones() function. Rather, this is a DM-internal callback
and report zones callback functions from blkdev_report_zones() are
passed using struct blk_report_zones_args, introduced with commit
db9aed869f34 ("block: introduce disk_report_zone()").

This commit changed the DM main report zones callback handler function
dm_report_zones_cb() to call the new disk_report_zone() so that callback
functions from blkdev_report_zones() are executed, and this change
resulted in the DM-internal dm_zone_need_reset_cb() callback function
no longer being executed, turning any REQ_OP_ZONE_RESET_ALL request into
a no-op.

Fix this by calling, from dm_report_zones_cb(), the DM-internal cb
function specified in struct dm_report_zones_args.

Fixes: db9aed869f34 ("block: introduce disk_report_zone()").
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:10:04 -07:00
Damien Le Moal
881880b6f3 block: fix NULL pointer dereference in disk_report_zones()
Commit 2284eec5053d ("block: introduce blkdev_get_zone_info()")
introduced the report_active field in struct blk_report_zones_args so
that open and closed zones can be reported with the condition
BLK_ZONE_COND_ACTIVE in the case of a cached report zone.
However, the args pointer to a struct blk_report_zones_args that is
passed to disk_report_zones() can be NULL, e.g. in the case of internal
report zones operations for device mapper zoned targets.

Fix disk_report_zones() to check that args is not NULL before updating
a zone condition for cached zone reports.

Fixes: 2284eec5053d ("block: introduce blkdev_get_zone_info()")
Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:10:04 -07:00
Damien Le Moal
c2b8d20628 block: fix NULL pointer dereference in blk_zone_reset_all_bio_endio()
For zoned block devices that do not need zone write plugs (e.g. most
device mapper devices that support zones), the disk hash table of zone
write plugs is NULL. For such devices, blk_zone_reset_all_bio_endio()
should not attempt to scan this hash table as that causes a NULL pointer
dereference.

Fix this by checking that the disk does have zone write plugs using the
atomic counter. This is equivalent to checking for a non-NULL hash table
but has the advantage of also speeding up the execution of
blk_zone_reset_all_bio_endio() for devices that do use zone write plugs
but do not have any plug in the hash table (e.g. a disk with only full
zones).

Fixes: efae226c2e ("block: handle zone management operations completions")
Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 09:10:04 -07:00
Russell King (Oracle)
f694d215d3 net: stmmac: always allocate mac_device_info
The ->setup() methods implemented by dwmac-loongson and dwmac-sun8i
allocate the mac_device_info structure, as does stmmac_hwif_init().
This makes no sense.

Have stmmac_hwif_init() always allocate this structure, and pass it to
the ->setup() method to initialise when it is provided. Rename this
method to "mac_setup" to more accurately describe what it is doing.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vImWK-0000000DrIx-28vO@rmk-PC.armlinux.org.uk
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-13 17:03:19 +01:00
Russell King (Oracle)
d0af55df5a net: stmmac: clean up stmmac_reset()
stmmac_reset() takes the stmmac_priv and an ioaddr. It has one call
site, which passes the priv pointer, and dereferences priv for the
ioaddr.

stmmac_reset() then checks whether priv is NULL. If it was, the caller
would have oopsed. Remove the checks for NULL, and move the dereference
for ioaddr into stmmac_reset().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vImWF-0000000DrIr-1fmn@rmk-PC.armlinux.org.uk
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-13 17:03:15 +01:00
Pavel Begunkov
d741c62555 io_uring: move cq/sq user offset init around
Move user SQ/CQ offset initialisation to the end of io_prepare_config(),
where all the information needed to set it properly has already been
calculated.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 07:27:35 -07:00
Pavel Begunkov
eb76ff6a68 io_uring: pre-calculate scq layout
Move ring layout calculations into io_prepare_config(), so that more
misconfiguration checking can be done earlier, before creating a ctx.
It also deduplicates some code with ring resizing. As a bonus, it now
initialises params->sq_off.array closer to all the other user offset
init, and also applies it to ring resizing, which was previously
missing it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 07:27:34 -07:00
Pavel Begunkov
001b76b7e7 io_uring: keep ring layout in a structure
Add a structure keeping SQ/CQ sizes and offsets. For now it only records
data previously returned from rings_size and the SQ size.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 07:27:34 -07:00
Pavel Begunkov
0f4b537363 io_uring: introduce struct io_ctx_config
There will be more information needed during ctx setup, and instead of
passing a handful of pointers around, wrap them all into a new
structure. Add a helper for encapsulating all configuration checks and
preparation, that's also reused for ring resizing.

Note, it indirectly adds an io_uring_sanitise_params() check to ring
resizing, which is a good thing.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 07:27:34 -07:00
Pavel Begunkov
929dbbb699 io_uring: convert params to pointer in ring resize
The parameters in io_register_resize_rings() will be moved into another
structure in a later patch. In preparation for that, convert the params
variable to a pointer, but still store the data on the stack.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 07:27:34 -07:00
Pavel Begunkov
94cd832916 io_uring: use size_add helpers for ring offsets
Use the size_add() / size_mul() set of functions for rings_size() calculations.
It's more consistent with struct_size(), and errors are preserved across
a series of calculations, so intermediate result checks can be omitted.
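
For example (a sketch of the pattern, not the exact rings_size() hunk):

    size_t off = struct_size(rings, cqes, cq_entries);

    off = size_add(off, size_mul(sq_entries, sizeof(u32)));
    if (off == SIZE_MAX)
        return SIZE_MAX;    /* overflow propagated through the chain */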

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 07:27:34 -07:00
Pavel Begunkov
e279bb4b4c io_uring: refactor rings_size nosqarray handling
A preparation patch inverting the IORING_SETUP_NO_SQARRAY check; this
way there is only one successful return path from the function, which
will be helpful later.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 07:27:34 -07:00
Jens Axboe
ecb8490b2f Merge branch 'io_uring-6.18' into for-6.19/io_uring
Merge 6.18-rc io_uring fixes, as certain coming changes depend on some
of these.

* io_uring-6.18:
  io_uring/rsrc: don't use blk_rq_nr_phys_segments() as number of bvecs
  io_uring/query: return number of available queries
  io_uring/rw: ensure allocated iovec gets cleared for early failure
  io_uring: fix regbuf vector size truncation
  io_uring: fix types for region size calculation
  io_uring/zcrx: remove sync refill uapi
  io_uring: fix buffer auto-commit for multishot uring_cmd
  io_uring: correct __must_hold annotation in io_install_fixed_file
  io_uring zcrx: add MAINTAINERS entry
  io_uring: Fix code indentation error
  io_uring/sqpoll: be smarter on when to update the stime usage
  io_uring/sqpoll: switch away from getrusage() for CPU accounting
  io_uring: fix incorrect unlikely() usage in io_waitid_prep()

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-13 07:26:37 -07:00
Heiner Kallweit
9f07af1d27 net: phy: fixed_phy: initialize the link status as up
All callers initialize the link status as up. This change is in line
with how of_phy_register_fixed_link() behaves.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/45f644e8-2292-4787-a27a-f69084c93218@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-13 15:20:30 +01:00
Byungchul Park
40a71b53d5 jbd2: use a weaker annotation in journal handling
jbd2 journal handling code doesn't want jbd2_might_wait_for_commit()
to be placed between start_this_handle() and stop_this_handle().  So it
marks the region with rwsem_acquire_read() and rwsem_release().

However, the annotation is too strong for that purpose.  We don't need
more than a trylock annotation for that.

rwsem_acquire_read() implies:

   1. the caller might be a waiter on contention of the lock.
   2. the caller enters the critical section of the lock.

All we need here is 2, not 1, so the trylock version of the annotation
is sufficient for that purpose.  Now that dept partially relies on
lockdep annotations, dept interprets rwsem_acquire_read() as a potential
wait and might report a deadlock because of that wait.

Replace it with the trylock version of the annotation.
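
In lockdep terms the change is of the form (sketch; the third argument of
rwsem_acquire_read() is the trylock flag):

    - rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
    + rwsem_acquire_read(&journal->j_trans_commit_map, 0, 1, _THIS_IP_);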

Signed-off-by: Byungchul Park <byungchul@sk.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@kernel.org
Message-ID: <20251024073940.1063-1-byungchul@sk.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-13 08:43:44 -05:00
Tetsuo Handa
524c385383 jbd2: use a per-journal lock_class_key for jbd2_trans_commit_key
syzbot is reporting a possible deadlock due to sharing the lock_class_key
for jbd2_handle across ext4 and ocfs2. But this is a false positive, since
one disk partition can't have two filesystems at the same time.

Reported-by: syzbot+6e493c165d26d6fcbf72@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=6e493c165d26d6fcbf72
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot+6e493c165d26d6fcbf72@syzkaller.appspotmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <987110fc-5470-457a-a218-d286a09dd82f@I-love.SAKURA.ne.jp>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
2025-11-13 08:34:39 -05:00
Karina Yankevich
b97cb7d6a0 ext4: xattr: fix null pointer deref in ext4_raw_inode()
If ext4_get_inode_loc() fails (e.g. if it returns -EFSCORRUPTED),
iloc.bh will remain set to NULL. Since ext4_xattr_inode_dec_ref_all()
lacks error checking, this will lead to a null pointer dereference
in ext4_raw_inode(), called right after ext4_get_inode_loc().
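
A minimal sketch of the missing check (call site simplified):

    error = ext4_get_inode_loc(inode, &iloc);
    if (error)
        goto out;    /* iloc.bh is NULL here, don't dereference it */
    raw_inode = ext4_raw_inode(&iloc);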

Found by Linux Verification Center (linuxtesting.org) with SVACE.

Fixes: c8e008b604 ("ext4: ignore xattrs past end")
Cc: stable@kernel.org
Signed-off-by: Karina Yankevich <k.yankevich@omp.ru>
Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Reviewed-by: Baokun Li <libaokun1@huawei.com>
Message-ID: <20251022093253.3546296-1-k.yankevich@omp.ru>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-13 08:33:22 -05:00
Deepanshu Kartikey
892e1cf175 ext4: refresh inline data size before write operations
The cached ei->i_inline_size can become stale between the initial size
check and when ext4_update_inline_data()/ext4_create_inline_data() use
it. Although ext4_get_max_inline_size() reads the correct value at the
time of the check, concurrent xattr operations can modify i_inline_size
before ext4_write_lock_xattr() is acquired.

This causes ext4_update_inline_data() and ext4_create_inline_data() to
work with stale capacity values, leading to a BUG_ON() crash in
ext4_write_inline_data():

  kernel BUG at fs/ext4/inline.c:1331!
  BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);

The race window:
1. ext4_get_max_inline_size() reads i_inline_size = 60 (correct)
2. Size check passes for 50-byte write
3. [Another thread adds xattr, i_inline_size changes to 40]
4. ext4_write_lock_xattr() acquires lock
5. ext4_update_inline_data() uses stale i_inline_size = 60
6. Attempts to write 50 bytes but only 40 bytes actually available
7. BUG_ON() triggers

Fix this by recalculating i_inline_size via ext4_find_inline_data_nolock()
immediately after acquiring xattr_sem. This ensures ext4_update_inline_data()
and ext4_create_inline_data() work with current values that are protected
from concurrent modifications.
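
A minimal sketch of the fix at the affected call sites (simplified, error
handling mostly elided):

    ext4_write_lock_xattr(inode, &no_expand);
    /* i_inline_size may have changed while we were unlocked: refresh it */
    error = ext4_find_inline_data_nolock(inode);
    if (error)
        goto out;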

This is similar to commit a54c4613da ("ext4: fix race writing to an
inline_data file while its xattrs are changing") which fixed i_inline_off
staleness. This patch addresses the related i_inline_size staleness issue.

Reported-by: syzbot+f3185be57d7e8dda32b8@syzkaller.appspotmail.com
Link: https://syzkaller.appspot.com/bug?extid=f3185be57d7e8dda32b8
Cc: stable@kernel.org
Signed-off-by: Deepanshu Kartikey <kartikey406@gmail.com>
Message-ID: <20251020060936.474314-1-kartikey406@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-13 08:25:04 -05:00
Paolo Abeni
b63945b0c5 Merge tag 'linux-can-next-for-6.19-20251112-2' of git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next
Marc Kleine-Budde says:

====================
pull-request: can-next 2025-11-12

this is a pull request of 11 patches for net-next/main.

The first 3 patches are by Vadim Fedorenko and convert the CAN drivers
to use the ndo_hwtstamp callbacks.

Maud Spierings contributes a patch for the mcp251x driver that
converts it to use dev_err_probe().

The next 6 patches target the mcp251xfd driver and are by Gregor
Herburger and me. They add GPIO controller functionality to the
driver.

The final patch is by Chu Guangqing and fixes a typo in the bxcan
driver.

linux-can-next-for-6.19-20251112-2

* tag 'linux-can-next-for-6.19-20251112-2' of git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next:
  can: bxcan: Fix a typo error for assign
  dt-bindings: can: mcp251xfd: add gpio-controller property
  can: mcp251xfd: add gpio functionality
  can: mcp251xfd: only configure PIN1 when rx_int is set
  can: mcp251xfd: add workaround for errata 5
  can: mcp251xfd: utilize gather_write function for all non-CRC writes
  can: mcp251xfd: move chip sleep mode into runtime pm
  can: mcp251x: mcp251x_can_probe(): use dev_err_probe()
  can: peak_usb: convert to use ndo_hwtstamp callbacks
  can: peak_canfd: convert to use ndo_hwtstamp callbacks
  can: convert generic HW timestamp ioctl to ndo_hwtstamp callbacks
====================

Link: https://patch.msgid.link/20251112184344.189863-1-mkl@pengutronix.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-13 13:07:48 +01:00
Jakub Kicinski
68fa5b092e Merge branch 'net-stmmac-convert-glue-drivers-to-use-stmmac_get_phy_intf_sel'
Russell King says:

====================
net: stmmac: convert glue drivers to use stmmac_get_phy_intf_sel()

This series converts the remaining glue drivers that support
multi-interface to use stmmac_get_phy_intf_sel(). The reason these
drivers are not converted to the set_phy_intf_sel() method is
because it is unclear whether there are ordering dependencies that
would prevent it.

For example, reading the stm32mp2 documentation, it is required to
set the ETH1_SEL field while the dwmac core is in reset and before
clocks are enabled. This requirement can not be satsified at the
moment (but could with further changes.)
====================

Link: https://patch.msgid.link/aRLvrfx6tOa-RhrY@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:45 -08:00
Russell King (Oracle)
ccb4ff9f24 net: stmmac: visconti: use stmmac_get_phy_intf_sel()
Use stmmac_get_phy_intf_sel() to decode the PHY interface mode to the
phy_intf_sel value, validate the result and use that to set the
control register to select the operating mode for the DWMAC core.

Note that this will allow GMII as well as MII as the phy_intf_sel
value is the same for both.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vIjUe-0000000DquB-3JDY@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:43 -08:00
Russell King (Oracle)
bb68e0183e net: stmmac: visconti: use PHY_INTF_SEL_x to select PHY interface
Convert dwmac-visconti to use the PHY_INTF_SEL_x definitions. The
original definitions used constant 0, BIT(0) (==1) and BIT(2) (==4)
to define these, but the values of the bits corresponds with the
PHY_INTF_SEL_x values, so it is highly likely that these are not
individual bits, but the PHY_INTF_SEL_x bitfield.

This removes this incorrect use of BIT().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vIjUZ-0000000Dqu5-2sDI@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:42 -08:00
Russell King (Oracle)
45c5e24a53 net: stmmac: stm32: use stmmac_get_phy_intf_sel()
Use stmmac_get_phy_intf_sel() to decode the PHY interface mode to the
phy_intf_sel value. As both configure functions would end up with the
same code, call this from stm32mp1_set_mode(), validate the result and
pass the resulting value into the stm32 configure function. Use this
value to set the operating mode for the DWMAC core.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vIjUU-0000000Dqtz-2PwT@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:42 -08:00
Russell King (Oracle)
07669cf12e net: stmmac: stm32: use PHY_INTF_SEL_x directly
Rather than defining separate constants for each, use the
PHY_INTF_SEL_x definitions in the switch()es configuring the
control register, and use one FIELD_PREP() to convert phy_intf_sel
to the register value.
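
I.e. something of the form (register and mask names illustrative):

    regmap_update_bits(syscfg, reg, SYSCFG_ETH_SEL_MASK,
                       FIELD_PREP(SYSCFG_ETH_SEL_MASK, phy_intf_sel));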

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vIjUP-0000000Dqtt-1bYn@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:42 -08:00
Russell King (Oracle)
73130c298f net: stmmac: stm32: use PHY_INTF_SEL_x to select PHY interface
Convert dwmac-stm32 to use the PHY_INTF_SEL_x definitions.

For stm32mp1, the original definitions used constant 0 (GMII, 0 << 21),
BIT(21) (RGMII, 1 << 21) and BIT(23) (RMII, 4 << 21) to define these,
but from the values it can be clearly seen that these are the
PHY_INTF_SEL_x inputs to the dwmac.

For stm32mp2, the original definitions cover a bitfield 6:4 in the
SYSCFG Ethernet1 control register (according to documentation) and use
the PHY_INTF_SEL_x values.

Use the common dwmac definitions for the PHY interface selection field
by adding the bitfield mask, and using FIELD_PREP() for the bitfield
values.

This removes this incorrect use of BIT().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vIjUK-0000000Dqtn-1AyK@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:42 -08:00
Russell King (Oracle)
d22045997b net: stmmac: starfive: use stmmac_get_phy_intf_sel()
Use stmmac_get_phy_intf_sel() to decode the PHY interface mode to the
phy_intf_sel value, validate the result and use that to set the
control register to select the operating mode for the DWMAC core.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Emil Renner Berthing <emil.renner.berthing@canonical.com>
Link: https://patch.msgid.link/E1vIjUF-0000000Dqth-0gwD@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:42 -08:00
Russell King (Oracle)
f0917b4753 net: stmmac: starfive: use PHY_INTF_SEL_x to select PHY interface
Use the common dwmac definitions for the PHY interface selection field.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Emil Renner Berthing <emil.renner.berthing@canonical.com>
Link: https://patch.msgid.link/E1vIjUA-0000000Dqtb-0AfP@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:42 -08:00
Russell King (Oracle)
f06620091f net: stmmac: mediatek: simplify set_interface() methods
Use the phy_intf_sel field value when deciding what other options to
apply for the configuration register.

Note that this will allow GMII as well as MII as the phy_intf_sel
value is the same for both.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vIjU4-0000000DqtV-3qsX@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:41 -08:00
Russell King (Oracle)
d9c7964fd9 net: stmmac: mediatek: use stmmac_get_phy_intf_sel()
Use stmmac_get_phy_intf_sel() to decode the PHY interface mode to the
phy_intf_sel value, validate the result, and pass that into the
implementation specific ->dwmac_set_phy_interface() method. Use this
to configure the PHY interface selection field.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vIjTz-0000000DqtP-3N9v@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:41 -08:00
Russell King (Oracle)
c3308d380e net: stmmac: mediatek: use PHY_INTF_SEL_x
Use PHY_INTF_SEL_x definitions for the fields that correspond to the
phy_intf_sel inputs to the dwmac core.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vIjTu-0000000DqtI-2sUB@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:41 -08:00
Russell King (Oracle)
83eb6c7e18 net: stmmac: loongson1: use stmmac_get_phy_intf_sel()
Use stmmac_get_phy_intf_sel() to decode the PHY interface mode to the
phy_intf_sel value, validate the result and use that to set the
control register to select the operating mode for the DWMAC core.

Note that this will allow GMII as well as MII as the phy_intf_sel
value is the same for both.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vIjTp-0000000DqtC-2DmI@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:41 -08:00
Russell King (Oracle)
031f7a05d4 net: stmmac: loongson1: use PHY_INTF_SEL_x directly
Use the PHY_INTF_SEL_xx values directly in ls1c_dwmac_syscon_init(),
converting them to the PHY_INTF_SELI bitfield when calling
regmap_update_bits().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vIjTk-0000000Dqt6-1gN9@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:41 -08:00
Russell King (Oracle)
5d88b24c1d net: stmmac: loongson1: use PHY_INTF_SEL_x
Use PHY_INTF_SEL_x definitions for phy_intf_sel bitfield.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:13:41 -08:00
Jakub Kicinski
9c577f0998 tools: ynltool: correct install in Makefile
Use the variable in case user has a custom install binary.

Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20251111155214.2760711-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:07:17 -08:00
Dimitri Daskalakis
f766f8cdde selftests: drv-net: Limit the max number of queues in procfs_downup_hammer
For NICs with a large (1024+) number of queues, this test can cause
excessive memory fragmentation. This results in OOM errors, and in the
worst case driver/kernel crashes. We don't need to test with the max number
of queues, just enough to create a high likelihood of races between
reconfiguration and stats getting read.

Signed-off-by: Dimitri Daskalakis <dimitri.daskalakis1@gmail.com>
Link: https://patch.msgid.link/20251111225319.3019542-1-dimitri.daskalakis1@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 18:04:18 -08:00
Dan Hamik
ef42cf705c wifi: rtw89: rtw8852bu: Added dev id for ASUS AX57 NANO USB Wifi dongle
Add the USB device ID 0x0b05:0x1cb6 to the rtw8852bu driver to support the
ASUS AX57 Nano WiFi 6 USB adapter.This device uses the same Realtek
RTL8852BU chipset as other supported models.

Tested on: Linux Mint 22 with kernel 6.8.0-87-generic.
The adapter initializes successfully and connects to networks.

Signed-off-by: Dan Hamik <dan@hamik.net>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/010f019a76e1a84b-0a6f5e9f-2a43-4a9d-9c30-de4ae6363011-000000@us-east-2.amazonses.com
2025-11-13 09:49:03 +08:00
Ping-Ke Shih
7465c0a912 wifi: rtw89: configure RX antenna if chips can support
If chip->ops->cfg_txrx_path is implemented, a chip can support configuring
the RX antenna, so accept the setting via the iw tool.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251111022452.28093-9-pkshih@realtek.com
2025-11-13 09:40:06 +08:00
Ping-Ke Shih
d9204e99b6 wifi: rtw89: do RF calibration once setting channel when running pure monitor mode
To be able to capture and inject packets in monitor mode, do RF calibration
once a certain channel is set. Since calibration costs time, do not change
the behavior of normal usage, which does calibration only when starting as
AP or when going to connect to an AP.

Since the driver declares IEEE80211_HW_WANT_MONITOR_VIF, there can be only
one pure monitor vif when adding an interface of type NL80211_IFTYPE_MONITOR.
Otherwise, the monitor vif must be NULL.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251111022452.28093-8-pkshih@realtek.com
2025-11-13 09:38:29 +08:00
Ping-Ke Shih
e96abe4fd7 wifi: rtw89: consider data rate/bandwidth/GI for injected packets
To send injected packets with configurable rate/bandwidth/GI, fill TXWD
fields according to SKB's info->control.rates[0] annotated by mac80211.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251111022452.28093-7-pkshih@realtek.com
2025-11-13 09:36:54 +08:00
Kuan-Chung Chen
1dd7e743dd wifi: rtw89: phy: fix out-of-bounds access in rtw89_phy_read_txpwr_limit()
Coverity reported a potential out-of-bounds access when 'bw' exceeds the
valid range for the specified band. Add a helper `rtw89_bw_is_valid()`
to check bandwidth validity for each band before accessing limit tables.

Addresses-Coverity-ID: 1598844 ("Out-of-bounds access")
Addresses-Coverity-ID: 1598896 ("Out-of-bounds access")

Signed-off-by: Kuan-Chung Chen <damon.chen@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251111022452.28093-6-pkshih@realtek.com
2025-11-13 09:36:40 +08:00
Ping-Ke Shih
b37fb77d45 wifi: rtw89: 8852c: add compensation of thermal value from efuse calibration
The 3rd bit of the thermal value programmed in the efuse means 8 (2 ^ 3)
grams. Check this bit and the sign bit, and add compensation to the final
thermal value.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251111022452.28093-5-pkshih@realtek.com
2025-11-13 09:36:29 +08:00
Ping-Ke Shih
7bf433c676 wifi: rtw89: debug: add parser to diagnose along DIAG_MAC fw element
The rules to diagnose MAC have a common header, and a cmd field is used
to know the exact command and its format. The rules with the same tuple of
fields {sheet, seq} can be seen as a set of compound rules, which is
treated as a positive rule if just one of the rules is positive.

Take EQUALV rules as an example: if the value read via {addr, mask} is
equal to the predefined value in field val, a rule is positive. The fields
addr_name_offset and msg_offset are offsets of textual messages for human
readability.

Format of common rule header (8 bytes)
 +-------+-----+--------+-----------+-----+---------+
 | sheet | cmd | seq[2] | io / band | len | rsvd[2] |
 +-------+-----+--------+-----------+-----+---------+

Format of rule command is EQUALV (equal value) (24 bytes):
 +------+------------------+------+-----+------------+---------+
 | addr | addr_name_offset | mask | val | msg_offset | rsvd[4] |
 +------+------------------+------+-----+------------+---------+

Format of message:
 +-----+----------+
 | len | string[] |
 +-----+----------+

An example of output:
 Plain(Ignore)/Rules/Positive: 115(4)/86/81

 Where, Plain is the total number of rules written in the firmware element.
        Ignore is the number of ignored rules, e.g. USB IO rules when the
          current bus is PCIE.
        Rules is the number of sets of compound rules.
        Positive is the number of positive Rules.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251111022452.28093-4-pkshih@realtek.com
2025-11-13 09:36:17 +08:00
Ping-Ke Shih
de19cc7def wifi: rtw89: fw: parse firmware element of DIAG_MAC
The firmware element ID 28 is a set of rules to diagnose whether the MAC
has become abnormal. A later patch will use these rules via debugfs to
query the status.

The element contains rules with their textual messages shown as below:

   +------------------------------------+
   |                                    |
   |                +-----------+       |
   |                | rule_size |-------|----------+
   +----------------+-----------+-------+ --       |
   |             rule[0]                |   \      |
   |             rule[1]                |   |  <---+
   |                :                   |   /
   +------------------------------------+ --
   | msg[0]      msg[1]                 |  each msg has variable length
   |        msg[2]    msg[3] ...        |  (with address align 2)
   | ...                                |
   +------------------------------------+

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251111022452.28093-3-pkshih@realtek.com
2025-11-13 09:34:39 +08:00
Ping-Ke Shih
dae8d7d63b wifi: rtw89: pci: add to read PCI configuration space from common code
Normally the PCI device is only accessed in pci.c. However, for debug
purposes, a set of registers predefined in a firmware element, including
PCI configuration space, should be read for diagnosis.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251111022452.28093-2-pkshih@realtek.com
2025-11-13 09:30:03 +08:00
Bart Van Assche
f233339188 blk-zoned: Move code from disk_zone_wplug_add_bio() into its caller
Move the following code into the only caller of disk_zone_wplug_add_bio():
 - The code for clearing the REQ_NOWAIT flag.
 - The code that sets the BLK_ZONE_WPLUG_PLUGGED flag.
 - The disk_zone_wplug_schedule_bio_work() call.

This patch moves all code that is related to REQ_NOWAIT or to bio
scheduling into a single function. Additionally, the 'schedule_bio_work'
variable is removed. No functionality has been changed.

Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Damien Le Moal <dlmoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-12 14:05:23 -07:00
Bart Van Assche
faa3be1a61 blk-zoned: Document disk_zone_wplug_schedule_bio_work() locking
Document that all callers hold this lock because the code in
disk_zone_wplug_schedule_bio_work() depends on this.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-12 14:05:23 -07:00
Bart Van Assche
fd0ae4754c blk-zoned: Fix a typo in a source code comment
Remove a superfluous parenthesis that was introduced by commit fa8555630b
("blk-zoned: Improve the queue reference count strategy documentation").

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-12 14:05:23 -07:00
Kumar Kartikeya Dwivedi
3249e8a17e bpf: Adjust return value for queue destruction in rqspinlock
Return -ETIMEDOUT whenever non-head waiters are signalled by the head, and fix an
oversight in commit 7bd6e5ce5b ("rqspinlock: Disable queue destruction for
deadlocks"). We no longer signal on deadlocks.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Reviewed-by: Amery Hung <ameryhung@gmail.com>
Link: https://lore.kernel.org/r/20251111013827.1853484-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-12 11:17:39 -08:00
Chu Guangqing
b305fbdad4 can: bxcan: Fix a typo error for assign
Fix the spelling error of "assign".

Signed-off-by: Chu Guangqing <chuguangqing@inspur.com>
Reviewed-by: Dario Binacchi <dario.binacchi@amarulasolutions.com>
Link: https://patch.msgid.link/20251103074009.4708-1-chuguangqing@inspur.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-12 19:30:59 +01:00
Marc Kleine-Budde
2d0938702c Merge patch series "can: mcp251xfd: add gpio functionality"
Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com> says:

The mcp251xfd allows two pins to be configured as GPIOs. This series adds
support for this feature.

The GPIO functionality is controlled with the IOCON register which has an
erratum.

Patch 1 is from https://lore.kernel.org/linux-can/20240429-mcp251xfd-runtime_pm-v1-3-c26a93a66544@pengutronix.de/
Patch 2 refactors the no-CRC functions to prepare the workaround for non-CRC writes
Patch 3 is the fix/workaround for the aforementioned erratum
Patch 4 only configures PIN1 for rx-int
Patch 5 adds the GPIO support
Patch 6 updates the dt-binding

https://lore.kernel.org/all/20240806-industrious-augmented-crane-44239a-mkl@pengutronix.de/

Link: https://patch.msgid.link/20251001091006.4003841-1-viken.dadhaniya@oss.qualcomm.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-12 19:30:55 +01:00
Gregor Herburger
6ece6b4c37 dt-bindings: can: mcp251xfd: add gpio-controller property
The mcp251xfd has two pins that can be used as GPIOs. Add the
gpio-controller property to the binding description.

Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Gregor Herburger <gregor.herburger@ew.tq-group.com>
Signed-off-by: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Link: https://patch.msgid.link/20251001091006.4003841-7-viken.dadhaniya@oss.qualcomm.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-12 19:30:33 +01:00
Gregor Herburger
c6106336ec can: mcp251xfd: add gpio functionality
The mcp251xfd devices allow two pins to be configured as GPIOs. Add this
functionality to the driver.

Acked-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Gregor Herburger <gregor.herburger@ew.tq-group.com>
Tested-by: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
Signed-off-by: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Link: https://patch.msgid.link/20251001091006.4003841-6-viken.dadhaniya@oss.qualcomm.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-12 19:30:33 +01:00
Gregor Herburger
d35fa005f5 can: mcp251xfd: only configure PIN1 when rx_int is set
When rx_int is used, the mcp251xfd_chip_rx_int_enable() and
mcp251xfd_chip_rx_int_disable() functions configure both PIN0 and PIN1.
To prepare for the GPIO support, only configure PIN1 with
regmap_update_bits().

This way PIN0 can be used as a GPIO while PIN1 is used as the rx_int
interrupt.

Signed-off-by: Gregor Herburger <gregor.herburger@ew.tq-group.com>
Tested-by: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
Signed-off-by: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Link: https://patch.msgid.link/20251001091006.4003841-5-viken.dadhaniya@oss.qualcomm.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-12 19:30:32 +01:00
Gregor Herburger
c902835fc6 can: mcp251xfd: add workaround for errata 5
According to Erratum DS80000789E 5, writing the IOCON register using one
SPI write command clears LAT0/LAT1.

The Errata Fix/Work Around suggests writing registers with single byte
write instructions. However, it seems that every write to the second byte
causes LAT0/LAT1 to be overwritten.

Never write byte 2 of the IOCON register to avoid clearing LAT0/LAT1.

Signed-off-by: Gregor Herburger <gregor.herburger@ew.tq-group.com>
Tested-by: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
Signed-off-by: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Link: https://patch.msgid.link/20251001091006.4003841-4-viken.dadhaniya@oss.qualcomm.com
[mkl: add missing MCP251XFD_REG_IOCON_GPIO_MASK]
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-12 19:30:07 +01:00
Jakub Kicinski
e949824730 Merge tag 'wireless-next-2025-11-12' of https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next
Johannes Berg says:

====================
More -next material, notably:
 - split ieee80211.h file, it's way too big
 - mac80211: initial chanctx work towards NAN
 - mac80211: MU-MIMO sniffer improvements
 - ath12k: statistics improvements

* tag 'wireless-next-2025-11-12' of https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next: (26 commits)
  wifi: cw1200: Fix potential memory leak in cw1200_bh_rx_helper()
  wifi: mac80211: make monitor link info check more specific
  wifi: mac80211: track MU-MIMO configuration on disabled interfaces
  wifi: cfg80211/mac80211: Add fallback mechanism for INDOOR_SP connection
  wifi: cfg80211/mac80211: clean up duplicate ap_power handling
  wifi: cfg80211: use a C99 initializer in wiphy_register
  wifi: cfg80211: fix doc of struct key_params
  wifi: mac80211: remove unnecessary vlan NULL check
  wifi: mac80211: pass frame type to element parsing
  wifi: mac80211: remove "disabling VHT" message
  wifi: mac80211: add and use chanctx usage iteration
  wifi: mac80211: simplify ieee80211_recalc_chanctx_min_def() API
  wifi: mac80211: remove chanctx to link back-references
  wifi: mac80211: make link iteration safe for 'break'
  wifi: mac80211: fix EHT typo
  wifi: cfg80211: fix EHT typo
  wifi: ieee80211: split NAN definitions out
  wifi: ieee80211: split P2P definitions out
  wifi: ieee80211: split S1G definitions out
  wifi: ieee80211: split EHT definitions out
  ...
====================

Link: https://patch.msgid.link/20251112115126.16223-4-johannes@sipsolutions.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 09:33:24 -08:00
Russell King (Oracle)
7e975caa0f net: stmmac: improve ndev->max_mtu setup readability
Improve the readability of the code setting ndev->max_mtu. This depends
on the hardware specific maximum defined by the MAC core, and also on a
platform provided maximum.

The code originally checked that the platform specific maximum was
between ndev->min_mtu and the MAC core maximum before reducing
ndev->max_mtu, and issued a warning if the platform specific maximum was
less than ndev->min_mtu.

Re-order the code to handle the case where the platform specific max is
below ndev->min_mtu, which then means that the subsequent test is
simply reducing ndev->max_mtu.

Update the comment, and add a few blank lines to separate the blocks of
code.
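
A rough sketch of the re-ordered checks (hw_max_mtu below is only a
placeholder for the MAC-core specific limit; plat->maxmtu carries the
platform provided maximum):

  ndev->max_mtu = hw_max_mtu;

  if (priv->plat->maxmtu < ndev->min_mtu)
          dev_warn(priv->device,
                   "%s: warning: maxmtu having invalid value (%d)\n",
                   __func__, priv->plat->maxmtu);
  else if (priv->plat->maxmtu < ndev->max_mtu)
          ndev->max_mtu = priv->plat->maxmtu;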

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vImWA-0000000DrIl-1HZY@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 09:17:20 -08:00
javen
1479493c91 r8169: add support for RTL8125K
This adds support for chip RTL8125K. Its XID is 0x68a. It is basically
based on the one with XID 0x688, but with a different firmware file.

Signed-off-by: javen <javen_xu@realsil.com.cn>
Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/20251111092851.3371-1-javen_xu@realsil.com.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 09:14:10 -08:00
Keith Busch
3749ea4dee null_blk: fix zone read length beyond write pointer
Fix up the divisor calculating the number of zone sectors being read and
handle a read that straddles the zone write pointer. The length is
rounded up to a sector boundary, so be sure to truncate any excess bytes
to avoid copying past the data segment.

Fixes: 3451cf34f5 ("null_blk: allow byte aligned memory offsets")
Signed-off-by: Keith Busch <kbusch@kernel.org>
Tested-by: Bart van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-12 10:02:56 -07:00
Eric Dumazet
26b8986a18 net: clear skb->sk in skb_release_head_state()
skb_release_head_state() inlines skb_orphan().

We need to clear skb->sk otherwise we can freeze TCP flows
on a mostly idle host, because skb_fclone_busy() would
return true as long as the packet is not yet processed by
skb_defer_free_flush().
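
A simplified sketch of the idea (not the exact diff):

  /* In skb_release_head_state(): run the destructor if present, then
   * always drop the socket back-pointer so skb_fclone_busy() stops
   * treating the clone as in flight. */
  if (skb->destructor) {
          skb->destructor(skb);
          skb->destructor = NULL;
  }
  skb->sk = NULL;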

Fixes: 1fcf572211 ("net: allow skb_release_head_state() to be called multiple times")
Fixes: e20dfbad8a ("net: fix napi_consume_skb() with alien skbs")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Aditya Garg <gargaditya@linux.microsoft.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251111151235.1903659-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:48:33 -08:00
Jakub Kicinski
4001bda0cc Merge branch 'selftests-vsock-refactor-and-improve-vmtest-infrastructure'
Bobby Eshleman says:

====================
selftests/vsock: refactor and improve vmtest infrastructure

This patch series refactors the vsock selftest VM infrastructure to
improve test run times, improve logging, and prepare for future tests
which make heavy usage of these refactored functions and have new
requirements such as simultaneous QEMU processes.
====================

Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-0-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:42 -08:00
Bobby Eshleman
99f932c905 selftests/vsock: disable shellcheck SC2317 and SC2119
Disable shellcheck rules SC2317 and SC2119. These rules are being
triggered due to false positives. For SC2317, many `return
"${KSFT_PASS}"` lines are reported as unreachable, even though they are
executed during normal runs. For SC2119, the fact that
log_guest/log_host accept either stdin or arguments triggers the warning,
even though this usage is valid.

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-12-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:40 -08:00
Bobby Eshleman
338c5ddf4c selftests/vsock: add vsock_loopback module loading
Add vsock_loopback module loading to the loopback test so that vmtest.sh
can be used for kernels built with loopback as a module.

This is not technically a fix as kselftest expects loopback to be
built-in already (defined in selftests/vsock/config). This is useful
only for using vmtest.sh outside of kselftest.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-11-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:39 -08:00
Bobby Eshleman
67422ef38f selftests/vsock: add 1.37 to tested virtme-ng versions
Testing with 1.37 shows all tests passing but emits the warning:

warning: vng version 'virtme-ng 1.37' has not been tested and may not function properly.
	The following versions have been tested: 1.33 1.36

This patch adds 1.37 to the virtme-ng versions to get rid of the above
warning.

Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-10-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:39 -08:00
Bobby Eshleman
592e3d14ce selftests/vsock: add BUILD=0 definition
Add the definition for BUILD and initialize it to zero. This avoids
`bash -u vmtest.sh` throwing an 'unbound variable' error when BUILD is
not set to 1 and is later checked for its value.

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-9-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:39 -08:00
Bobby Eshleman
d13fb04a4b selftests/vsock: identify and execute tests that can re-use VM
In preparation for future patches that introduce tests that cannot
re-use the same VM, add functions to identify those that *can* re-use a
VM.

By continuing to re-use the same VM for these tests we can save time by
avoiding the delay of booting a VM for every test.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-8-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:39 -08:00
Bobby Eshleman
7fea50dff9 selftests/vsock: add check_result() for pass/fail counting
Add check_result() function to reuse logic for incrementing the
pass/fail counters. This function will get used by different callers as
we add different types of tests in future patches (namely, namespace and
non-namespace tests will be called at different places, and re-use this
function).

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-7-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:39 -08:00
Bobby Eshleman
9e2ad0bc36 selftests/vsock: speed up tests by reducing the QEMU pidfile timeout
Reduce the time waiting for the QEMU pidfile from three minutes to five
seconds. The three minute time window was chosen to make sure QEMU had
enough time to fully boot up. This, however, is an unreasonably long
delay for QEMU to write the pidfile, which happens earlier when the QEMU
process starts (not after VM boot). The three minute delay becomes
noticeably wasteful in future tests that expect QEMU to fail and wait a
full three minutes for a pidfile that will never exist.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-6-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:38 -08:00
Bobby Eshleman
c7df4adc06 selftests/vsock: do not unconditionally die if qemu fails
If QEMU fails to boot, then set the returncode (via timeout) instead of
unconditionally dying. This is in preparation for tests that expect QEMU
to fail to boot. In that case, we just want to know if the boot failed
or not so we can test the pass/fail criteria, and continue executing the
next test.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-5-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:38 -08:00
Bobby Eshleman
ac8997e943 selftests/vsock: avoid multi-VM pidfile collisions with QEMU
Change QEMU to use generated pidfile names instead of just a single
globally-defined pidfile. This allows multiple QEMU instances to
co-exist with different pidfiles. This is required for future tests that
use multiple VMs to check for CID collisions.

Additionally, this also places the burden of killing the QEMU process
and cleaning up the pidfile on the caller of vm_start(). To help with
this, a function terminate_pidfiles() is introduced that callers use to
perform the cleanup. The terminate_pidfiles() function supports multiple
pidfile removals because future patches will need to process two
pidfiles at a time.

Change QEMU_OPTS to be initialized inside the vm_start(). This allows
the generated pidfile to be passed to the string assignment, and
prepares for future vm-specific options as well (e.g., cid).

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-4-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:38 -08:00
Bobby Eshleman
4f76ff14d3 selftests/vsock: reuse logic for vsock_test through wrapper functions
Add wrapper functions vm_vsock_test() and host_vsock_test() to invoke
the vsock_test binary. This encapsulates several items of repeated logic,
such as waiting for the server to reach listening state and
enabling/disabling the bash option pipefail to avoid pipe-style logging
from hiding failures.

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-3-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:38 -08:00
Bobby Eshleman
2ed3ce7efb selftests/vsock: make wait_for_listener() work even if pipefail is on
Rewrite wait_for_listener()'s pattern matching to avoid tripping the
if-condition when pipefail is on.

awk doesn't handle SIGPIPE gracefully and exits with a non-zero code, so grep
exiting upon finding a match causes false positives when the pipefail
option is used (grep exits, SIGPIPE is raised, and awk complains with a
non-zero exit code). Instead, move all of the pattern matching into awk
so that SIGPIPE cannot happen and the correct exit code is returned.

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-2-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:38 -08:00
Bobby Eshleman
d9cac93cd1 selftests/vsock: improve logging in vmtest.sh
Improve usability of logging functions. Remove the test name prefix from
logging functions so that logging calls can be made deeper into the call
stack without passing down the test name or setting some global. Teach
the log function to accept a LOG_PREFIX variable to avoid unnecessary
argument shifting.

Remove log_setup() and instead use log_host(). The host/guest prefixes
are useful to show whether a failure happened on the guest or host side,
but "setup" doesn't really give additional useful information. Since all
log_setup() calls happen on the host, let's just use log_host() instead.

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20251108-vsock-selftests-fixes-and-improvements-v4-1-d5e8d6c87289@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-12 06:19:37 -08:00
Hans Holmberg
bf3b8e9152 xfs: remove xarray mark for reclaimable zones
We can easily check if there are any reclaimable zones by just looking
at the used counters in the reclaim buckets, so do that to free up the
xarray mark we currently use for this purpose.

Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-12 11:10:50 +01:00
Christoph Hellwig
6731f85d38 xfs: remove the xlog_in_core_t typedef
Switch the few remaining users to use the underlying struct directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-12 11:09:25 +01:00
Christoph Hellwig
bc2dd9f2ba xfs: remove l_iclog_heads
l_iclog_heads is only used in one place and can be trivially derived
from l_iclog_hsize by a single shift operation.  Remove it, and switch
the initialization of l_iclog_hsize to use struct_size so that it is
directly derived from the on-disk format definition.
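
The derivation amounts to a single shift (sketch, using the standard
512-byte basic block size):

  /* Number of 512-byte header blocks covered by the iclog header. */
  unsigned int iclog_heads = log->l_iclog_hsize >> BBSHIFT;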

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-12 11:09:25 +01:00
Christoph Hellwig
ef1e275638 xfs: remove the xlog_rec_header_t typedef
There are almost no users of the typedef left, kill it and switch the
remaining users to use the underlying struct.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-12 11:09:25 +01:00
Christoph Hellwig
fe985b910e xfs: remove xlog_in_core_2_t
xlog_in_core_2_t is a really odd type: not only is it grossly
misnamed because it actually is an on-disk structure, but it also
represents the actual on-disk structure in a rather odd way.

A v1 or small v2 log header looks like:

	+-----------------------+
	|      xlog_record      |
	+-----------------------+

while larger v2 log headers look like:

	+-----------------------+
	|      xlog_record      |
	+-----------------------+
	|  xlog_rec_ext_header  |
	+-------------------+---+
	|         .....         |
	+-----------------------+
	|  xlog_rec_ext_header  |
	+-----------------------+

I.e., the ext headers are a variable sized array at the end of the
header.  So instead of declaring a union of xlog_rec_header,
xlog_rec_ext_header and padding to BBSIZE, add the proper padding to
struct xlog_rec_header and struct xlog_rec_ext_header, and
add a variable sized array of the latter to the former.  This also
exposes the somewhat unusual scope of the log checksums, which is
made explicit now by adding proper padding and a macro designating
the actual payload length.
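
A simplified sketch of the reshaped definitions (the flexible-array member
name is illustrative; the real structures carry explicit padding to BBSIZE):

  struct xlog_rec_ext_header {
          __be32  xh_cycle;
          __be32  xh_cycle_data[XLOG_CYCLE_DATA_SIZE];
          /* padded to BBSIZE */
  };

  struct xlog_rec_header {
          /* ... fixed header fields, padded to BBSIZE ... */
          struct xlog_rec_ext_header h_ext[];  /* large v2 logs only */
  };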

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-12 11:09:25 +01:00
Christoph Hellwig
9ed9df98fc xfs: remove a very outdated comment from xlog_alloc_log
The xlog_iclog definition has been pretty standard for a while, so drop
this now rather misleading comment.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-12 11:09:25 +01:00
Christoph Hellwig
16c18021e1 xfs: cleanup xlog_alloc_log a bit
Remove the separate head variable, move the ic_datap initialization
up a bit where the context is more obvious and remove the duplicate
memset right after a zeroing memory allocation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-12 11:09:25 +01:00
Christoph Hellwig
be665a4e27 xfs: don't use xlog_in_core_2_t in struct xlog_in_core
Most accesses to the on-disk log record header are for the original
xlog_rec_header.  Make that the main structure, and cast in the
single remaining place that uses the other union legs.

This prepares for removing xlog_in_core_2_t entirely.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-12 11:09:25 +01:00
Christoph Hellwig
899b7ee44b xfs: add a on-disk log header cycle array accessor
Accessing the cycle arrays in the original log record header vs the
extended header is messy and duplicated in multiple places.

Add an xlog_cycle_data helper to abstract it out.
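
One plausible shape for such a helper, assuming the flexible-array layout
sketched earlier (field names illustrative):

  static inline __be32 *
  xlog_cycle_data(struct xlog_rec_header *rhead, unsigned int i)
  {
          unsigned int hdr = i / XLOG_CYCLE_DATA_SIZE;
          unsigned int off = i % XLOG_CYCLE_DATA_SIZE;

          if (!hdr)
                  return &rhead->h_cycle_data[off];
          return &rhead->h_ext[hdr - 1].xh_cycle_data[off];
  }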

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-12 11:09:25 +01:00
Christoph Hellwig
74d975ed6c xfs: add a XLOG_CYCLE_DATA_SIZE constant
The XLOG_HEADER_CYCLE_SIZE / BBSIZE expression is used a lot
in the log code, give it a symbolic name.
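
In other words, roughly:

  #define XLOG_CYCLE_DATA_SIZE    (XLOG_HEADER_CYCLE_SIZE / BBSIZE)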

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-12 11:09:25 +01:00
Johannes Berg
0eb272033b Merge tag 'ath-next-20251111' of git://git.kernel.org/pub/scm/linux/kernel/git/ath/ath
Jeff Johnson says:
==================
ath.git patches for v6.19 (#2)

Just one 2-patch series for this PR.

Once pulled into wireless-next, ath-next will fast-forward, and that
will provide the baseline for merging ath12k-ng into ath-next.
==================

Link: https://patch.msgid.link/15a98cae-0274-45f4-9b8e-be6fa9720884@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-12 09:56:28 +01:00
Gregor Herburger
f5982a679a can: mcp251xfd: utilize gather_write function for all non-CRC writes
This is a preparation patch to add the errata workaround for non-CRC writes.

Currently, non-CRC writes to the chip can go through the
.gather_write, .write or the reg_update_bits callback.

To allow the addition of the errata fix at a single location, use
mcp251xfd_regmap_nocrc_gather_write for all non-CRC write instructions,
similar to the CRC regmap.

Signed-off-by: Gregor Herburger <gregor.herburger@ew.tq-group.com>
Tested-by: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
Signed-off-by: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Link: https://patch.msgid.link/20251001091006.4003841-3-viken.dadhaniya@oss.qualcomm.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-12 09:36:59 +01:00
Marc Kleine-Budde
71df9227ba can: mcp251xfd: move chip sleep mode into runtime pm
This is a preparation patch to add GPIO support.

Up to now, the Vdd regulator and the clocks have been managed by
Runtime-PM (on systems without CONFIG_PM these remain permanently
switched on).

During the mcp251xfd_open() callback the mcp251xfd is powered,
soft-reset and configured. In mcp251xfd_stop() the chip is shut down
again. To support the on-chip GPIOs, the chip must be supplied with
power while GPIOs are being requested, even if the networking
interface is down.

To support this, move the functions mcp251xfd_chip_softreset() and
mcp251xfd_chip_clock_init() from mcp251xfd_chip_start() to
mcp251xfd_runtime_resume(). Instead of setting the controller to sleep
mode in mcp251xfd_chip_stop(), bring it into configuration mode. This
way it doesn't take part in bus activity and doesn't enter sleep mode.

Signed-off-by: Gregor Herburger <gregor.herburger@ew.tq-group.com>
Tested-by: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
Signed-off-by: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Link: https://patch.msgid.link/20251001091006.4003841-2-viken.dadhaniya@oss.qualcomm.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-11-12 09:36:59 +01:00
Tariq Toukan
5422318e27 net/mlx5: Expose definition for 1600Gbps link mode
This patch exposes a new link mode for 1600Gbps, utilizing 8 lanes at
200Gbps per lane.

Co-developed-by: Yael Chemla <ychemla@nvidia.com>
Reviewed-by: Shahar Shitrit <shshitrit@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1762863888-1092798-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
2025-11-12 03:35:14 -05:00
Bagas Sanjaya
939ba8c5b8 MAINTAINERS: Add entry for XFRM documentation
XFRM patches are supposed to be sent to maintainers under the "NETWORKING
[IPSEC]" heading, but it doesn't cover XFRM docs yet. Add the entry.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-11-12 08:30:03 +01:00
Bagas Sanjaya
03e23b18c7 net: Move XFRM documentation into its own subdirectory
XFRM docs currently reside in the Documentation/networking directory,
yet they are distinctive as a group of their own. Move them into the xfrm
subdirectory.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-11-12 08:30:03 +01:00
Bagas Sanjaya
7276e7ae56 Documentation: xfrm_sync: Number the fifth section
Number the fifth section ("Exception to threshold settings") to be
consistent with the rest of sections.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Suggested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-11-12 08:30:03 +01:00
Bagas Sanjaya
c08b786b82 Documentation: xfrm_sysctl: Trim trailing colon in section heading
The sole section heading ("/proc/sys/net/core/xfrm_* Variables") has a
trailing colon. Trim it.

Suggested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-11-12 08:29:36 +01:00
Bagas Sanjaya
01ad7831fb Documentation: xfrm_sync: Trim excess section heading characters
The first section "Message Structure" has an excess underline, while the
second and third ones ("TLVS reflect the different parameters" and
"Default configurations for the parameters") have trailing colons. Trim
them.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Suggested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-11-12 08:28:11 +01:00
Bagas Sanjaya
a397b259c1 Documentation: xfrm_sync: Properly reindent list text
List texts are currently aligned at the start of the column, rather than
after the list marker. Reindent them.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-11-12 08:28:10 +01:00
Bagas Sanjaya
840188d276 Documentation: xfrm_device: Separate hardware offload sublists
Sublists of the hardware offload type lists are rendered in a combined
paragraph due to the lack of a separator from their parent list. Add it.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-11-12 08:28:10 +01:00
Bagas Sanjaya
340e2a7386 Documentation: xfrm_device: Use numbered list for offloading steps
Format xfrm offloading steps as numbered list.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-11-12 08:28:09 +01:00
Bagas Sanjaya
68ec5df1d8 Documentation: xfrm_device: Wrap iproute2 snippets in literal code block
iproute2 snippets (ip x) are shown in long-running definition lists
instead. Format them as literal code blocks that do the semantic job
better.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-11-12 08:28:09 +01:00
Robert Marko
fc6aa0e470 net: sparx5/lan969x: populate netdev of_node
Populate of_node for the port netdevs, to make the individual ports
of_nodes available in sysfs.

Signed-off-by: Robert Marko <robert.marko@sartura.hr>
Link: https://patch.msgid.link/20251110124342.199216-1-robert.marko@sartura.hr
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-11 18:06:07 -08:00
Jakub Kicinski
458ea87a6b Merge branch 'net-stmmac-convert-meson8b-to-use-stmmac_get_phy_intf_sel'
Russell King says:

====================
net: stmmac: convert meson8b to use stmmac_get_phy_intf_sel()

This series splits out meson8b from the previous 16 patch series
as that now has r-b tags.

This series converts meson8b to use stmmac_get_phy_intf_sel(). This
driver is not converted to the set_phy_intf_sel() method as it is
unclear whether there are ordering dependencies that would prevent
it. I would appreciate the driver author looking into whether this
conversion is possible.
====================

Link: https://patch.msgid.link/aRH50uVDX4_9O5ZU@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-11 17:53:30 -08:00
Russell King (Oracle)
da3d150123 net: stmmac: meson8b: use stmmac_get_phy_intf_sel()
Use stmmac_get_phy_intf_sel() to decode the PHY interface mode to the
phy_intf_sel value, validate the result and use that to set the
control register to select the operating mode for the DWMAC core.

Reviewed-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vIT6b-0000000DpPX-1LQ0@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-11 17:53:18 -08:00
Russell King (Oracle)
52d639da6f net: stmmac: meson8b: use phy_intf_sel directly
Rearrange meson_axg_set_phy_mode() to use phy_intf_sel directly,
converting it to the register field for meson8b_dwmac_mask_bits().

Reviewed-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vIT6W-0000000DpPR-0tby@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-11 17:53:18 -08:00
Russell King (Oracle)
12f42597ab net: stmmac: meson8b: use PHY_INTF_SEL_x
Use PHY_INTF_SEL_x definitions for phy_intf_sel bitfield.

Reviewed-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vIT6R-0000000DpPL-0Nli@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-11 17:53:18 -08:00
Nathan Chancellor
34bff6f03c net: netcp: ethss: Fix type of first parameter in hwtstamp stubs
When building without CONFIG_TI_CPTS, there are a series of errors from
-Wincompatible-pointer-types:

  drivers/net/ethernet/ti/netcp_ethss.c:3831:27: error: initialization of 'int (*)(void *, struct kernel_hwtstamp_config *)' from incompatible pointer type 'int (*)(struct gbe_intf *, struct kernel_hwtstamp_config *)' [-Wincompatible-pointer-types]
   3831 |         .hwtstamp_get   = gbe_hwtstamp_get,
        |                           ^~~~~~~~~~~~~~~~
  drivers/net/ethernet/ti/netcp_ethss.c:3831:27: note: (near initialization for 'gbe_module.hwtstamp_get')
  drivers/net/ethernet/ti/netcp_ethss.c:2758:19: note: 'gbe_hwtstamp_get' declared here
   2758 | static inline int gbe_hwtstamp_get(struct gbe_intf *gbe_intf,
        |                   ^~~~~~~~~~~~~~~~
  drivers/net/ethernet/ti/netcp_ethss.c:3832:27: error: initialization of 'int (*)(void *, struct kernel_hwtstamp_config *, struct netlink_ext_ack *)' from incompatible pointer type 'int (*)(struct gbe_intf *, struct kernel_hwtstamp_config *, struct netlink_ext_ack *)' [-Wincompatible-pointer-types]
   3832 |         .hwtstamp_set   = gbe_hwtstamp_set,
        |                           ^~~~~~~~~~~~~~~~
  drivers/net/ethernet/ti/netcp_ethss.c:3832:27: note: (near initialization for 'gbe_module.hwtstamp_set')
  drivers/net/ethernet/ti/netcp_ethss.c:2764:19: note: 'gbe_hwtstamp_set' declared here
   2764 | static inline int gbe_hwtstamp_set(struct gbe_intf *gbe_intf,
        |                   ^~~~~~~~~~~~~~~~

In a recent conversion to ndo_hwtstamp, the type of the first parameter
was updated for the CONFIG_TI_CPTS=y implementations of
gbe_hwtstamp_get() and gbe_hwtstamp_set() but not the CONFIG_TI_CPTS=n
ones.

Update the type of the first parameter in the CONFIG_TI_CPTS=n stubs to
resolve the errors.

Fixes: 3f02b82725 ("ti: netcp: convert to ndo_hwtstamp callbacks")
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Link: https://patch.msgid.link/20251110-netcp_ethss-fix-cpts-stubs-clang-wifpts-v2-1-aa6204ec1f43@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-11 17:49:02 -08:00
Kriish Sharma
6d7e3870af blk-mq-dma: fix kernel-doc function name for integrity DMA iterator
Documentation build reported:

  Warning: block/blk-mq-dma.c:373 expecting prototype for blk_rq_integrity_dma_map_iter_start(). Prototype was for blk_rq_integrity_dma_map_iter_next() instead

The kernel-doc comment above `blk_rq_integrity_dma_map_iter_next()` used
the wrong function name (`blk_rq_integrity_dma_map_iter_start`) in its
header. This patch corrects the function name in the kernel-doc block to
match the actual implementation, ensuring clean documentation builds.

Fixes: fec9b16dc5 ("blk-mq-dma: add scatter-less integrity data DMA mapping")
Signed-off-by: Kriish Sharma <kriish.sharma2006@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 08:37:33 -07:00
Keith Busch
fd9ecd0052 block: fix merging data-less bios
The data segment gaps the block layer tracks don't apply to bios that
don't have data. Skip this calculation to fix a NULL pointer access.
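
In essence (a sketch of the idea, not the exact patch):

  /* Data-less bios have no segments, so skip the gap accounting. */
  if (!bio_has_data(bio))
          return;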

Fixes: 2f6b2565d4 ("block: accumulate memory segment gaps per bio")
Reported-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 08:35:59 -07:00
YangWen
aee4d5a521 ntfs3: fix double free of sbi->options->nls and clarify ownership of fc->fs_private
commit 02f312754c ("ntfs3: fix use-after-free of sbi->options in cmp_fnames") introduced a use-after-free bug
due to improper handling of sbi->options in error paths. This resulted in crashes when superblock cleanup
is performed in ntfs_put_super.

This patch ensures that the options structure and its subfields are properly freed, preventing the memory
corruption and use-after-free errors.

Fixes: 02f312754c ("ntfs3: fix use-after-free of sbi->options in cmp_fnames")
Reported-by: syzbot+cc433e4cd6d54736bf80@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=cc433e4cd6d54736bf80
Signed-off-by: YangWen <anmuxixixi@gmail.com>
[almaz.alexandrovich@paragon-software.com: added fixes and closes tags]
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-11 16:08:18 +01:00
Bartlomiej Kubik
a8a3ca23bb fs/ntfs3: Initialize allocated memory before use
KMSAN reports: Multiple uninitialized values detected:

- KMSAN: uninit-value in ntfs_read_hdr (3)
- KMSAN: uninit-value in bcmp (3)

Memory is allocated by __getname(), which is a wrapper for
kmem_cache_alloc(). This memory is used before being properly
cleared. Change kmem_cache_alloc() to kmem_cache_zalloc() to
properly allocate and clear memory before use.

Fixes: 82cae269cf ("fs/ntfs3: Add initialization of super block")
Fixes: 78ab59fee0 ("fs/ntfs3: Rework file operations")
Tested-by: syzbot+332bd4e9d148f11a87dc@syzkaller.appspotmail.com
Reported-by: syzbot+332bd4e9d148f11a87dc@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=332bd4e9d148f11a87dc

Fixes: 82cae269cf ("fs/ntfs3: Add initialization of super block")
Fixes: 78ab59fee0 ("fs/ntfs3: Rework file operations")
Tested-by: syzbot+0399100e525dd9696764@syzkaller.appspotmail.com
Reported-by: syzbot+0399100e525dd9696764@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=0399100e525dd9696764

Reviewed-by: Khalid Aziz <khalid@kernel.org>
Signed-off-by: Bartlomiej Kubik <kubik.bartlomiej@gmail.com>
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-11 16:08:02 +01:00
Caleb Sander Mateos
727a440278 ublk: return unsigned from ublk_{,un}map_io()
ublk_map_io() and ublk_unmap_io() never return negative values, and
their return values are stored in variables of type unsigned. Clarify
that they can't fail by making their return types unsigned.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 07:57:20 -07:00
Caleb Sander Mateos
6b0a29933f ublk: remove unnecessary checks in ublk_check_and_get_req()
ub = iocb->ki_filp->private_data cannot be NULL, as it's set in
ublk_ch_open() before it returns successfully. req->mq_hctx cannot be
NULL as any inflight ublk request must belong to some queue. And
req->mq_hctx->driver_data cannot be NULL as it's set to the ublk_queue
pointer in ublk_init_hctx(). So drop the unnecessary checks.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 07:57:05 -07:00
Pavel Begunkov
712fbe97c3 io_uring: move flags check to io_uring_sanitise_params
io_uring_sanitise_params() sanitises most of the setup flags invariants;
move the IORING_SETUP_FLAGS check from io_uring_setup() into it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 07:53:33 -07:00
Pavel Begunkov
01405895c1 io_uring: use mem_is_zero to check ring params
mem_is_zero() does the job without hand-rolled loops; use it to verify the
reserved fields of ring params.
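
For example, the reserved fields can then be checked in a single call
(sketch; resv is the usual reserved array in struct io_uring_params):

  if (!mem_is_zero(&p->resv, sizeof(p->resv)))
          return -EINVAL;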

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 07:53:33 -07:00
Pavel Begunkov
7bb21a52e2 io_uring: pass sq entries in the params struct
There is no need to pass the user requested number of SQ entries
separately from the main parameter structure io_uring_params. Initialise
it at the beginning and stop passing it in favour of struct
io_uring_params::sq_entries.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 07:53:33 -07:00
Pavel Begunkov
4aed5b4e6d io_uring: add helper calculating region byte size
There have been type-related issues with region size calculation; add a
utility helper function that returns the size and handles type
conversions right.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 07:53:33 -07:00
Pavel Begunkov
21bd7b14a3 io_uring/query: buffer size calculations with a union
Instead of having an array of a calculated size as a buffer, put all
query uapi structures into a union and pass that around. That way
everything is well typed, and the compiler will prevent opcode handling
using a structure not accounted into the buffer size.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 07:53:33 -07:00
David Wei
b6c5f9454e io_uring/zcrx: call netdev_queue_get_dma_dev() under instance lock
netdev ops must be called under instance lock or rtnl_lock, but
io_register_zcrx_ifq() isn't doing this for netdev_queue_get_dma_dev().
Fix this by taking the instance lock using netdev_get_by_index_lock().

Extended the instance lock section to include attaching a memory
provider. Could not move io_zcrx_create_area() outside, since the dmabuf
codepath IORING_ZCRX_AREA_DMABUF requires ifq->dev.

Fixes: 59b8b32ac8 ("io_uring/zcrx: add support for custom DMA devices")
Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 07:53:33 -07:00
David Wei
c07a491c1b net: export netdev_get_by_index_lock()
Need to call netdev_get_by_index_lock() from io_uring/zcrx.c, but it is
currently private to net. Export the function in linux/netdevice.h.

Signed-off-by: David Wei <dw@davidwei.uk>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 07:53:33 -07:00
Chaitanya Kulkarni
86afb1cdc2 block: add lockdep to queue_limits_commit_update()
queue_limits_commit_update() expects q->limits_lock to be held by
the caller (via queue_limits_start_update()).

The API pattern is:

  lim = queue_limits_start_update(q);  /* acquires lock */
              /* modify lim */
  queue_limits_commit_update(q, &lim); /* releases lock */

  OR

  queue_limits_commit_update_frozen(q, &lim);
   lim = queue_limits_start_update(q); /* acquires lock */
  queue_limits_commit_update(q, &lim); /* releases lock */

Add lockdep_assert_held() to report incorrect API usage.
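
The assertion itself is a one-liner at the top of
queue_limits_commit_update() (sketch):

  lockdep_assert_held(&q->limits_lock);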

Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 07:51:08 -07:00
Zheng Qixing
1649714b93 nbd: defer config unlock in nbd_genl_connect
There is one use-after-free warning when running NBD_CMD_CONNECT and
NBD_CLEAR_SOCK:

nbd_genl_connect
  nbd_alloc_and_init_config // config_refs=1
  nbd_start_device // config_refs=2
  set NBD_RT_HAS_CONFIG_REF			open nbd // config_refs=3
  recv_work done // config_refs=2
						NBD_CLEAR_SOCK // config_refs=1
						close nbd // config_refs=0
  refcount_inc -> uaf

------------[ cut here ]------------
refcount_t: addition on 0; use-after-free.
WARNING: CPU: 24 PID: 1014 at lib/refcount.c:25 refcount_warn_saturate+0x12e/0x290
 nbd_genl_connect+0x16d0/0x1ab0
 genl_family_rcv_msg_doit+0x1f3/0x310
 genl_rcv_msg+0x44a/0x790

The issue can be easily reproduced by adding a small delay before
refcount_inc(&nbd->config_refs) in nbd_genl_connect():

        mutex_unlock(&nbd->config_lock);
        if (!ret) {
                set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
+               printk("before sleep\n");
+               mdelay(5 * 1000);
+               printk("after sleep\n");
                refcount_inc(&nbd->config_refs);
                nbd_connect_reply(info, nbd->index);
        }
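
A sketch of the corresponding fix, keeping config_lock held across the
refcount_inc() so a concurrent NBD_CLEAR_SOCK/close cannot drop the last
reference in between (derived from the snippet above, not the exact diff):

        if (!ret) {
                set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
                refcount_inc(&nbd->config_refs);
                nbd_connect_reply(info, nbd->index);
        }
        mutex_unlock(&nbd->config_lock);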

Fixes: e46c7287b1 ("nbd: add a basic netlink interface")
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-11 07:50:15 -07:00
Jens Axboe
3d076988aa Merge tag 'md-6.19-20251111' of gitolite.kernel.org:pub/scm/linux/kernel/git/mdraid/linux into for-6.19/block
Pull MD changes from Yu:

"- Change maintainer's email address (Yu Kuai)
 - Data can be lost if array is created with different lbs devices, fix
   this problem and record lbs of the array in metadata (Li Nan)
 - Fix rcu protection for md_thread (Yun Zhou)
 - Fix mddev kobject lifetime regression (Xiao Ni)
 - Enable atomic writes for md-linear (John Garry)
 - Some cleanups (Chen Ni, Huiwen He, Wu Guanghao)"

* tag 'md-6.19-20251111' of gitolite.kernel.org:pub/scm/linux/kernel/git/mdraid/linux:
  md: allow configuring logical block size
  md: add check_new_feature module parameter
  md/raid0: Move queue limit setup before r0conf initialization
  md: init bioset in mddev_init
  md: delete md_redundancy_group when array is becoming inactive
  md: prevent adding disks with larger logical_block_size to active arrays
  md/raid5: remove redundant __GFP_NOWARN
  md: avoid repeated calls to del_gendisk
  md/md-llbitmap: Remove unneeded semicolon
  md/md-linear: Enable atomic writes
  Factor out code into md_should_do_recovery()
  md: fix rcu protection in md_wakeup_thread
  md: delete mddev kobj before deleting gendisk kobj
  MAINTAINERS: Update Yu Kuai's E-mail address
2025-11-11 06:58:11 -07:00
Paolo Abeni
21f43f4a2b Merge branch 'devlink-eswitch-inactive-mode'
Saeed Mahameed says:

====================
devlink eswitch inactive mode

Before having traffic flow through an eswitch, a user may want to have the
ability to block traffic towards the FDB until FDB is fully programmed and the
user is ready to send traffic to it. For example: when two eswitches are present
for vports in a multi-PF setup, one eswitch may take over the traffic from the
other when the user chooses. Before this take over, a user may want to first
program the inactive eswitch and then once ready redirect traffic to this new
eswitch.

This series introduces a user-configurable mode for an eswitch that allows
dynamically switching between active and inactive modes. When inactive, traffic
does not flow through the eswitch. While inactive, steering pipeline
configuration can be done (e.g. adding TC rules, discovering representors,
enabling the desired SDN modes such as bridge/OVS/DPDK/etc). Once configuration
is completed, a user can set the eswitch mode to active and have traffic flow
through. This allows admins to upgrade forwarding pipeline rules with very
minimal downtime and packet drops.

A user can start the eswitch in switchdev or switchdev_inactive mode.

Active: Traffic is enabled on this eswitch FDB.
Inactive: Traffic is ignored/dropped on this eswitch FDB.

An example use case:
$ devlink dev eswitch set pci/0000:08:00.1 mode switchdev_inactive
Setup FDB pipeline and netdev representors
...
Once ready to start receiving traffic
$ devlink dev eswitch set pci/0000:08:00.1 mode switchdev

v2: https://lore.kernel.org/all/20251107000831.157375-1-saeed@kernel.org/
v1: https://lore.kernel.org/all/20251016013618.2030940-1-saeed@kernel.org/
====================

Link: https://patch.msgid.link/20251108070404.1551708-1-saeed@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-11 13:17:57 +01:00
Saeed Mahameed
9da611df15 net/mlx5: E-Switch, support eswitch inactive mode
Add support for the eswitch switchdev inactive mode.

Inactive mode: Drop all traffic going to the FDB, remove MPFS L2 rules
and disconnect adjacent vports.

Active mode: Traffic flows through the FDB, the MPFS table is populated,
and adjacent vports are connected.

Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Adithya Jayachandran <ajayachandra@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251108070404.1551708-4-saeed@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-11 13:17:54 +01:00
Saeed Mahameed
9902b6381d net/mlx5: MPFS, add support for dynamic enable/disable
MPFS (Multi PF Switch) is enabled by default in Multi-Host environments.
The driver keeps a list of the desired unicast MAC addresses of all vports
(VFs/SFs) and applies it to HW via the L2_table FW command.

Add an API to dynamically apply the list of MACs to HW when needed, for the
next patches to utilize this new API in the devlink eswitch active/inactive
uAPI.

Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Adithya Jayachandran <ajayachandra@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251108070404.1551708-3-saeed@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-11 13:17:54 +01:00
Saeed Mahameed
0e535824d0 devlink: Introduce switchdev_inactive eswitch mode
Adds DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE attribute to UAPI and
documentation.

Before having traffic flow through an eswitch, a user may want to have the
ability to block traffic towards the FDB until FDB is fully programmed and
the user is ready to send traffic to it. For example: when two eswitches
are present for vports in a multi-PF setup, one eswitch may take over the
traffic from the other when the user chooses.
Before this takeover, a user may want to first program the inactive
eswitch and then once ready redirect traffic to this new eswitch.

switchdev modes transition semantics:

legacy->switchdev_inactive: Create switchdev mode normally, traffic not
  allowed to flow yet.

switchdev_inactive->switchdev: Enable traffic to flow.

switchdev->switchdev_inactive: Block traffic on the FDB; FDB and
  representors state and content are preserved.

When the eswitch is configured to this mode, traffic is ignored/dropped on
this eswitch FDB, while the current configuration is kept, e.g. FDB rules
and netdev representors are kept available, and FDB programming is allowed.

Example:
 # start inactive switchdev
devlink dev eswitch set pci/0000:08:00.1 mode switchdev_inactive
 # setup TC rules, representors etc ..
 # activate
devlink dev eswitch set pci/0000:08:00.1 mode switchdev

Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251108070404.1551708-2-saeed@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-11 13:17:53 +01:00
Paolo Abeni
8180c4fa54 Merge branch 'tools-ynl-turn-the-page-pool-sample-into-a-real-tool'
Jakub Kicinski says:

====================
tools: ynl: turn the page-pool sample into a real tool

The page-pool YNL sample is quite useful. It's helps calculate
recycling rate and memory consumption. Since we still haven't
figured out a way to integrate with iproute2 (not for the lack
of thinking how to solve it) - create a ynltool command in ynl.

Add page-pool and qstats support.

Most commands can use the Python YNL CLI directly but low level
stats often need aggregation or some math on top to be useful.
Specifically in this patch set:
 - page pool stats are aggregated and recycling rate computed
 - per-queue stats are used to compute traffic balance across queues

v1: https://lore.kernel.org/20251104232348.1954349-1-kuba@kernel.org
====================

Link: https://patch.msgid.link/20251107162227.980672-1-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-11 12:46:21 +01:00
Jakub Kicinski
9eef97a9de tools: ynltool: add traffic distribution balance
The main, if not only, use case for per-queue stats today is checking
for traffic imbalance. Add a simple traffic balance analysis to qstats.

 $ ynltool qstat balance
 eth0 rx 44 queues:
  rx-packets  : cv=6.9% ns=24.2% stddev=512006493
                min=6278921110 max=8011570575 mean=7437054644
  rx-bytes    : cv=6.9% ns=24.1% stddev=759670503060
                min=9326315769440 max=11884393670786 mean=11035439201354
  ...

  $ ynltool -j qstat balance | jq
  [
   {
    "ifname": "eth0",
    "ifindex": 2,
    "queue-type": "rx",
    "rx-packets": {
      "queue-count": 44,
      "min": 6278301665,
      "max": 8010780185,
      "mean": 7.43635E+9,
      "stddev": 5.12012E+8,
      "coefficient-of-variation": 6.88525,
      "normalized-spread": 24.249
    },
   ...

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20251107162227.980672-5-kuba@kernel.org
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-11 12:21:04 +01:00
Jakub Kicinski
3f0a638d45 tools: ynltool: add qstats support
$ ynltool qstat
  eth0        rx-packets:       493192163        rx-bytes:   1442544543997
              tx-packets:       745999838        tx-bytes:   4574215826482
                 tx-stop:            7033         tx-wake:            7033

  $ ynltool qstat show group-by queue
  eth0  rx-0     packets:        70196880           bytes:    178633973750
  eth0  rx-1     packets:        63623419           bytes:    197274745250
  ...
  eth0  tx-1     packets:        98645810           bytes:    631247647938
                    stop:            1048            wake:            1048
  eth0  tx-2     packets:        86775824           bytes:    563930471952
                    stop:            1126            wake:            1126
  ...

  $ ynltool -j qstat  | jq
  [
   {
    "ifname": "eth0",
    "ifindex": 2,
    "rx": {
      "packets": 493396439,
      "bytes": 1443608198921
    },
    "tx": {
      "packets": 746239978,
      "bytes": 4574333772645,
      "stop": 7072,
      "wake": 7072
    }
   }
  ]

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20251107162227.980672-4-kuba@kernel.org
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-11 12:21:04 +01:00
Jakub Kicinski
124dac9b42 tools: ynltool: add page-pool stats
Replace the page-pool sample with page pool support in ynltool.

 # ynltool page-pool stats
    eth0[2]	page pools: 18 (zombies: 0)
		refs: 171456 bytes: 702283776 (refs: 0 bytes: 0)
		recycling: 97.3% (alloc: 2679:6134966 recycle: 1250981:4719386)
 # ynltool -j page-pool stats | jq
 [
  {
    "ifname": "eth0",
    "ifindex": 2,
    "page_pools": 18,
    "zombies": 0,
    "live": {
      "refs": 171456,
      "bytes": 702283776
    },
    "zombie": {
      "refs": 0,
      "bytes": 0
    },
    "recycling_pct": 97.2746,
    "alloc": {
      "slow": 2679,
      "fast": 6135029
    },
    "recycle": {
      "ring": 1250997,
      "cache": 4719432
    }
  }
 ]

 # ynltool page-pool stats group-by pp
 pool id: 108  dev: eth0[2]  napi: 530
   inflight: 9472 pages 38797312 bytes
   recycling: 95.5% (alloc: 148:208379 recycle: 45386:153842)
 pool id: 107  dev: eth0[2]  napi: 529
   inflight: 9408 pages 38535168 bytes
   recycling: 94.9% (alloc: 147:180178 recycle: 42251:128808)

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20251107162227.980672-3-kuba@kernel.org
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-11 12:21:04 +01:00
Jakub Kicinski
b02d229013 tools: ynltool: create skeleton for the C command
Based on past discussions it seems like integration of YNL into
iproute2 is unlikely. YNL itself is not great as a C library,
since it has no backward compat (we routinely change types).

Most of the operations can be performed with the generic Python
CLI directly. There is, however, a handful of operations where
summarization of kernel output is very useful (mostly related
to stats: page-pool, qstat).

Create a command (inspired by bpftool, I think it stood the test
of time reasonably well) to be able to plug the subcommands into.

Link: https://lore.kernel.org/1754895902-8790-1-git-send-email-ernis@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20251107162227.980672-2-kuba@kernel.org
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-11 12:21:04 +01:00
Christoph Hellwig
6a7bb6ccd0 xfs: reduce ilock roundtrips in xfs_qm_vop_dqalloc
xfs_qm_vop_dqalloc only needs the (exclusive) ilock for attaching dquots
to the inode if not done so yet.  All the other locks don't touch the inode
and don't need the ilock - the i_rwsem / iolock protects against changes
to the IDs while we are in a method, and the ilock would not help because
dropping it for the dqget calls would be racy anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
13d3c1a045 xfs: move xfs_dquot_tree calls into xfs_qm_dqget_cache_{lookup,insert}
These are the low-level functions that needs them, so localize the
(trivial) calculation of the radix tree root there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
b6d2ab27cc xfs: move quota locking into xrep_quota_item
Drop two redundant lock roundtrips by not requiring q_lock to be held on
entry and return.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
a2ebb21f8a xfs: move quota locking into xqcheck_commit_dquot
Drop two redundant lock roundtrips by not requiring q_lock to be held on
entry and return.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
7dd30acb4b xfs: move q_qlock locking into xqcheck_compare_dquot
Instead of having both callers do it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
bfca8760f4 xfs: move q_qlock locking into xchk_quota_item
This avoids a pointless roundtrip because ilock needs to be taken first.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
a536bf9bec xfs: push q_qlock acquisition from xchk_dquot_iter to the callers.
There is no good reason to take q_qlock in xchk_dquot_iter, which just
provides a reference to the dquot.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
e85e74e4c9 xfs: remove q_qlock locking in xfs_qm_scall_setqlim
q_type can't change for an existing dquot, so there is no need for
the locking here.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
55c1bc3eb9 xfs: return the dquot unlocked from xfs_qm_dqget
There is no reason to lock the dquot in xfs_qm_dqget, which just acquires
a reference.  Move the locking to the callers, or remove it in cases where
the caller instantly unlocks the dquot.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
bf5066e169 xfs: fold xfs_qm_dqattach_one into xfs_qm_dqget_inode
xfs_qm_dqattach_one is a thin wrapper around xfs_qm_dqget_inode.  Move
the extra asserts into xfs_qm_dqget_inode, drop the unneeded q_qlock
roundtrip and merge the two functions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
d0f93c0d7c xfs: xfs_qm_dqattach_one is never called with a non-NULL *IO_idqpp
The caller already checks that, so replace the handling of this case with
an assert that it does not happen.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
0494f04643 xfs: consolidate q_qlock locking in xfs_qm_dqget and xfs_qm_dqget_inode
Move taking q_qlock from the cache lookup / insert helpers into the
main functions and do it just before returning to the caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
6b6e6e7521 xfs: remove xfs_qm_dqput and optimize dropping dquot references
With the new lockref-based dquot reference counting, there is no need to
hold q_qlock for dropping the reference.  Make xfs_qm_dqrele the main
function to drop dquot references without taking q_qlock and convert all
callers of xfs_qm_dqput to unlock q_qlock and call xfs_qm_dqrele instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:58 +01:00
Christoph Hellwig
0c5e80bd57 xfs: use a lockref for the xfs_dquot reference count
The xfs_dquot structure currently uses the anti-pattern of using the
in-object lock that protects the content to also serialize reference
count updates for the structure, leading to a cumbersome free path.
This is partially papered over by the fact that we never free the dquot
directly but always through the LRU.  Switch to use a lockref instead and
move the reference counter manipulations out of q_qlock.

To make this work, xfs_qm_flush_one and xfs_qm_flush_one are converted to
acquire a dquot reference while flushing to integrate with the lockref
"get if not dead" scheme.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:57 +01:00
Christoph Hellwig
6129b088e1 xfs: remove xfs_dqunlock and friends
There's really no point in wrapping the basic mutex operations.  Remove
the wrapper to ease lock analysis annotations and make the code a little
easier to read.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:57 +01:00
Christoph Hellwig
36cebabde7 xfs: don't treat all radix_tree_insert errors as -EEXIST
Return other errors to the caller instead.  Note that there really
shouldn't be any other errors because the entry is preallocated, but
if there were, we'd better return them instead of retrying forever.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:57 +01:00
Christoph Hellwig
005d5ae0c5 xfs: make qi_dquots a 64-bit value
qi_dquots counts all quotas in the file system, which can be up to
3 * UINT_MAX and overflow a 32-bit counter, but can't be negative.
Make qi_dquots a uint64_t, and saturate the value to UINT_MAX for
userspace reporting.
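
A minimal illustration of the saturation described above (helper name is made
up for this sketch, not taken from the xfs code):

```
#include <linux/kernel.h>
#include <linux/limits.h>

/* Clamp a 64-bit count to what a 32-bit reporting field can hold. */
static inline u32 saturated_dquot_count(u64 qi_dquots)
{
	return min_t(u64, qi_dquots, UINT_MAX);
}
```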

Fixes: 1da177e4c3 ("Linux-2.6.12-rc2")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:57 +01:00
Christoph Hellwig
204c8f77e8 xfs: don't leak a locked dquot when xfs_dquot_attach_buf fails
xfs_qm_quotacheck_dqadjust acquired the dquot through xfs_qm_dqget,
which means it owns a reference and holds q_qlock.  Both need to
be dropped on an error exit.

Cc: <stable@vger.kernel.org> # v6.13
Fixes: ca378189fd ("xfs: convert quotacheck to attach dquot buffers")
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:45:22 +01:00
Christoph Hellwig
0ec73eb3f1 xfs: add a xfs_groups_to_rfsbs helper
Plus a rtgroup wrapper and use that to avoid overflows when converting
zone/rtg counts to block counts.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2025-11-11 11:38:05 +01:00
Abdun Nihaal
5e88e86411 wifi: cw1200: Fix potential memory leak in cw1200_bh_rx_helper()
In one of the error paths, the memory allocated for skb_rx is not freed.
Fix that by freeing it before returning.

Fixes: a910e4a94f ("cw1200: add driver for the ST-E CW1100 & CW1200 WLAN chipsets")
Signed-off-by: Abdun Nihaal <nihaal@cse.iitm.ac.in>
Link: https://patch.msgid.link/20251110175316.106591-1-nihaal@cse.iitm.ac.in
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-11 11:11:24 +01:00
Benjamin Berg
9e23063a79 wifi: mac80211: make monitor link info check more specific
Verify that only one of the permitted change flags is set when changing
the link of a monitor interface. Before, the WARN_ON_ONCE would accept
anything if mu_mimo_owner was set.

Also, split out the mu_mimo_owner flag and enable it for all interface
types. The option is set during association when VHT is available and it
is not expected that any configuration of the MU groups is done without
it being set.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251110141948.6696dba8678d.Icafac3be4724825dd6140e4407bae3a2adb593a5@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-11 11:05:19 +01:00
Benjamin Berg
a5aa46f1ac wifi: mac80211: track MU-MIMO configuration on disabled interfaces
For monitoring, userspace will try to configure the VIF sdata, while the
driver may see the monitor_sdata that is created when only monitor
interfaces are up. This causes the odd situation that it may not be
possible to store the MU-MIMO configuration on monitor_sdata.

Fix this by storing that information on the VIF sdata and updating the
monitor_sdata when available and the interface is up. Also, adjust the
code that adds monitor_sdata so that it will configure MU-MIMO based on
the newly added interface or one of the existing ones.

This should give a mostly consistent behaviour when configuring MU-MIMO
on sniffer interfaces. Should the user configure MU-MIMO on multiple
sniffer interfaces, then mac80211 will simply select one of the
configurations. This behaviour should be good enough and avoids breaking
user expectations in the common scenarios.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251110141514.677915f8f6bb.If4e04a57052f9ca763562a67248b06fd80d0c2c1@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-11 11:05:10 +01:00
Pagadala Yesu Anjaneyulu
b54cf0f449 wifi: cfg80211/mac80211: Add fallback mechanism for INDOOR_SP connection
Implement fallback to LPI mode when SP mode is not permitted
by regulatory constraints for INDOOR_SP connections.
Limit fallback mechanism to client mode.

Signed-off-by: Pagadala Yesu Anjaneyulu <pagadala.yesu.anjaneyulu@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251110140806.8b43201a34ae.I37fc7bb5892eb9d044d619802e8f2095fde6b296@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-11 11:05:00 +01:00
Pagadala Yesu Anjaneyulu
e18efacc9c wifi: cfg80211/mac80211: clean up duplicate ap_power handling
Move duplicated ap_power type handling code to an inline
function in cfg80211.

Signed-off-by: Pagadala Yesu Anjaneyulu <pagadala.yesu.anjaneyulu@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251110140806.959948da1cb5.I893b5168329fb3232f249c182a35c99804112da6@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-11 11:05:00 +01:00
Emmanuel Grumbach
706edca679 wifi: cfg80211: use a C99 initializer in wiphy_register
struct regulatory_request was not fully initialized. While this is not
really a big deal because nl80211_send_reg_change_event doesn't look at
the other fields, it still makes sense to zero all the other fields as
Coverity suggests.
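
A generic illustration of the point (not the cfg80211 code): with a C99
designated initializer every member that is not named explicitly is zeroed.

```
struct example_request {
	int initiator;
	int user_reg_hint_type;
	char alpha2[3];
};

static void example(void)
{
	/* .user_reg_hint_type and .alpha2 are implicitly zero-initialized */
	struct example_request req = {
		.initiator = 1,
	};
	(void)req;
}
```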

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251110140230.f8d4fcb1328b.I87170b1caef04356809838e684c9499f5806e624@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-11 11:04:38 +01:00
Jason Xing
8da7bea7db xsk: add indirect call for xsk_destruct_skb
Since Eric proposed an idea about adding indirect call wrappers for
UDP and managed to see a huge improvement[1], the same situation can
also be applied in xsk scenario.

This patch adds an indirect call for xsk and helps the current copy mode
improve performance by a stable ~1%, observed with IXGBE loaded at
10 Gb/sec. If the throughput grows, the positive effect will be magnified.
I applied this patch on top of the batch xmit series[2] and saw a <5%
improvement from our internal application, though that result is a little
unstable.

Use INDIRECT wrappers to keep xsk_destruct_skb static as it used to
be when the mitigation config is off.

Be aware of the freeing path that can be very hot since the frequency
can reach around 2,000,000 times per second with the xdpsock test.

[1]: https://lore.kernel.org/netdev/20251006193103.2684156-2-edumazet@google.com/
[2]: https://lore.kernel.org/all/20251021131209.41491-1-kerneljasonxing@gmail.com/

Suggested-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://patch.msgid.link/20251031103328.95468-1-kerneljasonxing@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-11 10:21:08 +01:00
Li Nan
62ed1b5822 md: allow configuring logical block size
Previously, raid array used the maximum logical block size (LBS)
of all member disks. Adding a larger LBS disk at runtime could
unexpectedly increase RAID's LBS, risking corruption of existing
partitions. This can be reproduced by:

```
  # LBS of sd[de] is 512 bytes, sdf is 4096 bytes.
  mdadm -CRq /dev/md0 -l1 -n3 /dev/sd[de] missing --assume-clean

  # LBS is 512
  cat /sys/block/md0/queue/logical_block_size

  # create partition md0p1
  parted -s /dev/md0 mklabel gpt mkpart primary 1MiB 100%
  lsblk | grep md0p1

  # LBS becomes 4096 after adding sdf
  mdadm --add -q /dev/md0 /dev/sdf
  cat /sys/block/md0/queue/logical_block_size

  # partition lost
  partprobe /dev/md0
  lsblk | grep md0p1
```

Simply restricting larger-LBS disks is inflexible. In some scenarios,
only disks with 512 bytes LBS are available currently, but later, disks
with 4KB LBS may be added to the array.

Making LBS configurable is the best way to solve this scenario.
After this patch, the raid will:
  - store LBS in disk metadata
  - add a read-write sysfs 'mdX/logical_block_size'

Future mdadm should support setting LBS via metadata field during RAID
creation and the new sysfs. Though the kernel allows runtime LBS changes,
users should avoid modifying it after creating partitions or filesystems
to prevent compatibility issues.

Only 1.x metadata supports configurable LBS. 0.90 metadata inits all
fields to default values at auto-detect. Supporting 0.90 would require
more extensive changes and no such use case has been observed.

Note that many RAID paths rely on PAGE_SIZE alignment, including for
metadata I/O. A larger LBS than PAGE_SIZE will result in metadata
read/write failures. So this config should be prevented.

Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-6-linan666@huaweicloud.com
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-11 11:20:15 +08:00
Li Nan
9c47127a80 md: add check_new_feature module parameter
Raid checks if pad3 is zero when loading superblock from disk. Arrays
created with new features may fail to assemble on old kernels as pad3
is used.

Add module parameter check_new_feature to bypass this check.

Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-5-linan666@huaweicloud.com
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-11 11:19:54 +08:00
Li Nan
2107457e31 md/raid0: Move queue limit setup before r0conf initialization
Prepare for making logical blocksize configurable. This change has no
impact until logical block size becomes configurable.

Move raid0_set_limits() before create_strip_zones(). It is safe as fields
modified in create_strip_zones() do not involve mddev configuration, and
rdev modifications there are not used in raid0_set_limits().

'blksize' in create_strip_zones() fetches mddev's logical block size,
which is already the maximum across all rdevs, so the later max() can be
removed.

Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-4-linan666@huaweicloud.com
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-11 11:19:27 +08:00
Li Nan
381a3ce1c0 md: init bioset in mddev_init
IO operations may be needed before md_run(), such as updating metadata
after writing sysfs. Without bioset, this triggers a NULL pointer
dereference as below:

 BUG: kernel NULL pointer dereference, address: 0000000000000020
 Call Trace:
  md_update_sb+0x658/0xe00
  new_level_store+0xc5/0x120
  md_attr_store+0xc9/0x1e0
  sysfs_kf_write+0x6f/0xa0
  kernfs_fop_write_iter+0x141/0x2a0
  vfs_write+0x1fc/0x5a0
  ksys_write+0x79/0x180
  __x64_sys_write+0x1d/0x30
  x64_sys_call+0x2818/0x2880
  do_syscall_64+0xa9/0x580
  entry_SYSCALL_64_after_hwframe+0x4b/0x53

Reproducer
```
  mdadm -CR /dev/md0 -l1 -n2 /dev/sd[cd]
  echo inactive > /sys/block/md0/md/array_state
  echo 10 > /sys/block/md0/md/new_level
```

mddev_init() can only be called once per mddev, no need to test if bioset
has been initialized anymore.

Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-3-linan666@huaweicloud.com
Fixes: d981ed8419 ("md: Add new_level sysfs interface")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-11 11:19:10 +08:00
Li Nan
0ce112d917 md: delete md_redundancy_group when array is becoming inactive
'md_redundancy_group' is created in md_run() and deleted in del_gendisk(),
but these are not paired. Writing inactive/active to sysfs array_state can
trigger md_run() multiple times without del_gendisk(), leading to
duplicate creation as below:

 sysfs: cannot create duplicate filename '/devices/virtual/block/md0/md/sync_action'
 Call Trace:
  dump_stack_lvl+0x9f/0x120
  dump_stack+0x14/0x20
  sysfs_warn_dup+0x96/0xc0
  sysfs_add_file_mode_ns+0x19c/0x1b0
  internal_create_group+0x213/0x830
  sysfs_create_group+0x17/0x20
  md_run+0x856/0xe60
  ? __x64_sys_openat+0x23/0x30
  do_md_run+0x26/0x1d0
  array_state_store+0x559/0x760
  md_attr_store+0xc9/0x1e0
  sysfs_kf_write+0x6f/0xa0
  kernfs_fop_write_iter+0x141/0x2a0
  vfs_write+0x1fc/0x5a0
  ksys_write+0x79/0x180
  __x64_sys_write+0x1d/0x30
  x64_sys_call+0x2818/0x2880
  do_syscall_64+0xa9/0x580
  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 md: cannot register extra attributes for md0

Its creation depends on 'pers', so its lifecycle cannot be aligned with
the gendisk. Fix this issue by triggering 'md_redundancy_group' deletion
when the array is becoming inactive.

Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-2-linan666@huaweicloud.com
Fixes: 790abe4d77 ("md: remove/add redundancy group only in level change")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-11 11:18:51 +08:00
Li Nan
6c6b66f65e md: prevent adding disks with larger logical_block_size to active arrays
When adding a disk to a md array, avoid updating the array's
logical_block_size to match the new disk. This prevents accidental
partition table loss that renders the array unusable.

The later patch will introduce a way to configure the array's
logical_block_size.

The issue was introduced before Linux 2.6.12-rc2.

Link: https://lore.kernel.org/linux-raid/20250918115759.334067-2-linan666@huaweicloud.com/
Fixes: d2e45eace8 ("[PATCH] Fix raid "bio too big" failures")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-11 11:17:33 +08:00
Heiner Kallweit
b981e100c1 net: dsa: loop: use new helper fixed_phy_register_100fd to simplify the code
Use new helper fixed_phy_register_100fd to simplify the code.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://patch.msgid.link/922f1b45-1748-4dd2-87eb-9d018df44731@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 18:19:18 -08:00
Marco Crivellari
e483a615a6 isdn: kcapi: add WQ_PERCPU to alloc_workqueue users
Currently if a user enqueues a work item using schedule_delayed_work() the
used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work() that is using system_wq and queue_work(), that makes use
again of WORK_CPU_UNBOUND.
This lack of consistency cannot be addressed without refactoring the API.

alloc_workqueue() treats all queues as per-CPU by default, while unbound
workqueues must opt-in via WQ_UNBOUND.

This default is suboptimal: most workloads benefit from unbound queues,
allowing the scheduler to place worker threads where they’re needed and
reducing noise when CPUs are isolated.

This continues the effort to refactor workqueue APIs, which began with
the introduction of new workqueues and a new alloc_workqueue flag in:

commit 128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

This change adds a new WQ_PERCPU flag to explicitly request
alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified.

With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND),
any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND
must now use WQ_PERCPU.

Once migration is complete, WQ_UNBOUND can be removed and unbound will
become the implicit default.
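
A sketch of the conversion pattern described above (workqueue name and the
surrounding init function are illustrative, not the kcapi code):

```
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
	/* explicitly keep per-CPU behaviour instead of relying on the default */
	example_wq = alloc_workqueue("example", WQ_PERCPU, 0);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}
```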

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Link: https://patch.msgid.link/20251107134452.198378-1-marco.crivellari@suse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 18:14:20 -08:00
Ping-Ke Shih
12e6bea1aa wifi: rtlwifi: rtl8188ee: correct allstasleep in P2P PS H2C command
The "->allstasleep" variable is a 1 bit bitfield. It can only be 0 or 1.
This "= -1" assignement should be "= 0" as other chips and vendor driver.

Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Closes: https://lore.kernel.org/linux-wireless/5c992936-4e7b-4c0a-abfc-0ec0fb9ef9fd@suswa.mountain/T/#t
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/1762390172-21091-1-git-send-email-pkshih@realtek.com
2025-11-11 10:10:21 +08:00
Bitterblue Smith
623c177323 wifi: rtw89: Enable the new rtw89_8852au module
Tested in station mode and a little in AP mode.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/180a5e77-9297-4ffc-80d5-191dfef47661@gmail.com
2025-11-11 10:04:13 +08:00
Bitterblue Smith
0029ccab53 wifi: rtw89: Add rtw8852au.c
This is the entry point of the new rtw89_8852au module.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/9580e5be-2bd1-45f9-ab75-616b86e25694@gmail.com
2025-11-11 10:02:58 +08:00
Bitterblue Smith
1dfd11e700 wifi: rtw89: 8852a: Accept USB devices and load their MAC address
Make rtw8852a_read_efuse() accept USB devices and load the MAC
address from the correct offset.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/97412e2c-ac98-403c-9056-9d9fe1ed9f28@gmail.com
2025-11-11 10:02:49 +08:00
Bitterblue Smith
c19b106609 wifi: rtw89: Add rtw8852a_hfc_param_ini_usb
"hfc" means "hci fc" which is "Host Control Interface Flow Control".
These are some parameters needed for RTL8852AU.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/d1032888-2a53-4c52-a8b9-6e00cd6758dc@gmail.com
2025-11-11 10:02:40 +08:00
Bitterblue Smith
0eea5e0f03 wifi: rtw89: Add rtw8852a_dle_mem_usb
Add rtw8852a_dle_mem_usb and its various quotas and sizes in struct
rtw89_mac_size_set.

"dle" could be "Data Link Engine" or "Double Link Engine". These are
some parameters needed for RTL8852AU.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/d0a09039-97a8-4501-b023-510c126d8c61@gmail.com
2025-11-11 10:01:36 +08:00
Bitterblue Smith
233542f5b4 wifi: rtw89: Use the correct power sequences for USB/SDIO
Make rtw89_mac_pwr_seq() select the right parts of the power sequences
based on the interface type.

This is only relevant for RTL8852A. The other chips don't use power
sequences.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/dec13310-06eb-429e-acb8-4c5b62656836@gmail.com
2025-11-11 10:00:12 +08:00
Jakub Kicinski
d5a9ae217b Merge branch 'gve-improve-rx-buffer-length-management'
Ankit Garg says:

====================
gve: Improve RX buffer length management

This patch series improves the management of the RX buffer length for
the DQO queue format in the gve driver. The goal is to make RX buffer
length config more explicit, easy to change, and performant by default.

We accomplish that in four patches:

1.  Currently, the buffer length is implicitly coupled with the header
    split setting, which is an unintuitive and restrictive design. The
    first patch decouples the RX buffer length from the header split
    configuration.

2.  The second patch is a preparatory step for the third. It converts the XDP
    config verification method to use extack for better error reporting.

3.  The third patch exposes the `rx_buf_len` parameter to userspace via
    ethtool, allowing users to directly view or modify the RX buffer length
    if supported by the device.

4.  The final patch improves the out-of-the-box RX single stream throughput
    by >10%  by changing the driver's default behavior to select the
    maximum supported RX buffer length advertised by the device during
    initialization.
====================

Link: https://patch.msgid.link/20251106192746.243525-1-joshwash@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:36:40 -08:00
Ankit Garg
09a81a0f4f gve: Default to max_rx_buffer_size for DQO if device supported
Change the driver's default behavior to prefer the largest available RX
buffer length supported by the device for DQO format, rather than always
using the hardcoded 2K default.

Previously, the driver would initialize with
`GVE_DEFAULT_RX_BUFFER_SIZE` (2K), even if the device advertised support
for a larger length (e.g., 4K).

Performance observations:
- With LRO disabled, we observed >10% improvement in RX single stream
throughput when MTU >=2048.
- With LRO enabled, we observed >10% improvement in RX single stream
throughput when MTU >=1460.
- No regressions were observed.

Signed-off-by: Ankit Garg <nktgrg@google.com>
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Reviewed-by: Jordan Rhee <jordanrhee@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Link: https://patch.msgid.link/20251106192746.243525-5-joshwash@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:36:37 -08:00
Ankit Garg
d235bb213f gve: Allow ethtool to configure rx_buf_len
Add support for getting and setting the RX buffer length via the
ethtool ring parameters (`ethtool -g`/`-G`). The driver restricts the
allowed buffer length to 2048 (SZ_2K) by default and allows 4096 (SZ_4K)
based on device options.

As XDP is only supported when the `rx_buf_len` is 2048, the driver now
enforces this in two places:
1.  In `gve_xdp_set`, rejecting XDP programs if the current buffer
    length is not 2048.
2.  In `gve_set_rx_buf_len_config`, rejecting buffer length changes if XDP
    is loaded and the new length is not 2048.

Signed-off-by: Ankit Garg <nktgrg@google.com>
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Reviewed-by: Jordan Rhee <jordanrhee@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Link: https://patch.msgid.link/20251106192746.243525-4-joshwash@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:36:37 -08:00
Ankit Garg
091a3b6ff2 gve: Use extack to log xdp config verification errors
Plumb extack through, as it allows us to send more detailed error messages
back, and append a 'gve' suffix to the method name per convention.

NL_SET_ERR_MSG_FMT_MOD doesn't support format strings longer than 80
characters, so the netdev warning with the actual queue count details is
kept.
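
A hedged sketch of the extack reporting pattern (the check and the message
are invented for illustration, not the actual gve code):

```
#include <linux/netdevice.h>
#include <linux/netlink.h>

static int gve_example_verify_xdp(struct net_device *dev,
				  struct netlink_ext_ack *extack)
{
	if (dev->mtu > 2048) {
		/* detailed reason reaches the netlink caller, not just dmesg */
		NL_SET_ERR_MSG_MOD(extack, "MTU too large for XDP");
		return -EOPNOTSUPP;
	}
	return 0;
}
```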

Signed-off-by: Ankit Garg <nktgrg@google.com>
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Link: https://patch.msgid.link/20251106192746.243525-3-joshwash@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:36:37 -08:00
Ankit Garg
40fef85ceb gve: Decouple header split from RX buffer length
Previously, enabling header split via `gve_set_hsplit_config` also
implicitly changed the RX buffer length to 4K (if supported by the
device). This coupled two settings that should be orthogonal; this patch
removes that side effect.

After this change, `gve_set_hsplit_config` only toggles the header
split configuration. The RX buffer length is no longer affected and
must be configured independently.

Signed-off-by: Ankit Garg <nktgrg@google.com>
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Reviewed-by: Jordan Rhee <jordanrhee@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Link: https://patch.msgid.link/20251106192746.243525-2-joshwash@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:36:37 -08:00
Jakub Kicinski
71bc986cd1 Merge branch 'net-stmmac-ingenic-convert-to-set_phy_intf_sel'
Russell King says:

====================
net: stmmac: ingenic: convert to set_phy_intf_sel()

Convert ingenic to use the new ->set_phy_intf_sel() method that was
recently introduced in net-next.

This is the largest of the conversions, as there is scope for cleanups
along with the conversion.
====================

Link: https://patch.msgid.link/aQ2tgEu-dudzlZlg@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:43 -08:00
Russell King (Oracle)
34bf68a691 net: stmmac: ingenic: use ->set_phy_intf_sel()
Rather than placing the phy_intf_sel() setup in the ->init() method,
move it to the new ->set_phy_intf_sel() method.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vHHqY-0000000Djrn-1D6H@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:40 -08:00
Russell King (Oracle)
9352f74fd1 net: stmmac: ingenic: pass ingenic_mac struct rather than plat_dat
It no longer makes sense to pass a pointer to struct
plat_stmmacenet_data when calling the set_mode() methods to only use it
to get a pointer to the ingenic_mac structure that we already had in
the caller. Simplify this by passing the struct ingenic_mac pointer.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vHHqT-0000000Djrh-0ka3@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:40 -08:00
Russell King (Oracle)
2284cca0bc net: stmmac: ingenic: simplify x2000 mac_set_mode()
As per the previous commit, we have validated that the phy_intf_sel
value is one that is permissible for this SoC, so there is no need to
handle invalid PHY interface modes. We can also apply the other
configuration based upon the phy_intf_sel value rather than the
PHY interface mode.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vHHqO-0000000Djrb-0DPN@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:40 -08:00
Russell King (Oracle)
608975d4d7 net: stmmac: ingenic: simplify mac_set_mode() methods
x1000, x1600 and x1830 only accept RMII mode. PHY_INTF_SEL_RMII is only
selected with PHY_INTERFACE_MODE_RMII, and PHY_INTF_SEL_RMII has been
validated by the SoC's .valid_phy_intf_sel bitmask. Thus, checking the
interface mode in these functions becomes unnecessary. Remove these checks.

jz4775 is similar, except for a greater set of PHY_INTF_SEL_x values.
Also remove the switch statement here.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vHHqI-0000000DjrV-3ygL@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:40 -08:00
Russell King (Oracle)
35147b5c9e net: stmmac: ingenic: move "MAC PHY control register" debug
Move the printing of the MAC PHY control register interface mode
setting into ingenic_set_phy_intf_sel(), and use phy_modes() to
print the string rather than using the enum name.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vHHqD-0000000DjrP-3aaU@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:40 -08:00
Russell King (Oracle)
0e2fa91c55 net: stmmac: ingenic: use stmmac_get_phy_intf_sel()
Use stmmac_get_phy_intf_sel() to decode the PHY interface mode to the
phy_intf_sel value, validate the result against the SoC specific
supported phy_intf_sel values, and pass into the SoC specific
set_mode() methods, replacing the local phy_intf_sel variable. This
provides the value for the MACPHYC_PHY_INFT_MASK field.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vHHq8-0000000DjrJ-2NRK@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:40 -08:00
Russell King (Oracle)
14497aaa5e net: stmmac: ingenic: prep PHY_INTF_SEL_x field after switch()
Move the preparation of the PHY_INTF_SEL_x bitfield out of the switch()
statement such that it only appears once.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vHHq3-0000000DjrD-1u8O@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:39 -08:00
Russell King (Oracle)
dbf99dc7d1 net: stmmac: ingenic: use PHY_INTF_SEL_x directly
Use the PHY_INTF_SEL_x values directly in each of the mac_set_mode
methods rather than the driver private MACPHYC_PHY_INFT_x definitions.
Remove the MACPHYC_PHY_INFT_x definitions.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vHHpy-0000000Djr7-1R1m@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:39 -08:00
Russell King (Oracle)
da6e9fd104 net: stmmac: ingenic: use PHY_INTF_SEL_x to select PHY interface
Use the common dwmac definitions for the PHY interface selection field.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vHHpt-0000000Djr1-0wwr@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:39 -08:00
Russell King (Oracle)
307a575775 net: stmmac: ingenic: simplify jz4775 mac_set_mode()
All paths configure the transmit clock as an input. Move this out of
the switch() statement to simplify the code.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vHHpo-0000000Djqv-0RD4@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:39 -08:00
Russell King (Oracle)
2dd63c3645 net: stmmac: ingenic: move ingenic_mac_init()
Move ingenic_mac_init() to between variant specific set_mode()
implementations and ingenic_mac_probe(). No code changes.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vHHpi-0000000Djqp-4910@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:30:39 -08:00
Simon Schippers
7ff14c5204 usbnet: Add support for Byte Queue Limits (BQL)
In the current implementation, usbnet uses a fixed tx_qlen of:

USB2: 60 * 1518 bytes = 91.08 KB
USB3: 60 * 5 * 1518 bytes = 454.80 KB

Such large transmit queues can be problematic, especially for cellular
modems. For example, with a typical cellular link speed of 10 Mbit/s, a
fully occupied USB3 transmit queue results in:

454.80 KB / (10 Mbit/s / 8 bit/byte) = 363.84 ms

of additional latency.

This patch adds support for Byte Queue Limits (BQL) [1] to dynamically
manage the transmit queue size and reduce latency without sacrificing
throughput.

Testing was performed on various devices using the usbnet driver for
packet transmission:

- DELOCK 66045: USB3 to 2.5 GbE adapter (ax88179_178a)
- DELOCK 61969: USB2 to 1 GbE adapter (asix)
- Quectel RM520: 5G modem (qmi_wwan)
- USB2 Android tethering (cdc_ncm)

No performance degradation was observed for iperf3 TCP or UDP traffic,
while latency for a prioritized ping application was significantly
reduced. For example, using the USB3 to 2.5 GbE adapter, which was fully
utilized by iperf3 UDP traffic, the prioritized ping was improved from
1.6 ms to 0.6 ms. With the same setup but with a 100 Mbit/s Ethernet
connection, the prioritized ping was improved from 35 ms to 5 ms.

[1] https://lwn.net/Articles/469652/
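
A minimal sketch of the generic BQL accounting pattern (not the usbnet patch
itself; all driver plumbing is omitted):

```
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static netdev_tx_t example_start_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	unsigned int len = skb->len;

	/* ... hand the skb to the hardware / USB layer here ... */

	netdev_sent_queue(dev, len);	/* account bytes now in flight */
	return NETDEV_TX_OK;
}

static void example_tx_complete(struct net_device *dev, unsigned int pkts,
				unsigned int bytes)
{
	/* lets BQL shrink or grow the queue limit and wake the queue */
	netdev_completed_queue(dev, pkts, bytes);
}
```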

Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106175615.26948-1-simon.schippers@tu-dortmund.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:28:38 -08:00
Breno Leitao
23c52b58cc tg3: Fix num of RX queues being reported by ethtool
Using num_online_cpus() to report number of queues is actually not
correct, as reported by Michael[1].

netif_get_num_default_rss_queues() was used to replace num_online_cpus()
in the past, but tg3 ethtool callbacks didn't get converted. Doing it
now.

Link: https://lore.kernel.org/all/CACKFLim7ruspmqvjr6bNRq5Z_XXVk3vVaLZOons7kMCzsEG23A@mail.gmail.com/#t [1]
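
A sketch of the replacement described above (the callback body is
illustrative, not the tg3 code):

```
#include <linux/ethtool.h>
#include <linux/netdevice.h>

static void example_get_channels(struct net_device *dev,
				 struct ethtool_channels *ch)
{
	/* report a sane RSS default instead of the number of online CPUs */
	ch->max_rx = netif_get_num_default_rss_queues();
}
```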

Signed-off-by: Breno Leitao <leitao@debian.org>
Suggested-by: Michael Chan <michael.chan@broadcom.com>
Reviewed-by: Michael Chan <michael.chan@broadcom.com>
Link: https://patch.msgid.link/20251107-tg3_counts-v1-1-337fe5c8ccb7@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:11:32 -08:00
Jakub Kicinski
f5d8ec838b Merge branch 'net-dsa-b53-add-support-for-bcm5389-97-98-and-bcm63xx-arl-formats'
Jonas Gorski says:

====================
net: dsa: b53: add support for BCM5389/97/98 and BCM63XX ARL formats

Currently b53 assumes that all switches apart from BCM5325/5365 use the
same ARL formats, but there are actually multiple formats in use.

Older switches use a format apparently introduced with BCM5387/BCM5389,
while newer chips use a format apparently introduced with BCM5395.

Note that these numbers are not linear: BCM5397/BCM5398 use the older
format.

In addition to that, the switches integrated into BCM63XX SoCs use their
own format. When accessing these, the normal read/write ARL entries use
the same format as the BCM5389 one, but the search format is different.

So in order to support all these different formats, split all code
accessing these entries into chip-family specific functions, and collect
them in appropriate ARL ops structs to keep the code cleaner.

Sent as net-next since the ARL accesses have never worked before, and
the extensive refactoring might be too much to warrant a fix.
====================

Link: https://patch.msgid.link/20251107080749.26936-1-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:11:10 -08:00
Jonas Gorski
2b3013ac03 net: dsa: b53: add support for bcm63xx ARL entry format
The ARL registers of BCM63XX embedded switches are somewhat unique. The
normal ARL table access registers have the same format as BCM5389, but
the ARL search registers differ:

* SRCH_CTL is at the same offset as on BCM5389, but 16 bits wide. It does
  not have more fields, just needs to be accessed by a 16 bit read.
* SRCH_RSLT_MACVID and SRCH_RSLT are aligned to 32 bit, and have shifted
  offsets.
* SRCH_RSLT has a different format than the normal ARL data entry
  register.
* There is only one set of ENTRY_N registers, implying a 1 bin layout.

So add appropriate ops for bcm63xx and let it use it.

Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251107080749.26936-9-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:11:07 -08:00
Jonas Gorski
300f78e8b6 net: dsa: b53: add support for 5389/5397/5398 ARL entry format
BCM5389, BCM5397 and BCM5398 use a different ARL entry format with just
a 16 bit fwdentry register, as well as different search control and data
offsets.

So add appropriate ops for them and switch those chips to use them.

Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251107080749.26936-8-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:11:07 -08:00
Jonas Gorski
a7e73339ad net: dsa: b53: move ARL entry functions into ops struct
Now that the differences in ARL entry formats are neatly contained into
functions per chip family, wrap them into an ops struct and add wrapper
functions to access them.
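
A hypothetical, simplified sketch of the ops-struct pattern (the types,
members and names here are invented; the real b53 structures differ):

```
#include <linux/types.h>

struct example_arl_entry { u64 mac_vid; u16 fwd; };

struct example_dev;

struct example_arl_ops {
	void (*read_entry)(struct example_dev *dev, unsigned int bin,
			   struct example_arl_entry *ent);
	void (*write_entry)(struct example_dev *dev, unsigned int bin,
			    const struct example_arl_entry *ent);
};

struct example_dev {
	const struct example_arl_ops *arl_ops;	/* selected per chip family */
};

/* Wrappers hide which ARL format the chip uses from the callers. */
static inline void example_arl_read(struct example_dev *dev, unsigned int bin,
				    struct example_arl_entry *ent)
{
	dev->arl_ops->read_entry(dev, bin, ent);
}
```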

Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251107080749.26936-7-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:11:07 -08:00
Jonas Gorski
e0c476f325 net: dsa: b53: split reading search entry into their own functions
Split reading search entries into a function for each format.

Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251107080749.26936-6-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:11:07 -08:00
Jonas Gorski
1716be6db0 net: dsa: b53: provide accessors for accessing ARL_SRCH_CTL
In order to more easily support more formats, move accessing
ARL_SRCH_CTL into helper functions to contain the differences.

Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251107080749.26936-5-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:11:07 -08:00
Jonas Gorski
bf6e9d2ae1 net: dsa: b53: move writing ARL entries into their own functions
Move writing ARL entries into individual functions for each format.

Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251107080749.26936-4-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:11:06 -08:00
Jonas Gorski
4a291fe722 net: dsa: b53: move reading ARL entries into their own function
Instead of duplicating the whole code iterating over all bins for
BCM5325, factor out reading and parsing the entry into its own
functions, and name the modern variant after the first chip with that
ARL format, (BCM53)95.

Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251107080749.26936-3-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:11:06 -08:00
Jonas Gorski
a6e4fd38bf net: dsa: b53: b53_arl_read{,25}(): use the entry for comparision
Align the b53_arl_read{,25}() functions by consistently using the
parsed arl entry instead of parsing the raw registers again.

Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251107080749.26936-2-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 17:11:06 -08:00
Jakub Kicinski
7fc2bf8d30 Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Martin KaFai Lau says:

====================
pull-request: bpf-next 2025-11-10

We've added 19 non-merge commits during the last 3 day(s) which contain
a total of 22 files changed, 1345 insertions(+), 197 deletions(-).

The main changes are:

1) Preserve skb metadata after a TC BPF program has changed the skb,
   from Jakub Sitnicki.
   This allows a TC program at the end of a TC filter chain to still see
   the skb metadata, even if another TC program at the front of the chain
   has changed the skb using BPF helpers.

2) Initial af_smc bpf_struct_ops support to control the smc specific
   syn/synack options, from D. Wythe.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
  bpf/selftests: Add selftest for bpf_smc_hs_ctrl
  net/smc: bpf: Introduce generic hook for handshake flow
  bpf: Export necessary symbols for modules with struct_ops
  selftests/bpf: Cover skb metadata access after bpf_skb_change_proto
  selftests/bpf: Cover skb metadata access after change_head/tail helper
  selftests/bpf: Cover skb metadata access after bpf_skb_adjust_room
  selftests/bpf: Cover skb metadata access after vlan push/pop helper
  selftests/bpf: Expect unclone to preserve skb metadata
  selftests/bpf: Dump skb metadata on verification failure
  selftests/bpf: Verify skb metadata in BPF instead of userspace
  bpf: Make bpf_skb_change_head helper metadata-safe
  bpf: Make bpf_skb_change_proto helper metadata-safe
  bpf: Make bpf_skb_adjust_room metadata-safe
  bpf: Make bpf_skb_vlan_push helper metadata-safe
  bpf: Make bpf_skb_vlan_pop helper metadata-safe
  vlan: Make vlan_remove_tag return nothing
  bpf: Unclone skb head on bpf_dynptr_write to skb metadata
  net: Preserve metadata on pskb_expand_head
  net: Helper to move packet data and metadata after skb_push/pull
====================

Link: https://patch.msgid.link/20251110232427.3929291-1-martin.lau@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 16:43:51 -08:00
Niklas Söderlund
38f073a71e net: ravb: Correct bad check of timestamp control flags
When converting the Renesas network drivers to use flags from enum
hwtstamp_rx_filters to control when to timestamp packets, instead of a
driver-specific scheme with bit-wise flags, an error was made.

The correct logic for setting get_ts with the bit-wise driver-specific
flags was:

  q: RAVB_BE + tstamp_rx_ctrl: 0 => 0
  q: RAVB_NC + tstamp_rx_ctrl: 0 => 0
  q: RAVB_BE + tstamp_rx_ctrl: RAVB_RXTSTAMP_TYPE_V2_L2_EVENT => 0
  q: RAVB_NC + tstamp_rx_ctrl: RAVB_RXTSTAMP_TYPE_V2_L2_EVENT => 1
  q: RAVB_BE + tstamp_rx_ctrl: RAVB_RXTSTAMP_TYPE_ALL => 1
  q: RAVB_NC + tstamp_rx_ctrl: RAVB_RXTSTAMP_TYPE_ALL => 1

The converted logic to use enum flags mapped tstamp_rx_ctrl as

  0 to HWTSTAMP_FILTER_NONE
  RAVB_RXTSTAMP_TYPE_V2_L2_EVENT to HWTSTAMP_FILTER_PTP_V2_L2_EVENT
  RAVB_RXTSTAMP_TYPE_ALL to HWTSTAMP_FILTER_ALL

But the logic was incorrectly changed to:

  q: RAVB_BE + tstamp_rx_ctrl: HWTSTAMP_FILTER_NONE => 1 (error)
  q: RAVB_NC + tstamp_rx_ctrl: HWTSTAMP_FILTER_NONE => 0
  q: RAVB_BE + tstamp_rx_ctrl: HWTSTAMP_FILTER_PTP_V2_L2_EVENT => 0
  q: RAVB_NC + tstamp_rx_ctrl: HWTSTAMP_FILTER_PTP_V2_L2_EVENT => 1
  q: RAVB_BE + tstamp_rx_ctrl: HWTSTAMP_FILTER_ALL => 1
  q: RAVB_NC + tstamp_rx_ctrl: HWTSTAMP_FILTER_ALL => 0 (error)

This change restores the converted flag check to the correct logic of
the bit-wise driver specific flags.
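
As a sketch, the intended truth table from above could be expressed like this
(the queue constants and field name are taken from the commit message, not
from the driver source):

```
#include <linux/net_tstamp.h>
#include <linux/types.h>

enum { RAVB_BE, RAVB_NC };	/* per the tables above */

static bool ravb_should_timestamp(int q, int tstamp_rx_ctrl)
{
	if (q == RAVB_NC)
		return tstamp_rx_ctrl != HWTSTAMP_FILTER_NONE;
	/* RAVB_BE: only timestamp when all packets are requested */
	return tstamp_rx_ctrl == HWTSTAMP_FILTER_ALL;
}
```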

Reported-by: Simon Horman <horms@kernel.org>
Closes: https://lore.kernel.org/linux-renesas-soc/aQ4xSv9629XF-Bt3@horms.kernel.org/
Fixes: 16e2e6cf75 ("net: ravb: Use common defines for time stamping control")
Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://patch.msgid.link/20251107200100.3637869-1-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 16:26:31 -08:00
Zhongqiu Han
5b9192c2c0 ptp: ocp: Document sysfs output format for backward compatibility
Add a comment to ptp_ocp_tty_show() explaining that the sysfs output
intentionally does not include a trailing newline. This is required for
backward compatibility with existing userspace software that reads the
sysfs attribute and uses the value directly as a device path.

A previous attempt to add a newline to align with common kernel
conventions broke userspace applications that were opening device paths
like "/dev/ttyS4\n" instead of "/dev/ttyS4", resulting in ENOENT errors.

This comment prevents future attempts to "fix" this behavior, which would
break existing userspace applications.
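
A sketch of a sysfs show() callback following this convention (the function
name and the literal path are illustrative):

```
#include <linux/device.h>
#include <linux/sysfs.h>

static ssize_t example_tty_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	/*
	 * Intentionally no trailing "\n": userspace uses the value verbatim
	 * as a device path, and "/dev/ttyS4\n" would fail to open.
	 */
	return sysfs_emit(buf, "%s", "/dev/ttyS4");
}
```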

Link: https://lore.kernel.org/netdev/20251030124519.1828058-1-zhongqiu.han@oss.qualcomm.com/
Link: https://lore.kernel.org/netdev/aef3b850-5f38-4c28-a018-3b0006dc2f08@linux.dev/
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Zhongqiu Han <zhongqiu.han@oss.qualcomm.com>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251107074533.416048-1-zhongqiu.han@oss.qualcomm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 16:23:13 -08:00
Kuniyuki Iwashima
73edb26b06 sctp: Don't inherit do_auto_asconf in sctp_clone_sock().
syzbot reported list_del(&sp->auto_asconf_list) corruption
in sctp_destroy_sock().

The repro calls setsockopt(SCTP_AUTO_ASCONF, 1) to a SCTP
listener, calls accept(), and close()s the child socket.

setsockopt(SCTP_AUTO_ASCONF, 1) sets sp->do_auto_asconf
to 1 and links sp->auto_asconf_list to a per-netns list.

Both fields are placed after sp->pd_lobby in struct sctp_sock,
and sctp_copy_descendant() did not copy the fields before the
cited commit.

Also, sctp_clone_sock() did not set them explicitly.

In addition, sctp_auto_asconf_init() is called from
sctp_sock_migrate(), but it initialises the fields only
conditionally.

The two fields relied on __GFP_ZERO added in sk_alloc(),
but sk_clone() does not use it.

Let's clear newsp->do_auto_asconf in sctp_clone_sock().

[0]:
list_del corruption. prev->next should be ffff8880799e9148, but was ffff8880799e8808. (prev=ffff88803347d9f8)
kernel BUG at lib/list_debug.c:64!
Oops: invalid opcode: 0000 [#1] SMP KASAN PTI
CPU: 0 UID: 0 PID: 6008 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/02/2025
RIP: 0010:__list_del_entry_valid_or_report+0x15a/0x190 lib/list_debug.c:62
Code: e8 7b 26 71 fd 43 80 3c 2c 00 74 08 4c 89 ff e8 7c ee 92 fd 49 8b 17 48 c7 c7 80 0a bf 8b 48 89 de 4c 89 f9 e8 07 c6 94 fc 90 <0f> 0b 4c 89 f7 e8 4c 26 71 fd 43 80 3c 2c 00 74 08 4c 89 ff e8 4d
RSP: 0018:ffffc90003067ad8 EFLAGS: 00010246
RAX: 000000000000006d RBX: ffff8880799e9148 RCX: b056988859ee6e00
RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000
RBP: dffffc0000000000 R08: ffffc90003067807 R09: 1ffff9200060cf00
R10: dffffc0000000000 R11: fffff5200060cf01 R12: 1ffff1100668fb3f
R13: dffffc0000000000 R14: ffff88803347d9f8 R15: ffff88803347d9f8
FS:  00005555823e5500(0000) GS:ffff88812613e000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000200000000480 CR3: 00000000741ce000 CR4: 00000000003526f0
Call Trace:
 <TASK>
 __list_del_entry_valid include/linux/list.h:132 [inline]
 __list_del_entry include/linux/list.h:223 [inline]
 list_del include/linux/list.h:237 [inline]
 sctp_destroy_sock+0xb4/0x370 net/sctp/socket.c:5163
 sk_common_release+0x75/0x310 net/core/sock.c:3961
 sctp_close+0x77e/0x900 net/sctp/socket.c:1550
 inet_release+0x144/0x190 net/ipv4/af_inet.c:437
 __sock_release net/socket.c:662 [inline]
 sock_close+0xc3/0x240 net/socket.c:1455
 __fput+0x44c/0xa70 fs/file_table.c:468
 task_work_run+0x1d4/0x260 kernel/task_work.c:227
 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
 exit_to_user_mode_loop+0xe9/0x130 kernel/entry/common.c:43
 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline]
 syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline]
 syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline]
 do_syscall_64+0x2bd/0xfa0 arch/x86/entry/syscall_64.c:100
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Fixes: 16942cf4d3 ("sctp: Use sk_clone() in sctp_accept().")
Reported-by: syzbot+ba535cb417f106327741@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/690d2185.a70a0220.22f260.000e.GAE@google.com/
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251106223418.1455510-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-10 16:22:09 -08:00
Martin KaFai Lau
67f4cfb530 Merge branch 'net-smc-introduce-smc_hs_ctrl'
D. Wythe says:

====================
net/smc: Introduce smc_hs_ctrl

This patch aims to introduce BPF injection capabilities for SMC and
includes a self-test to ensure code stability.

Since the SMC protocol isn't ideal for every situation, especially
short-lived ones, most applications can't guarantee the absence of
such scenarios. Consequently, applications may need specific strategies
to decide whether to use SMC. For example, an application might limit SMC
usage to certain IP addresses or ports.

To maintain the principle of transparent replacement, we want applications
to remain unaffected even if they need specific SMC strategies. In other
words, they should not require recompilation of their code.

Additionally, we need to ensure the scalability of strategy implementation.
While using socket options or sysctl might be straightforward, it could
complicate future expansions.

Fortunately, BPF addresses these concerns effectively. Users can write
their own strategies in eBPF to determine whether to use SMC, and they can
easily modify those strategies in the future.

This is a rework of the series from [1]. Changes since [1] are limited to
the SMC parts:

1. Rename smc_ops to smc_hs_ctrl and change interface name.
2. Squash SMC patches, removing standalone non-BPF hook capability.
3. Fix typos

[1]: https://lore.kernel.org/bpf/20250123015942.94810-1-alibuda@linux.alibaba.com/#t

v2 -> v1:
  - Removed the fixes patch, which have already been merged on current branch.
  - Fixed compilation warning of smc_call_hsbpf() when CONFIG_SMC_HS_CTRL_BPF
    is not enabled.
  - Changed the default value of CONFIG_SMC_HS_CTRL_BPF to Y.
  - Fix typo and renamed some variables

v3 -> v2:
  - Removed the libbpf patch, which have already been merged on current branch.
  - Fixed sparse warning of smc_call_hsbpf() and xchg().

v4 -> v3:
   - Rebased on latest bpf-next, updated SMC loopback config from SMC_LO to DIBS_LO
     per upstream changes.

v5 -> v4:
    - Removed the redundant sk parameter from smc_call_hsbpf
    - Reject registration when bpf_link is set, link support will be added in the
      future.
    - Updated selftests with new test helpers.
====================

Link: https://patch.msgid.link/20251107035632.115950-1-alibuda@linux.alibaba.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2025-11-10 12:00:47 -08:00
D. Wythe
beb3c67297 bpf/selftests: Add selftest for bpf_smc_hs_ctrl
This tests introduces a tiny smc_hs_ctrl for filtering SMC connections
based on IP pairs, and also adds a realistic topology model to verify it.

Also, we can only use SMC loopback under CI test, so an additional
configuration needs to be enabled.

Follow the steps below to run this test.

make -C tools/testing/selftests/bpf
cd tools/testing/selftests/bpf
sudo ./test_progs -t smc

Results shows:
Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Tested-by: Saket Kumar Bhaskar <skb99@linux.ibm.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Link: https://patch.msgid.link/20251107035632.115950-4-alibuda@linux.alibaba.com
2025-11-10 12:00:45 -08:00
D. Wythe
15f295f556 net/smc: bpf: Introduce generic hook for handshake flow
The introduction of IPPROTO_SMC enables eBPF programs to determine
whether to use SMC based on the context of socket creation, such as
network namespaces, PID and comm name, etc.

As a subsequent enhancement, introduce a new generic hook that allows
deciding at runtime whether to use SMC, based on, but not limited to,
the local/remote IP address or port.

Users can now write their own implementation via bpf_struct_ops to choose
whether to use SMC before the TCP 3-way handshake is completed.

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Link: https://patch.msgid.link/20251107035632.115950-3-alibuda@linux.alibaba.com
2025-11-10 11:19:41 -08:00
D. Wythe
07c428ece3 bpf: Export necessary symbols for modules with struct_ops
Export three symbols needed for implementing struct_ops in a tristate
subsystem.

They allow holding or releasing the struct_ops refcount via the inline
helpers bpf_try_module_get() and bpf_module_put(), which conditionally use
bpf_struct_ops_get()/bpf_struct_ops_put().

They also allow copying an object name from one buffer to another with
proper validation via bpf_obj_name_cpy().

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251107035632.115950-2-alibuda@linux.alibaba.com
2025-11-10 11:07:34 -08:00
Martin KaFai Lau
abd0c0f6aa Merge branch 'make-tc-bpf-helpers-preserve-skb-metadata'
Jakub Sitnicki says:

====================
Make TC BPF helpers preserve skb metadata

Changes in v4:
- Fix copy-paste bug in check_metadata() test helper (AI review)
- Add "out of scope" section (at the bottom)
- Link to v3: https://lore.kernel.org/r/20251026-skb-meta-rx-path-v3-0-37cceebb95d3@cloudflare.com

Changes in v3:
- Use the already existing BPF_STREAM_STDERR const in tests (Martin)
- Unclone skb head on bpf_dynptr_write to skb metadata (patch 3) (Martin)
- Swap order of patches 1 & 2 to refer to skb_postpush_data_move() in docs
- Mention in skb_data_move() docs how to move just the metadata
- Note in pskb_expand_head() docs to move metadata after skb_push() (Jakub)
- Link to v2: https://lore.kernel.org/r/20251019-skb-meta-rx-path-v2-0-f9a58f3eb6d6@cloudflare.com

Changes in v2:
- Tweak WARN_ON_ONCE check in skb_data_move() (patch 2)
- Convert all tests to verify skb metadata in BPF (patches 9-10)
- Add test coverage for modified BPF helpers (patches 12-15)
- Link to RFCv1: https://lore.kernel.org/r/20250929-skb-meta-rx-path-v1-0-de700a7ab1cb@cloudflare.com

This patch set continues our work [1] to allow BPF programs and user-space
applications to attach multiple bytes of metadata to packets via the
XDP/skb metadata area.

The focus of this patch set it to ensure that skb metadata remains intact
when packets pass through a chain of TC BPF programs that call helpers
which operate on skb head.

Currently, several helpers that either adjust the skb->data pointer or
reallocate skb->head do not preserve metadata at its expected location,
that is immediately in front of the MAC header. These are:

- bpf_skb_adjust_room
- bpf_skb_change_head
- bpf_skb_change_proto
- bpf_skb_change_tail
- bpf_skb_vlan_pop
- bpf_skb_vlan_push

In TC BPF context, metadata must be moved whenever skb->data changes to
keep the skb->data_meta pointer valid. I don't see any way around
it. Creative ideas on how to avoid that would be very welcome.

With that in mind, we can patch the helpers in at least two different ways:

1. Integrate metadata move into header move

   Replace the existing memmove, which follows skb_push/pull, with a helper
   that moves both headers and metadata in a single call. This avoids an
   extra memmove but reduces transparency.

        skb_pull(skb, len);
-       memmove(skb->data, skb->data - len, n);
+       skb_postpull_data_move(skb, len, n);
        skb->mac_header += len;

        skb_push(skb, len)
-       memmove(skb->data, skb->data + len, n);
+       skb_postpush_data_move(skb, len, n);
        skb->mac_header -= len;

2. Move metadata separately

   Add a dedicated metadata move after the header move. This is more
   explicit but costs an additional memmove.

        skb_pull(skb, len);
        memmove(skb->data, skb->data - len, n);
+       skb_metadata_postpull_move(skb, len);
        skb->mac_header += len;

        skb_push(skb, len)
+       skb_metadata_postpush_move(skb, len);
        memmove(skb->data, skb->data + len, n);
        skb->mac_header -= len;

This patch set implements option (1), expecting that "you can have just one
memmove" will be the most obvious feedback, while readability is a somewhat
subjective matter of taste, which I don't claim to have ;-)

The structure of the patch set is as follows:

- patches 1-4 prepare ground for safe-proofing the BPF helpers
- patches 5-9 modify the BPF helpers to preserve skb metadata
- patches 10-11 prepare ground for metadata tests with BPF helper calls
- patches 12-16 adapt and expand tests to cover the made changes

Out of scope for this series:
- safe-proofing tunnel & tagging devices - VLAN, GRE, ...
  (next in line, in development preview at [2])
- metadata access after packet forwarding
  (to do after Rx path - once metadata reliably reaches sk_filter)

Thanks,
-jkbs

[1] https://lore.kernel.org/all/20250814-skb-metadata-thru-dynptr-v7-0-8a39e636e0fb@cloudflare.com/
[2] https://github.com/jsitnicki/linux/commits/skb-meta/safeproof-netdevs/
====================

Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-0-5ceb08a9b37b@cloudflare.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2025-11-10 10:52:34 -08:00
Jakub Sitnicki
d2c5cca3fb selftests/bpf: Cover skb metadata access after bpf_skb_change_proto
Add a test to verify that skb metadata remains accessible after calling
bpf_skb_change_proto(), which modifies packet headroom to accommodate
different IP header sizes.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-16-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:33 -08:00
Jakub Sitnicki
85d454afef selftests/bpf: Cover skb metadata access after change_head/tail helper
Add a test to verify that skb metadata remains accessible after calling
bpf_skb_change_head() and bpf_skb_change_tail(), which modify packet
headroom/tailroom and can trigger head reallocation.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-15-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:33 -08:00
Jakub Sitnicki
29960e635b selftests/bpf: Cover skb metadata access after bpf_skb_adjust_room
Add a test to verify that skb metadata remains accessible after calling
bpf_skb_adjust_room(), which modifies the packet headroom and can trigger
head reallocation.

The helper expects an Ethernet frame carrying an IP packet, so switch test
packet identification to the source MAC address, since we can no longer
rely on the Ethernet proto being set to zero.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-14-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:33 -08:00
Jakub Sitnicki
354d020c29 selftests/bpf: Cover skb metadata access after vlan push/pop helper
Add a test to verify that skb metadata remains accessible after calling
bpf_skb_vlan_push() and bpf_skb_vlan_pop(), which modify the packet
headroom.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-13-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:32 -08:00
Jakub Sitnicki
1e1357fde8 selftests/bpf: Expect unclone to preserve skb metadata
Since pskb_expand_head() no longer clears metadata on unclone, update tests
for cloned packets to expect metadata to remain intact.

Also simplify the clone_dynptr_kept_on_{data,meta}_slice_write tests.
Creating an r/w dynptr slice is sufficient to trigger an unclone in the
prologue, so remove the extraneous writes to the data/meta slice.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-12-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:32 -08:00
Jakub Sitnicki
9ef9ac15a5 selftests/bpf: Dump skb metadata on verification failure
Add diagnostic output when metadata verification fails to help with
troubleshooting test failures. Introduce a check_metadata() helper that
prints both expected and received metadata to the BPF program's stderr
stream on mismatch. The userspace test reads and dumps this stream on
failure.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-11-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:32 -08:00
Jakub Sitnicki
967534e57c selftests/bpf: Verify skb metadata in BPF instead of userspace
Move metadata verification into the BPF TC programs. Previously,
userspace read metadata from a map and verified it once at test end.

Now TC programs compare metadata directly using __builtin_memcmp() and
set a test_pass flag. This enables verification at multiple points during
test execution rather than a single final check.
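A minimal sketch of that pattern (the metadata size, section, and
variable names here are illustrative, not the selftest's actual layout):

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include <bpf/bpf_helpers.h>

	#define META_SIZE 8

	__u8 expected_meta[META_SIZE];	/* filled in by userspace */
	__u8 test_pass;

	SEC("tc")
	int check_meta(struct __sk_buff *skb)
	{
		__u8 *meta = (void *)(long)skb->data_meta;
		__u8 *data = (void *)(long)skb->data;

		/* The verifier requires an explicit bounds check on the
		 * metadata area before it may be dereferenced. */
		if (meta + META_SIZE > data)
			return TC_ACT_OK;

		if (!__builtin_memcmp(meta, expected_meta, META_SIZE))
			test_pass = 1;

		return TC_ACT_OK;
	}

	char _license[] SEC("license") = "GPL";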

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-10-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:32 -08:00
Jakub Sitnicki
fb206fc312 bpf: Make bpf_skb_change_head helper metadata-safe
Although bpf_skb_change_head() doesn't move packet data after skb_push(),
skb metadata still needs to be relocated. Use the dedicated helper to
handle it.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-9-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:32 -08:00
Jakub Sitnicki
8cfc172ce2 bpf: Make bpf_skb_change_proto helper metadata-safe
bpf_skb_change_proto reuses the same headroom operations as
bpf_skb_adjust_room, already updated to handle metadata safely.

The remaining step is to ensure that there is sufficient headroom to
accommodate metadata on skb_push().

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-8-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:32 -08:00
Jakub Sitnicki
be83105d38 bpf: Make bpf_skb_adjust_room metadata-safe
bpf_skb_adjust_room() may push or pull bytes from skb->data. In both cases,
skb metadata must be moved accordingly to stay accessible.

Replace existing memmove() calls, which only move payload, with a helper
that also handles metadata. Reserve enough space for metadata to fit after
skb_push.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-7-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:32 -08:00
Jakub Sitnicki
55ffc98b44 bpf: Make bpf_skb_vlan_push helper metadata-safe
Use the metadata-aware helper to move packet bytes after skb_push(),
ensuring metadata remains valid after calling the BPF helper.

Also, take care to reserve sufficient headroom for metadata to fit.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-6-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:31 -08:00
Jakub Sitnicki
efd35c2623 bpf: Make bpf_skb_vlan_pop helper metadata-safe
Use the metadata-aware helper to move packet bytes after skb_pull(),
ensuring metadata remains valid after calling the BPF helper.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-5-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:31 -08:00
Jakub Sitnicki
b85be58e2f vlan: Make vlan_remove_tag return nothing
All callers ignore the return value.

Prepare to reorder memmove() after skb_pull() which is a common pattern.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-4-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:31 -08:00
Jakub Sitnicki
f38499ff45 bpf: Unclone skb head on bpf_dynptr_write to skb metadata
Currently bpf_dynptr_from_skb_meta() marks the dynptr as read-only when
the skb is cloned, preventing writes to metadata.

Remove this restriction and unclone the skb head on bpf_dynptr_write() to
metadata, now that the metadata is preserved during uncloning. This makes
metadata dynptr consistent with skb dynptr, allowing writes regardless of
whether the skb is cloned.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-3-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:31 -08:00
Jakub Sitnicki
290fc0be09 net: Preserve metadata on pskb_expand_head
pskb_expand_head() copies headroom, including skb metadata, into the newly
allocated head, but then clears the metadata. As a result, metadata is lost
when BPF helpers trigger an skb head reallocation.

Let the skb metadata remain in the newly created copy of head.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-2-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:31 -08:00
Jakub Sitnicki
8989d328df net: Helper to move packet data and metadata after skb_push/pull
Lay groundwork for fixing BPF helpers available to TC(X) programs.

When skb_push() or skb_pull() is called in a TC(X) ingress BPF program, the
skb metadata must be kept in front of the MAC header. Otherwise, BPF
programs using the __sk_buff->data_meta pseudo-pointer lose access to it.

Introduce a helper that moves both metadata and a specified number of
packet data bytes together, suitable as a drop-in replacement for
memmove().
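A simplified sketch of what such a drop-in replacement can look like for
the post-pull case (an illustration of the idea, not necessarily the
exact helper added here):

	/* Move 'n' bytes of headers into the pulled position, then move
	 * the metadata so it stays immediately in front of the moved
	 * headers. */
	static void skb_postpull_data_move(struct sk_buff *skb,
					   unsigned int len, unsigned int n)
	{
		u8 meta = skb_metadata_len(skb);

		/* same header move the old memmove() did after skb_pull() */
		memmove(skb->data, skb->data - len, n);
		if (meta)
			memmove(skb->data - meta,
				skb->data - len - meta, meta);
	}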

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-1-5ceb08a9b37b@cloudflare.com
2025-11-10 10:52:31 -08:00
Konstantin Komarov
f35590ee26 fs/ntfs3: remove ntfs_bio_pages and use page cache for compressed I/O
Replace the use of ntfs_bio_pages with the disk page cache for reading and
writing compressed files. This slightly improves performance when reading
compressed data and simplifies the I/O logic.

When an XPRESS or LZX compressed file is opened for writing, it is now
decompressed into a normal file before modification. A new argument (`int copy`)
is added to ni_read_frame() to handle writing of decompressed and mapped data.

Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-10 14:30:16 +01:00
Lizhi Xu
c3856bb499 ntfs3: avoid memcpy size warning
There are more entries after the structure, use unsafe_memcpy() to avoid
this warning.

syzbot reported:
memcpy: detected field-spanning write (size 3656) of single field "hdr1" at fs/ntfs3/index.c:1927 (size 16)
Call Trace:
 indx_insert_entry+0x1a0/0x460 fs/ntfs3/index.c:1996
 ni_add_name+0x4dd/0x820 fs/ntfs3/frecord.c:2995
 ni_rename+0x98/0x170 fs/ntfs3/frecord.c:3026
 ntfs_rename+0xab9/0xf00 fs/ntfs3/namei.c:332

Reported-by: syzbot+3a1878433bc1cb97b42a@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=3a1878433bc1cb97b42a
Signed-off-by: Lizhi Xu <lizhi.xu@windriver.com>
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-10 14:30:16 +01:00
Nirbhay Sharma
5f33da04e6 fs/ntfs3: fix KMSAN uninit-value in ni_create_attr_list
The call to kmalloc() to allocate the attribute list buffer is given a
size of al_aligned(rs). This size can be larger than the data
subsequently copied into the buffer, leaving trailing bytes uninitialized.

This can trigger a KMSAN "uninit-value" warning if that memory is
later accessed.

Fix this by using kzalloc() instead, which ensures the entire
allocated buffer is zero-initialized, preventing the warning.
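The change amounts to the following (the variable and GFP flag at the
call site are assumed for illustration):

	-	le = kmalloc(al_aligned(rs), GFP_NOFS);
	+	le = kzalloc(al_aligned(rs), GFP_NOFS);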

Reported-by: syzbot+83c9dd5c0dcf6184fdbf@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=83c9dd5c0dcf6184fdbf
Signed-off-by: Nirbhay Sharma <nirbhay.lkd@gmail.com>
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-10 14:30:15 +01:00
Edward Adam Davis
be99c62ac7 ntfs3: init run lock for extend inode
After setting the inode mode of $Extend to a regular file, executing the
truncate system call enters the do_truncate() routine, triggering the
uninitialized run_lock error reported by syzbot.

Prior to patch 4e8011ffec, if the inode mode of $Extend was not set to
a regular file, the do_truncate() routine would not be entered.

Add the run_lock initialization when loading $Extend.
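Roughly, the fix boils down to initializing the semaphore when $Extend
is loaded (the exact call site in the superblock setup is assumed here):

	init_rwsem(&ni->file.run_lock);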

syzbot reported:
INFO: trying to register non-static key.
Call Trace:
 dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120
 assign_lock_key+0x133/0x150 kernel/locking/lockdep.c:984
 register_lock_class+0x105/0x320 kernel/locking/lockdep.c:1299
 __lock_acquire+0x99/0xd20 kernel/locking/lockdep.c:5112
 lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
 down_write+0x96/0x1f0 kernel/locking/rwsem.c:1590
 ntfs_set_size+0x140/0x200 fs/ntfs3/inode.c:860
 ntfs_extend+0x1d9/0x970 fs/ntfs3/file.c:387
 ntfs_setattr+0x2e8/0xbe0 fs/ntfs3/file.c:808

Fixes: 4e8011ffec ("ntfs3: pretend $Extend records as regular files")
Reported-by: syzbot+bdeb22a4b9a09ab9aa45@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=bdeb22a4b9a09ab9aa45
Tested-by: syzbot+bdeb22a4b9a09ab9aa45@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-10 14:30:14 +01:00
Pedro Demarchi Gomes
d1693a7d5a ntfs: set dummy blocksize to read boot_block when mounting
When mounting, sb->s_blocksize is used to read the boot_block without
being defined or validated. Set a dummy blocksize before attempting to
read the boot_block.

The issue can be triggered with the following syz reproducer:

  mkdirat(0xffffffffffffff9c, &(0x7f0000000080)='./file1\x00', 0x0)
  r4 = openat$nullb(0xffffffffffffff9c, &(0x7f0000000040), 0x121403, 0x0)
  ioctl$FS_IOC_SETFLAGS(r4, 0x40081271, &(0x7f0000000980)=0x4000)
  mount(&(0x7f0000000140)=@nullb, &(0x7f0000000040)='./cgroup\x00',
        &(0x7f0000000000)='ntfs3\x00', 0x2208004, 0x0)
  syz_clone(0x88200200, 0x0, 0x0, 0x0, 0x0, 0x0)

Here, the ioctl sets the bdev block size to 16384. During mount,
get_tree_bdev_flags() calls sb_set_blocksize(sb, block_size(bdev)),
but since block_size(bdev) > PAGE_SIZE, sb_set_blocksize() leaves
sb->s_blocksize at zero.

Later, ntfs_init_from_boot() attempts to read the boot_block while
sb->s_blocksize is still zero, which triggers the bug.
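The approach is roughly the following (the dummy value and error
handling are illustrative):

	/* Pick a small, valid dummy block size before reading the boot
	 * block; the bdev block size may exceed PAGE_SIZE, in which case
	 * sb_set_blocksize() left sb->s_blocksize at zero. */
	if (!sb_set_blocksize(sb, 512))
		return -EINVAL;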

Reported-by: syzbot+f4f84b57a01d6b8364ad@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=f4f84b57a01d6b8364ad
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
[almaz.alexandrovich@paragon-software.com: changed comment style, added
return value handling]
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-10 14:30:11 +01:00
Chien Wong
473235677a wifi: cfg80211: fix doc of struct key_params
The seq in struct key_params is for many ciphers, including CCMP, GCMP,
CMAC, GMAC. In addition to get_key(), it is also used when setting keys.

Signed-off-by: Chien Wong <m@xv97.com>
Link: https://patch.msgid.link/20251107142332.181308-1-m@xv97.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:39:14 +01:00
Johannes Berg
243d30fbb6 wifi: mac80211: remove unnecessary vlan NULL check
In a vlan iteration loop the vlan pointer cannot be NULL.
Remove the unnecessary check.

Link: https://patch.msgid.link/20251105161033.670b5a06296c.I24cb1a5338736ab0a8a24d6a492c259f894d09fb@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:39:08 +01:00
Johannes Berg
68eb1b791a wifi: mac80211: pass frame type to element parsing
This will be needed for UHR operation parsing. We already pass
whether or not the frame is an action frame; replace that with
the full type. Note this fixes a few cases where 'false' was
erroneously passed (mesh and TDLS) and removes
ieee802_11_parse_elems_crc() as it's unused.

Link: https://patch.msgid.link/20251105160810.a476d20a6e01.Ie659535f9357f2f9a3c73f8c059ccfc96bf93b54@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:39:02 +01:00
Johannes Berg
1fba157685 wifi: mac80211: remove "disabling VHT" message
This message always occurs with any AP that is simply HT/11n,
remove it.

Link: https://patch.msgid.link/20251105160538.f8b286cfba08.Ib91b5b09a3fff753341b0f7cb0df7f6913a4abe6@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:56 +01:00
Johannes Berg
1ce954c98b wifi: mac80211: add and use chanctx usage iteration
In preparation for NAN interfaces using multiple channel
contexts, add an iterator macro that iterates all users
of a given channel context.

The logic handling the bandwidth during reserved assign/reassign
in ieee80211_get_chanctx_max_required_bw() is a bit more
complicated and should be cleaned up, so it isn't converted yet.

Link: https://patch.msgid.link/20251105160431.5aaccc2f127d.I2b7fd0858a263916f43abab49c6d3cc0b5aa16ec@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:50 +01:00
Johannes Berg
52363af3a9 wifi: mac80211: simplify ieee80211_recalc_chanctx_min_def() API
ieee80211_recalc_chanctx_min_def() is used outside the chandef
code, but there it is (and should be) always called with NULL/false
for the last two arguments. Remove them, and create another level
of indirection for use inside the file.

Link: https://patch.msgid.link/20251105160431.33408844c392.I4f52298861780c17a27cd229609e8a3e29c8d740@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:50 +01:00
Johannes Berg
a1dc648aa7 wifi: mac80211: remove chanctx to link back-references
Each link can currently use a chanctx and have another one
reserved, and both of these are also tracked backwards in
the assigned_links and reserved_links lists. If we consider
that there aren't *that* many links, this duplicate book-
keeping isn't necessary.
(I think it used to be necessary before the wiphy locking
changes, when chanctx_mtx existed, because we couldn't do
any interface iterations while holding only chanctx_mtx.)

Additionally, for NAN, we're going to want to track which
chanctxs are in use by the (group of) NAN interfaces. For
those, links don't really make sense as such, so chanctxs
need to be assigned to a different data structure.

Thus, as a first step, remove those back-lists of users
(right now only links) of each channel context. This is a
very basic conversion, ieee80211_vif_use_reserved_switch()
should made to iterate smarter.

Link: https://patch.msgid.link/20251105160431.dbeea1c42e76.I8d273c407274e1c05a4778aa20b56a9f326e87a7@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:49 +01:00
Johannes Berg
29cc798e70 wifi: mac80211: make link iteration safe for 'break'
The current link iteration macros for_each_sdata_link() and
for_each_sdata_link_rcu() are various nested for loops, but
because they iterate all sdata instances and then all links
inside, using 'break' inside the iteration doesn't actually
break out of the whole iteration.

Make it work by tracking whether or not the inner iteration
(over links) actually completed. If it broke out, then, given how
list_for_each_entry() works, the outer loop still iterates all
sdata instances but won't descend into them.

Link: https://patch.msgid.link/20251105160431.c21956654fc0.I8d4739af061c44c57d172f19a15303a44ad1e596@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:49 +01:00
Johannes Berg
1a1cad924e wifi: mac80211: fix EHT typo
This is clearly EHT, not ETH, fix the typo.

Link: https://patch.msgid.link/20251105153958.12a04517f7ec.Idcf800817fa30605b1002c3d2287cad016e7aea7@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:37 +01:00
Johannes Berg
30b6089aad wifi: cfg80211: fix EHT typo
This is clearly EHT, not ETH, fix the typo.

Link: https://patch.msgid.link/20251105153958.e9d4af3b768e.I5f3378326837e3f62928a2f1fd3403f29cea069b@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:36 +01:00
Johannes Berg
60a3734192 wifi: ieee80211: split NAN definitions out
The ieee80211.h file has gotten very long, continue splitting
it by putting NAN definitions into a separate file. Note that
NAN isn't really even IEEE 802.11 but WFA.

Link: https://patch.msgid.link/20251105153843.8da0e796dda2.I7b2ce11220b70e8794019501eabbf8afbaf431a6@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:15 +01:00
Johannes Berg
fcd42b909b wifi: ieee80211: split P2P definitions out
The ieee80211.h file has gotten very long, continue splitting
it by putting P2P definitions into a separate file. Note that
P2P isn't really even IEEE 802.11 but WFA.

Link: https://patch.msgid.link/20251105153843.e47b2614e9d2.Id242f61da720e365f6b5d7a4a545fbbc2f1e92b4@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:15 +01:00
Johannes Berg
00105d7600 wifi: ieee80211: split S1G definitions out
The ieee80211.h file has gotten very long, continue splitting
it by putting S1G definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.82c0bddee6e3.Ic6646615286dad240b42e31e9d428c7e4ea40ce0@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:15 +01:00
Johannes Berg
86bc0c6623 wifi: ieee80211: split EHT definitions out
The ieee80211.h file has gotten very long, continue splitting
it by putting EHT definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.bf77fe169140.I691267e0edd914c604a5bfd447d33be00044c9b4@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:14 +01:00
Johannes Berg
02a2cf3025 wifi: ieee80211: split HE definitions out
The ieee80211.h file has gotten very long, continue splitting
it by putting HE definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.6998c0802104.I3dd7cfea6abbd118b999ecdedd48437d39cb0533@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:14 +01:00
Johannes Berg
7cb14da1d7 wifi: ieee80211: split VHT definitions out
The ieee80211.h file has gotten very long, continue splitting
it by putting VHT definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.c31cb771a250.I787a13064db7d80440101de3445be17881daf1b6@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:14 +01:00
Johannes Berg
fdc1c141f3 wifi: ieee80211: split HT definitions out
The ieee80211.h file has gotten very long, continue splitting
it by putting HT definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.7532471178d0.Id956a5433ad8658e4e5c0272dbcbb59587206142@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:14 +01:00
Johannes Berg
69674282fc wifi: ieee80211: split mesh definitions out
The ieee80211.h file has gotten very long, start splitting it
by putting mesh definitions into a separate file.

Link: https://patch.msgid.link/20251105153843.489713ca8b34.I3befb4bf6ace0315758a1794224ddd18c4652e32@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-10 10:38:14 +01:00
Matthew Wilcox (Oracle)
c3454ac036 gfs2: Use bio_add_folio_nofail()
As the label says, we've just allocated a new BIO so we know
we can add this folio to it.  We now have bio_add_folio_nofail()
for this purpose.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-11-09 20:19:34 +00:00
Patrisious Haddad
583b4fe1c1 net/mlx5: fs, set non default device per namespace
Add mlx5_fs_set_root_dev() function which swaps the root namespace
core device with another one for a given table_type.

It is intended for use only by RDMA_TRANSPORT tables in LAG
configurations, to allow tables to always be created through the LAG
master device during LAG. This is valid since, during LAG, the master
is allowed to manage the RDMA_TRANSPORT tables of its slaves.

In addition move the table_type enum to global include to allow its use
in a downstream patch in the RDMA driver.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-3-98bb707b5d57@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
2025-11-09 05:16:58 -05:00
Patrisious Haddad
3b848dec7e net/mlx5: fs, Add other_eswitch support for steering tables
Add other_eswitch support, which allows flow table creation above vports
that reside on different esw managers.

The new flag MLX5_FLOW_TABLE_OTHER_ESWITCH indicates if the
esw_owner_vhca_id attribute is supported.

Note that this is only supported if the Advanced-RDMA capability
rdma_transport_manager_other_eswitch is set, and it is the caller's
responsibility to check that.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-2-98bb707b5d57@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
2025-11-09 05:16:53 -05:00
Patrisious Haddad
6948417b3f net/mlx5: Add OTHER_ESWITCH HW capabilities
Add OTHER_ESWITCH capabilities, which include other_eswitch and
eswitch_owner_vhca_id, to all steering objects.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-1-98bb707b5d57@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
2025-11-09 05:16:47 -05:00
Yishai Hadas
2d838c11e1 net/mlx5: Add direct ST mode support for RDMA
Add support for direct ST mode where ST Table Location equals
PCI_TPH_LOC_NONE.

In that case, no steering table exists; the steering tag itself, taken
from the mkey, is used directly by SW, FW, and HW.

This enables RDMA users to use the current exposed APIs to work in
direct mode.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20251027-st-direct-mode-v1-2-e0ad953866b6@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
2025-11-09 05:13:34 -05:00
Yishai Hadas
7b8a8ec20c PCI/TPH: Expose pcie_tph_get_st_table_loc()
Expose pcie_tph_get_st_table_loc() so it can be used by drivers, as done
in the next patch of the series.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20251027-st-direct-mode-v1-1-e0ad953866b6@nvidia.com
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
2025-11-09 05:13:02 -05:00
Caleb Sander Mateos
4cda40dce9 block: clean up indentation in blk_rq_map_iter_init()
blk_rq_map_iter_init() has one line with 7 spaces of indentation and
another that mixes 1 tab and 8 spaces. Convert both to tabs.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-08 06:38:18 -07:00
Zheng Qixing
9517b82d8d nbd: defer config put in recv_work
There is a use-after-free issue in recv_work when running NBD_CLEAR_SOCK
and NBD_CMD_RECONFIGURE:
  nbd_genl_connect     // conf_ref=2 (connect and recv_work A)
  nbd_open	       // conf_ref=3
  recv_work A done     // conf_ref=2
  NBD_CLEAR_SOCK       // conf_ref=1
  nbd_genl_reconfigure // conf_ref=2 (trigger recv_work B)
  close nbd	       // conf_ref=1
  recv_work B
    config_put         // conf_ref=0
    atomic_dec(&config->recv_threads); -> UAF

Or only running NBD_CLEAR_SOCK:
  nbd_genl_connect   // conf_ref=2
  nbd_open 	     // conf_ref=3
  NBD_CLEAR_SOCK     // conf_ref=2
  close nbd
    nbd_release
      config_put     // conf_ref=1
  recv_work
    config_put 	     // conf_ref=0
    atomic_dec(&config->recv_threads); -> UAF

Commit 87aac3a80a ("nbd: call nbd_config_put() before notifying the
waiter") moved nbd_config_put() to run before waking up the waiter in
recv_work, in order to ensure that nbd_start_device_ioctl() would not
be woken up while nbd->task_recv was still uncleared.

However, in nbd_start_device_ioctl(), after being woken up it explicitly
calls flush_workqueue() to make sure all current works are finished.
Therefore, there is no need to move the config put ahead of the wakeup.

Move nbd_config_put() to the end of recv_work, so that the reference is
held for the whole lifetime of the worker thread. This makes sure the
config cannot be freed while recv_work is still running, even if clear
+ reconfigure interleave.

In addition, we don't need to worry about recv_work dropping the last
nbd reference via nbd_put() (which would cause a deadlock):

path A (netlink with NBD_CFLAG_DESTROY_ON_DISCONNECT):
  connect  // nbd_refs=1 (trigger recv_work)
  open nbd // nbd_refs=2
  NBD_CLEAR_SOCK
  close nbd
    nbd_release
      nbd_disconnect_and_put
        flush_workqueue // recv_work done
      nbd_config_put
        nbd_put // nbd_refs=1
      nbd_put // nbd_refs=0
        queue_work

path B (netlink without NBD_CFLAG_DESTROY_ON_DISCONNECT):
  connect  // nbd_refs=2 (trigger recv_work)
  open nbd // nbd_refs=3
  NBD_CLEAR_SOCK // conf_refs=2
  close nbd
    nbd_release
      nbd_config_put // conf_refs=1
      nbd_put // nbd_refs=2
  recv_work done // conf_refs=0, nbd_refs=1
  rmmod // nbd_refs=0

Reported-by: syzbot+56fbf4c7ddf65e95c7cc@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/6907edce.a70a0220.37351b.0014.GAE@google.com/T/
Fixes: 87aac3a80a ("nbd: make the config put is called before the notifying the waiter")
Depends-on: e2daec488c ("nbd: Fix hungtask when nbd_config_put")
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-08 06:37:54 -07:00
Huiwen He
a811db3919 md/raid5: remove redundant __GFP_NOWARN
The __GFP_NOWARN flag has been included in GFP_NOWAIT since commit
16f5dfbc85 ("gfp: include __GFP_NOWARN in GFP_NOWAIT"), so remove the
redundant __GFP_NOWARN flag.
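The cleanup amounts to changes of this shape (illustrative call site,
not the exact raid5 hunk):

	-	ptr = kmalloc(size, GFP_NOWAIT | __GFP_NOWARN);
	+	ptr = kmalloc(size, GFP_NOWAIT);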

Link: https://lore.kernel.org/linux-raid/20251102152540.871568-1-hehuiwen@kylinos.cn
Signed-off-by: Huiwen He <hehuiwen@kylinos.cn>
Reviewed-by: Li Nan <linan122@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-08 17:49:36 +08:00
Xiao Ni
90e3bb44c0 md: avoid repeated calls to del_gendisk
There is a use-after-free problem found by test case 23rdev-lifetime:

Oops: general protection fault, probably for non-canonical address 0xdead000000000122
RIP: 0010:bdi_unregister+0x4b/0x170
Call Trace:
 <TASK>
 __del_gendisk+0x356/0x3e0
 mddev_unlock+0x351/0x360
 rdev_attr_store+0x217/0x280
 kernfs_fop_write_iter+0x14a/0x210
 vfs_write+0x29e/0x550
 ksys_write+0x74/0xf0
 do_syscall_64+0xbb/0x380
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7ff5250a177e

The sequence is:
1. rdev remove path takes reconfig_mutex
2. rdev remove path releases reconfig_mutex in mddev_unlock
3. md stop calls do_md_stop and sets MD_DELETED
4. rdev remove path calls del_gendisk because MD_DELETED is set
5. md stop path releases reconfig_mutex and calls del_gendisk again

So there is a race condition to resolve. This patch adds a flag,
MD_DO_DELETE, to avoid it.
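One way to picture the guard (an illustrative sketch; only the flag name
comes from this patch, the call site and mechanism are assumed):

	/* Whichever path sets the flag first actually tears down the
	 * gendisk; the other path skips the second del_gendisk(). */
	if (!test_and_set_bit(MD_DO_DELETE, &mddev->flags))
		del_gendisk(mddev->gendisk);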

Link: https://lore.kernel.org/linux-raid/20251029063419.21700-1-xni@redhat.com
Fixes: 9e59d60976 ("md: call del_gendisk in control path")
Signed-off-by: Xiao Ni <xni@redhat.com>
Suggested-by: Yu Kuai <yukuai@fnnas.com>
Reviewed-by: Li Nan <linan122@huawei.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-08 17:49:22 +08:00
Chen Ni
46caa40534 md/md-llbitmap: Remove unneeded semicolon
Remove unnecessary semicolons reported by Coccinelle/coccicheck and the
semantic patch at scripts/coccinelle/misc/semicolon.cocci.

Link: https://lore.kernel.org/linux-raid/20250910091912.25624-1-nichen@iscas.ac.cn
Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-08 16:55:35 +08:00
John Garry
7fc8f632e6 md/md-linear: Enable atomic writes
All the infrastructure has already been plumbed to support this for
stacked devices, so just enable the request_queue limits features flag.

A note about chunk sectors for linear arrays:
While it is possible to set a chunk sectors param for building a linear
array, this is for specifying the granularity at which data sectors from
the device are used. It is not the same as a stripe size, like for RAID0.

As such, it is not appropriate to set the chunk_sectors request queue limit
to the same value, as the chunk_sectors limit is a boundary that requests
cannot straddle.

However, the request_queue limit max_hw_sectors is set to chunk sectors,
which has almost the same effect as setting the chunk_sectors limit.
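In queue-limits terms, the intent is roughly the following (field names
come from the block layer's struct queue_limits; the sketch is
illustrative, not the exact hunk):

	lim.features |= BLK_FEAT_ATOMIC_WRITES;
	/* Cap request size by the linear array's chunk size ... */
	lim.max_hw_sectors = mddev->chunk_sectors;
	/* ... but leave lim.chunk_sectors alone: that limit is a boundary
	 * requests may not straddle, which is wrong for a non-striped
	 * linear array. */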

Link: https://lore.kernel.org/linux-raid/20250903161052.3326176-1-john.g.garry@oracle.com
Signed-off-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Yu Kuai <yukuai3@fnnas.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-08 16:55:19 +08:00
Wu Guanghao
082d680faf Factor out code into md_should_do_recovery()
In md_check_recovery(), use the new helper to make the code cleaner.

Link: https://lore.kernel.org/linux-raid/e62894c8-d916-94bc-ef48-3c60e6e1fc5d@huawei.com
Signed-off-by: Wu Guanghao <wuguanghao3@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@fnnas.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-08 16:54:54 +08:00
Yun Zhou
0dc7620554 md: fix rcu protection in md_wakeup_thread
We attempted to use RCU to protect the pointer 'thread', but directly
passed the value when calling md_wakeup_thread(). This means that the
RCU pointer has been acquired before rcu_read_lock(), which renders
rcu_read_lock() ineffective and could lead to a use-after-free.
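The general shape of the correct pattern is shown below (an illustrative
sketch of the RCU rule; the wakeup details of md_thread are simplified
and this is not the exact md hunk):

	rcu_read_lock();
	t = rcu_dereference(mddev->thread);	/* load inside the RCU section */
	if (t)
		wake_up_process(t->tsk);
	rcu_read_unlock();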

Link: https://lore.kernel.org/linux-raid/20251015083227.1079009-1-yun.zhou@windriver.com
Fixes: 4469315439 ("md: protect md_thread with rcu")
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Reviewed-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-08 16:54:36 +08:00
Xiao Ni
cc394b94dc md: delete mddev kobj before deleting gendisk kobj
In the sync del_gendisk path, the gendisk is deleted first and the
/sys/block/md directory is removed; the mddev kobj is then released in
a delayed work. With debug logging enabled in sysfs_remove_group, the
message 'sysfs group bitmap not found for kobject md' can be seen. The
reason is that the parent kobj has already been deleted, so the parent
directory cannot be found.

In the creation path, the gendisk is allocated first and then the mddev
kobj is added, so the mddev kobj should be deleted before the gendisk.

Before commit 9e59d60976 ("md: call del_gendisk in control path"), the
mddev kobj was released first: if the kobj had not been deleted yet, the
cleanup was done and the kobj deleted; then del_gendisk was called and
the gendisk kobj released, so there was no need to call kobject_del for
the mddev kobj. After this patch, the sequence in the sync del_gendisk
path changes, so kobject_del must now be called to delete the mddev kobj.

After this patch, the sequence is:
1. kobject del mddev kobj
2. del_gendisk deletes gendisk kobj
3. mddev_delayed_delete releases mddev kobj
4. md_kobj_release releases gendisk kobj

Link: https://lore.kernel.org/linux-raid/20250928012424.61370-1-xni@redhat.com
Fixes: 9e59d60976 ("md: call del_gendisk in control path")
Signed-off-by: Xiao Ni <xni@redhat.com>
Reviewed-by: Li Nan <linan122@huawei.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
2025-11-08 16:53:55 +08:00
Yu Kuai
3d7b1dbaa0 MAINTAINERS: Update Yu Kuai's E-mail address
Change to my new email address on fnnas.com.

Link: https://lore.kernel.org/linux-raid/20251021122800.3158836-1-yukuai@fnnas.com
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
2025-11-08 16:53:29 +08:00
Jakub Kicinski
a0c3aefb08 Merge branch '40GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue
Tony Nguyen says:

====================
Intel Wired LAN Driver Updates 2025-11-06 (i40e, ice, iavf)

Mohammad Heib introduces a new devlink parameter, max_mac_per_vf, for
controlling the maximum number of MAC address filters allowed by a VF. This
allows administrators to control the VF behavior in a more nuanced manner.

Aleksandr and Przemek add support for Receive Side Scaling of GTP to iAVF
for VFs running on E800 series ice hardware. This improves performance and
scalability for virtualized network functions in 5G and LTE deployments.

* '40GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue:
  iavf: add RSS support for GTP protocol via ethtool
  ice: Extend PTYPE bitmap coverage for GTP encapsulated flows
  ice: improve TCAM priority handling for RSS profiles
  ice: implement GTP RSS context tracking and configuration
  ice: add virtchnl definitions and static data for GTP RSS
  ice: add flow parsing for GTP and new protocol field support
  i40e: support generic devlink param "max_mac_per_vf"
  devlink: Add new "max_mac_per_vf" generic device param
====================

Link: https://patch.msgid.link/20251106225321.1609605-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:15:36 -08:00
Jakub Kicinski
7c46332810 Merge branch 'net-stmmac-lpc18xx-and-sti-convert-to-set_phy_intf_sel'
Russell King says:

====================
net: stmmac: lpc18xx and sti: convert to set_phy_intf_sel()

This series converts lpc18xx and sti to use the new .set_phy_intf_sel()
method.
====================

Link: https://patch.msgid.link/aQyEs4DAZRWpAz32@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:05:52 -08:00
Russell King (Oracle)
e3c8f25cf2 net: stmmac: sti: use ->set_phy_intf_sel()
Rather than placing the phy_intf_sel() setup in the ->init() method,
move it to the new ->set_phy_intf_sel() method.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vGy5o-0000000DhQn-34JE@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:05:49 -08:00
Russell King (Oracle)
ef5e870be9 net: stmmac: sti: use stmmac_get_phy_intf_sel()
Use stmmac_get_phy_intf_sel() to decode the PHY interface mode to the
phy_intf_sel value, validate the result and use that to set the
control register to select the operating mode for the DWMAC core.

Note that when an unsupported interface mode is used, the array would
decode this to PHY_INTF_SEL_GMII_MII, so preserve this behaviour.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vGy5j-0000000DhQh-2e0x@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:05:49 -08:00
Russell King (Oracle)
bd5a681592 net: stmmac: sti: use PHY_INTF_SEL_x directly
Use the PHY_INTF_SEL_x values directly rather than the driver private
ETH_PHY_SEL_x values. Move the FIELD_PREP() into sti_dwmac_set_mode().
Use dwmac->interface directly.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vGy5e-0000000DhQb-2B7I@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:05:49 -08:00
Russell King (Oracle)
9cd23c02ac net: stmmac: sti: use PHY_INTF_SEL_x to select PHY interface
Use the common dwmac definitions for the PHY interface selection field,
adding MII_PHY_SEL_VAL() temporarily to avoid line wrapping.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vGy5Z-0000000DhQV-1e2l@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:05:49 -08:00
Russell King (Oracle)
7fe0e06a73 net: stmmac: lpc18xx: use ->set_phy_intf_sel()
Move the configuration of the dwmac PHY interface selection to the new
->set_phy_intf_sel() method.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vGy5U-0000000DhQP-19Hd@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:05:48 -08:00
Russell King (Oracle)
4bad421924 net: stmmac: lpc18xx: validate phy_intf_sel
Validate the phy_intf_sel value rather than the PHY interface mode.
This will allow us to transition to the ->set_phy_intf_sel() method.
Note that this will allow GMII as well as MII as the phy_intf_sel
value is the same for both.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vGy5P-0000000DhQJ-0Oi3@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:05:48 -08:00
Russell King (Oracle)
9882f12194 net: stmmac: lpc18xx: use stmmac_get_phy_intf_sel()
Use stmmac_get_phy_intf_sel() to decode the PHY interface mode to the
phy_intf_sel value, and use the result to program the ethernet mode.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vGy5J-0000000DhQD-46Ob@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:05:48 -08:00
Russell King (Oracle)
eb0533c7e6 net: stmmac: lpc18xx: use PHY_INTF_SEL_x directly
Use the PHY_INTF_SEL_x values directly rather than the driver private
LPC18XX_CREG_CREG6_ETHMODE_x definitions, and convert
LPC18XX_CREG_CREG6_ETHMODE_MASK to use GENMASK().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vGy5E-0000000DhQ7-3cuy@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:05:48 -08:00
Russell King (Oracle)
5636fcdb02 net: stmmac: lpc18xx: convert to PHY_INTF_SEL_x
Use the common dwmac definitions for the PHY interface selection field.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vGy59-0000000DhQ1-393H@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:05:48 -08:00
Jakub Kicinski
86b721bb0b Merge branch 'net-use-skb_attempt_defer_free-in-napi_consume_skb'
Eric Dumazet says:

====================
net: use skb_attempt_defer_free() in napi_consume_skb()

There is a lack of NUMA awareness and more generally lack
of slab caches affinity on TX completion path.

Modern drivers are using napi_consume_skb(), hoping to cache sk_buff
in per-cpu caches so that they can be recycled in the RX path.

Only use the per-cpu cache if the skb was allocated on the same cpu;
otherwise use skb_attempt_defer_free() so that the skb is freed on
its original cpu.

This removes contention on SLUB spinlocks and data structures,
and this makes sure that recycled sk_buff have correct NUMA locality.

After this series, I get ~50% improvement for an UDP tx workload
on an AMD EPYC 9B45 (IDPF 200Gbit NIC with 32 TX queues).

I will later refactor skb_attempt_defer_free()
to no longer have to care about skb_shared() and skb_release_head_state().
====================

Link: https://patch.msgid.link/20251106202935.1776179-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:02:43 -08:00
Eric Dumazet
b61785852e net: increase skb_defer_max default to 128
skb_defer_max value is very conservative, and can be increased
to avoid too many calls to kick_defer_list_purge().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20251106202935.1776179-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:02:40 -08:00
Eric Dumazet
e20dfbad8a net: fix napi_consume_skb() with alien skbs
There is a lack of NUMA awareness and more generally lack
of slab caches affinity on TX completion path.

Modern drivers are using napi_consume_skb(), hoping to cache sk_buff
in per-cpu caches so that they can be recycled in the RX path.

Only use the per-cpu cache if the skb was allocated on the same cpu;
otherwise use skb_attempt_defer_free() so that the skb is freed on
its original cpu.

This removes contention on SLUB spinlocks and data structures.
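The core of the change is a dispatch along these lines (a simplified
sketch, not the exact napi_consume_skb() hunk):

	/* Recycle into the per-cpu NAPI cache only when the skb was
	 * allocated on this cpu; otherwise hand it back to the
	 * allocating cpu via the deferred-free list. */
	if (skb->alloc_cpu == smp_processor_id())
		napi_skb_cache_put(skb);
	else
		skb_attempt_defer_free(skb);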

After this patch, I get ~50% improvement for an UDP tx workload
on an AMD EPYC 9B45 (IDPF 200Gbit NIC with 32 TX queues).

80 Mpps -> 120 Mpps.

Profiling one of the 32 cpus servicing NIC interrupts :

Before:

mpstat -P 511 1 1

Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
Average:     511    0.00    0.00    0.00    0.00    0.00   98.00    0.00    0.00    0.00    2.00

    31.01%  ksoftirqd/511    [kernel.kallsyms]  [k] queued_spin_lock_slowpath
    12.45%  swapper          [kernel.kallsyms]  [k] queued_spin_lock_slowpath
     5.60%  ksoftirqd/511    [kernel.kallsyms]  [k] __slab_free
     3.31%  ksoftirqd/511    [kernel.kallsyms]  [k] idpf_tx_clean_buf_ring
     3.27%  ksoftirqd/511    [kernel.kallsyms]  [k] idpf_tx_splitq_clean_all
     2.95%  ksoftirqd/511    [kernel.kallsyms]  [k] idpf_tx_splitq_start
     2.52%  ksoftirqd/511    [kernel.kallsyms]  [k] fq_dequeue
     2.32%  ksoftirqd/511    [kernel.kallsyms]  [k] read_tsc
     2.25%  ksoftirqd/511    [kernel.kallsyms]  [k] build_detached_freelist
     2.15%  ksoftirqd/511    [kernel.kallsyms]  [k] kmem_cache_free
     2.11%  swapper          [kernel.kallsyms]  [k] __slab_free
     2.06%  ksoftirqd/511    [kernel.kallsyms]  [k] idpf_features_check
     2.01%  ksoftirqd/511    [kernel.kallsyms]  [k] idpf_tx_splitq_clean_hdr
     1.97%  ksoftirqd/511    [kernel.kallsyms]  [k] skb_release_data
     1.52%  ksoftirqd/511    [kernel.kallsyms]  [k] sock_wfree
     1.34%  swapper          [kernel.kallsyms]  [k] idpf_tx_clean_buf_ring
     1.23%  swapper          [kernel.kallsyms]  [k] idpf_tx_splitq_clean_all
     1.15%  ksoftirqd/511    [kernel.kallsyms]  [k] dma_unmap_page_attrs
     1.11%  swapper          [kernel.kallsyms]  [k] idpf_tx_splitq_start
     1.03%  swapper          [kernel.kallsyms]  [k] fq_dequeue
     0.94%  swapper          [kernel.kallsyms]  [k] kmem_cache_free
     0.93%  swapper          [kernel.kallsyms]  [k] read_tsc
     0.81%  ksoftirqd/511    [kernel.kallsyms]  [k] napi_consume_skb
     0.79%  swapper          [kernel.kallsyms]  [k] idpf_tx_splitq_clean_hdr
     0.77%  ksoftirqd/511    [kernel.kallsyms]  [k] skb_free_head
     0.76%  swapper          [kernel.kallsyms]  [k] idpf_features_check
     0.72%  swapper          [kernel.kallsyms]  [k] skb_release_data
     0.69%  swapper          [kernel.kallsyms]  [k] build_detached_freelist
     0.58%  ksoftirqd/511    [kernel.kallsyms]  [k] skb_release_head_state
     0.56%  ksoftirqd/511    [kernel.kallsyms]  [k] __put_partials
     0.55%  ksoftirqd/511    [kernel.kallsyms]  [k] kmem_cache_free_bulk
     0.48%  swapper          [kernel.kallsyms]  [k] sock_wfree

After:

mpstat -P 511 1 1

Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
Average:     511    0.00    0.00    0.00    0.00    0.00   51.49    0.00    0.00    0.00   48.51

    19.10%  swapper          [kernel.kallsyms]  [k] idpf_tx_splitq_clean_hdr
    13.86%  swapper          [kernel.kallsyms]  [k] idpf_tx_clean_buf_ring
    10.80%  swapper          [kernel.kallsyms]  [k] skb_attempt_defer_free
    10.57%  swapper          [kernel.kallsyms]  [k] idpf_tx_splitq_clean_all
     7.18%  swapper          [kernel.kallsyms]  [k] queued_spin_lock_slowpath
     6.69%  swapper          [kernel.kallsyms]  [k] sock_wfree
     5.55%  swapper          [kernel.kallsyms]  [k] dma_unmap_page_attrs
     3.10%  swapper          [kernel.kallsyms]  [k] fq_dequeue
     3.00%  swapper          [kernel.kallsyms]  [k] skb_release_head_state
     2.73%  swapper          [kernel.kallsyms]  [k] read_tsc
     2.48%  swapper          [kernel.kallsyms]  [k] idpf_tx_splitq_start
     1.20%  swapper          [kernel.kallsyms]  [k] idpf_features_check
     1.13%  swapper          [kernel.kallsyms]  [k] napi_consume_skb
     0.93%  swapper          [kernel.kallsyms]  [k] idpf_vport_splitq_napi_poll
     0.64%  swapper          [kernel.kallsyms]  [k] native_send_call_func_single_ipi
     0.60%  swapper          [kernel.kallsyms]  [k] acpi_processor_ffh_cstate_enter
     0.53%  swapper          [kernel.kallsyms]  [k] io_idle
     0.43%  swapper          [kernel.kallsyms]  [k] netif_skb_features
     0.41%  swapper          [kernel.kallsyms]  [k] __direct_call_cpuidle_state_enter2
     0.40%  swapper          [kernel.kallsyms]  [k] native_irq_return_iret
     0.40%  swapper          [kernel.kallsyms]  [k] idpf_tx_buf_hw_update
     0.36%  swapper          [kernel.kallsyms]  [k] sched_clock_noinstr
     0.34%  swapper          [kernel.kallsyms]  [k] handle_softirqs
     0.32%  swapper          [kernel.kallsyms]  [k] net_rx_action
     0.32%  swapper          [kernel.kallsyms]  [k] dql_completed
     0.32%  swapper          [kernel.kallsyms]  [k] validate_xmit_skb
     0.31%  swapper          [kernel.kallsyms]  [k] skb_network_protocol
     0.29%  swapper          [kernel.kallsyms]  [k] skb_csum_hwoffload_help
     0.29%  swapper          [kernel.kallsyms]  [k] x2apic_send_IPI
     0.28%  swapper          [kernel.kallsyms]  [k] ktime_get
     0.24%  swapper          [kernel.kallsyms]  [k] __qdisc_run

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20251106202935.1776179-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:02:39 -08:00
Eric Dumazet
1fcf572211 net: allow skb_release_head_state() to be called multiple times
Currently, only the skb dst is cleared (thanks to skb_dst_drop()).

Make sure skb->destructor, conntrack state and extensions are cleared
as well.
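A sketch of what idempotence means here (simplified, not the exact hunk;
the conntrack part is only built under CONFIG_NF_CONNTRACK):

	/* Clear each piece of head state after releasing it, so calling
	 * the function a second time is a no-op. */
	skb_dst_drop(skb);			/* already NULLs the dst */
	if (skb->destructor) {
		skb->destructor(skb);
		skb->destructor = NULL;
	}
	nf_conntrack_put(skb_nfct(skb));	/* under CONFIG_NF_CONNTRACK */
	skb->_nfct = 0;
	skb_ext_reset(skb);			/* drops and clears extensions */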

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://patch.msgid.link/20251106202935.1776179-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:02:39 -08:00
Eric Dumazet
fd9557c360 net: add prefetch() in skb_defer_free_flush()
skb_defer_free_flush() is becoming more important these days.

Add a prefetch operation to reduce latency a bit on some
platforms like AMD EPYC 7B12.

On more recent cpus, a stall happens when reading skb_shinfo().
Avoiding it will require a more elaborate strategy.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/20251106085500.2438951-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 19:00:16 -08:00
Jakub Kicinski
01c87d7f48 Merge branch 'psp-track-stats-from-core-and-provide-a-driver-stats-api'
Daniel Zahka says:

====================
psp: track stats from core and provide a driver stats api

This series introduces stats counters for psp. Device key rotations and
so-called 'stale-events' are common to all drivers and are tracked by
the core.

A driver-facing api is provided for reporting the stats required by the
"Implementation Requirements" section of the PSP Architecture
Specification. Drivers must implement these stats.

Lastly, implementations of the driver stats api for mlx5 and netdevsim
are included.

Here is the output of running the psp selftest suite and then
printing out stats with the ynl cli on system with a psp-capable CX7:

  $ ./ksft-psp-stats/drivers/net/psp.py
  TAP version 13
  1..28
  ok 1 psp.test_case # SKIP Test requires IPv4 connectivity
  ok 2 psp.data_basic_send_v0_ip6
  ok 3 psp.test_case # SKIP Test requires IPv4 connectivity
  ok 4 psp.data_basic_send_v1_ip6
  ok 5 psp.test_case # SKIP Test requires IPv4 connectivity
  ok 6 psp.data_basic_send_v2_ip6 # SKIP ('PSP version not supported', 'hdr0-aes-gmac-128')
  ok 7 psp.test_case # SKIP Test requires IPv4 connectivity
  ok 8 psp.data_basic_send_v3_ip6 # SKIP ('PSP version not supported', 'hdr0-aes-gmac-256')
  ok 9 psp.test_case # SKIP Test requires IPv4 connectivity
  ok 10 psp.data_mss_adjust_ip6
  ok 11 psp.dev_list_devices
  ok 12 psp.dev_get_device
  ok 13 psp.dev_get_device_bad
  ok 14 psp.dev_rotate
  ok 15 psp.dev_rotate_spi
  ok 16 psp.assoc_basic
  ok 17 psp.assoc_bad_dev
  ok 18 psp.assoc_sk_only_conn
  ok 19 psp.assoc_sk_only_mismatch
  ok 20 psp.assoc_sk_only_mismatch_tx
  ok 21 psp.assoc_sk_only_unconn
  ok 22 psp.assoc_version_mismatch
  ok 23 psp.assoc_twice
  ok 24 psp.data_send_bad_key
  ok 25 psp.data_send_disconnect
  ok 26 psp.data_stale_key
  ok 27 psp.removal_device_rx # XFAIL Test only works on netdevsim
  ok 28 psp.removal_device_bi # XFAIL Test only works on netdevsim
  # Totals: pass:19 fail:0 xfail:2 xpass:0 skip:7 error:0
  #
  # Responder logs (0):
  # STDERR:
  #  Set PSP enable on device 1 to 0x3
  #  Set PSP enable on device 1 to 0x0

  $ ynl --family psp --dump get-stats
  [{'dev-id': 1,
              'key-rotations': 5,
              'rx-auth-fail': 21,
              'rx-bad': 0,
              'rx-bytes': 11844,
              'rx-error': 0,
              'rx-packets': 94,
              'stale-events': 6,
              'tx-bytes': 1128456,
              'tx-error': 0,
              'tx-packets': 780}]
====================

Link: https://patch.msgid.link/20251106002608.1578518-1-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:54:08 -08:00
Daniel Zahka
178f0763c5 netdevsim: implement psp device stats
For now only tx/rx packets/bytes are reported. This is not compliant
with the PSP Architecture Specification.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251106002608.1578518-6-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:53:57 -08:00
Jakub Kicinski
b1346219e5 net/mlx5e: Add PSP stats support for Rx/Tx flows
Add all statistics described under the "Implementation Requirements"
section of the PSP Architecture Specification:

Rx successfully decrypted PSP packets:
psp_rx_pkts  : Number of packets decrypted successfully
psp_rx_bytes : Number of bytes decrypted successfully

Rx PSP authentication failure statistics:
psp_rx_pkts_auth_fail  : Number of PSP packets that failed authentication
psp_rx_bytes_auth_fail : Number of PSP bytes that failed authentication

Rx PSP bad frame error statistics:
psp_rx_pkts_frame_err  : Number of PSP packets with frame errors
psp_rx_bytes_frame_err : Number of PSP bytes with frame errors

Rx PSP drop statistics:
psp_rx_pkts_drop  : Number of PSP packets dropped
psp_rx_bytes_drop : Number of PSP bytes dropped

Tx successfully encrypted PSP packets:
psp_tx_pkts  : Number of packets encrypted successfully
psp_tx_bytes : Number of bytes encrypted successfully

Tx drops:
tx_drop : Number of misc psp related drops

The above can be seen using the ynl cli:
./pyynl/cli.py  --spec netlink/specs/psp.yaml --dump get-stats

Signed-off-by: Raed Salem <raeds@nvidia.com>
Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251106002608.1578518-5-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:53:57 -08:00
Jakub Kicinski
f05d26198c psp: add stats from psp spec to driver facing api
Provide a driver api for reporting device statistics required by the
"Implementation Requirements" section of the PSP Architecture
Specification. Use a warning to ensure drivers report stats required
by the spec.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251106002608.1578518-4-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:53:57 -08:00
Daniel Zahka
2098cec328 selftests: drv-net: psp: add assertions on core-tracked psp dev stats
Add assertions to existing test cases to cover key rotations and
'stale-events'.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251106002608.1578518-3-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:53:56 -08:00
Jakub Kicinski
dae4a92399 psp: report basic stats from the core
Track and report stats common to all psp devices from the core. A
'stale-event' is when the core marks the rx state of an active
psp_assoc as incapable of authenticating psp encapsulated data.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20251106002608.1578518-2-daniel.zahka@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:53:56 -08:00
Heiner Kallweit
f73e0f46bb net: phy: fixed_phy: shrink size of struct fixed_phy_status
All three members are effectively of type bool, so make this explicit
and shrink size of struct fixed_phy_status.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/9eca3d7e-fa64-4724-8fdc-f2c1a8f2ae8f@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:53:13 -08:00
Jakub Kicinski
ac81130e36 Merge branch 'net-phy-add-open-alliance-tc14-10base-t1s-phy-cable-diagnostic-support'
Parthiban Veerasooran says:

====================
net: phy: Add Open Alliance TC14 10Base-T1S PHY cable diagnostic support

This patch series adds Open Alliance TC14 (OATC14) 10BASE-T1S cable
diagnostic feature support to the Linux kernel PHY subsystem and enables
this feature for Microchip LAN867x Rev.D0 PHYs. These patches provide
standardized cable test functionality for 10BASE-T1S Ethernet PHYs,
allowing users to perform cable diagnostics via ethtool.

Patch Summary:
1. add OATC14 10BASE-T1S PHY cable diagnostic support
	- Implements support for the OATC14 cable diagnostic feature in
	  Clause 45 PHYs.
	- Adds functions to start a cable test and retrieve its status,
	  mapping hardware results to ethtool codes.
	- Exports these functions for use by PHY drivers.
	- Open Alliance TC14 10BASE-T1S Advanced Diagnostic PHY Features.
	  https://opensig.org/wp-content/uploads/2025/06/OPEN_Alliance_10BASE-T1S_Advanced_PHY_features_for-automotive_Ethernet_V2.1b.pdf

2. add cable diagnostic support for LAN867x Rev.D0
	- Integrates the generic OATC14 cable test functions into the
	  Microchip LAN867x Rev.D0 PHY driver.
	- Enables ethtool cable diagnostics for this PHY, improving
	  troubleshooting and maintenance.
====================

Link: https://patch.msgid.link/20251105051213.50443-1-parthiban.veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:52:36 -08:00
Parthiban Veerasooran
f424409483 net: phy: microchip_t1s: add cable diagnostic support for LAN867x Rev.D0
Enable Open Alliance TC14 (OATC14) 10Base-T1S cable diagnostic feature
for Microchip LAN867x Rev.D0 PHY by implementing `cable_test_start` and
`cable_test_get_status` using the generic C45 functions. This allows the
`ethtool` utility to perform cable diagnostic tests directly on the PHY,
improving network troubleshooting and maintenance.

Signed-off-by: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
Link: https://patch.msgid.link/20251105051213.50443-3-parthiban.veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:52:33 -08:00
Parthiban Veerasooran
b87ee13e34 net: phy: phy-c45: add OATC14 10BASE-T1S PHY cable diagnostic support
Add support for Open Alliance TC14 (OATC14) 10BASE-T1S PHYs cable
diagnostic feature.

This patch implements:
- genphy_c45_oatc14_cable_test_start() to initiate a cable test
- genphy_c45_oatc14_cable_test_get_status() to retrieve test results
- Helper function to map PHY cable test status to ethtool result codes
- Function declarations and exports for use by PHY drivers

This enables ethtool to report ok, open, short, and undetectable cable
conditions on OATC14 10Base-T1S PHYs.

Open Alliance TC14 10BASE-T1S Advanced Diagnostic PHY Features
Specification ref:
https://opensig.org/wp-content/uploads/2025/06/OPEN_Alliance_10BASE-T1S_Advanced_PHY_features_for-automotive_Ethernet_V2.1b.pdf

Signed-off-by: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
Link: https://patch.msgid.link/20251105051213.50443-2-parthiban.veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:52:33 -08:00
Erni Sri Satya Vennela
140039580e net: mana: Fix incorrect speed reported by debugfs
Once the netshaper is created for MANA, the current bandwidth
is reported in debugfs like this:

$ sudo ./tools/net/ynl/pyynl/cli.py \
  --spec Documentation/netlink/specs/net_shaper.yaml \
  --do set \
  --json '{"ifindex":'3',
           "handle":{ "scope": "netdev", "id":'1' },
           "bw-max": 200000000 }'
None

$ sudo cat /sys/kernel/debug/mana/1/vport0/current_speed
200

After the shaper is deleted, it is expected to report
the maximum speed supported by the SKU, but currently it
reports 0, which is incorrect.

Fix this inconsistency by resetting apc->speed to apc->max_speed
when the shaper object is deleted. This improves
readability and debuggability.

Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/1762369468-32570-1-git-send-email-ernis@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:49:14 -08:00
Lorenzo Bianconi
3f47e67dff net: airoha: Add the capability to consume out-of-order DMA tx descriptors
EN7581 and AN7583 SoCs are capable of DMA mapping non-linear tx skbs on
non-consecutive DMA descriptors. This feature is useful when multiple
flows are queued on the same hw tx queue since it allows fully utilizing
the available tx DMA descriptors and avoids the starvation of
high-priority flows that the current codebase suffers from due to
head-of-line blocking introduced by low-priority flows.

Tested-by: Xuegang Lu <xuegang.lu@airoha.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251106-airoha-tx-linked-list-v2-1-0706d4a322bd@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:47:19 -08:00
Eric Dumazet
416dd649f3 tcp: add net.ipv4.tcp_comp_sack_rtt_percent
TCP SACK compression has been added in 2018 in commit
5d9f4262b7 ("tcp: add SACK compression").

It is working great for WAN flows (with large RTT).
Wifi in particular gets a significant boost _when_ ACK are suppressed.

Add a new sysctl so that we can tune the very conservative 5 % value
that has been used so far in this formula, allowing small-RTT flows
to benefit from this feature as well.

delay = min ( 5 % of RTT, 1 ms)

This patch adds a new tcp_comp_sack_rtt_percent sysctl
to ease experiments and tuning.

Given that we cap the delay to 1ms (tcp_comp_sack_delay_ns sysctl),
set the default value to 33 %.

Quoting Neal Cardwell ( https://lore.kernel.org/netdev/CADVnQymZ1tFnEA1Q=vtECs0=Db7zHQ8=+WCQtnhHFVbEOzjVnQ@mail.gmail.com/ )

The rationale for 33% is basically to try to facilitate pipelining,
where there are always at least 3 ACKs and 3 GSO/TSO skbs per SRTT, so
that the path can maintain a budget for 3 full-sized GSO/TSO skbs "in
flight" at all times:

+ 1 skb in the qdisc waiting to be sent by the NIC next
+ 1 skb being sent by the NIC (being serialized by the NIC out onto the wire)
+ 1 skb being received and aggregated by the receiver machine's
aggregation mechanism (some combination of LRO, GRO, and sack
compression)

Note that this is basically the same magic number (3) and the same
rationales as:

(a) tcp_tso_should_defer() ensuring that we defer sending data for no
longer than cwnd/tcp_tso_win_divisor (where tcp_tso_win_divisor = 3),
and
(b) bbr_quantization_budget() ensuring that cwnd is at least 3 GSO/TSO
skbs to maintain pipelining and full throughput at low RTTs
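
For illustration, plugging a few example RTT values (not taken from the
patch) into the formula with the new 33 % default and the existing 1 ms
cap (tcp_comp_sack_delay_ns) gives:

  srtt = 100 us -> delay = min( 33 us, 1 ms) =  33 us
  srtt =   1 ms -> delay = min(330 us, 1 ms) = 330 us
  srtt =  10 ms -> delay = min(3.3 ms, 1 ms) =   1 ms

Small-RTT flows now get a proportionally smaller SACK compression delay,
while large-RTT flows keep hitting the 1 ms cap as before.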

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Link: https://patch.msgid.link/20251106115236.3450026-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:41:44 -08:00
Jakub Kicinski
45cb3c6fbe Merge branch 'tcp-clean-up-syn-ack-rto-code-and-apply-max-rto'
Kuniyuki Iwashima says:

====================
tcp: Clean up SYN+ACK RTO code and apply max RTO.

Patch 1 - 4 are misc cleanup.

Patch 5 applies max RTO to non-TFO SYN+ACK.

Patch 6 adds a test for max RTO of SYN+ACK.
====================

Link: https://patch.msgid.link/20251106003357.273403-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:05:28 -08:00
Kuniyuki Iwashima
ffc56c9081 selftest: packetdrill: Add max RTO test for SYN+ACK.
This script sets net.ipv4.tcp_rto_max_ms to 1000 and checks
if SYN+ACK RTO is capped at 1s for TFO and non-TFO.

Without the previous patch, the max RTO is applied to TFO
SYN+ACK only, and non-TFO SYN+ACK RTO increases exponentially.

  # selftests: net/packetdrill: tcp_rto_synack_rto_max.pkt
  # TAP version 13
  # 1..2
  # tcp_rto_synack_rto_max.pkt:46: error handling packet: timing error:
     expected outbound packet at 5.091936 sec but happened at 6.107826 sec; tolerance 0.127974 sec
  # script packet:  5.091936 S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK>
  # actual packet:  6.107826 S. 0:0(0) ack 1 win 65535 <mss 1460,nop,nop,sackOK>
  # not ok 1 ipv4
  # tcp_rto_synack_rto_max.pkt:46: error handling packet: timing error:
     expected outbound packet at 5.075901 sec but happened at 6.091841 sec; tolerance 0.127976 sec
  # script packet:  5.075901 S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK>
  # actual packet:  6.091841 S. 0:0(0) ack 1 win 65535 <mss 1460,nop,nop,sackOK>
  # not ok 2 ipv6
  # # Totals: pass:0 fail:2 xfail:0 xpass:0 skip:0 error:0
  not ok 49 selftests: net/packetdrill: tcp_rto_synack_rto_max.pkt # exit=1

With the previous patch, all SYN+ACKs are retransmitted
after 1s.

  # selftests: net/packetdrill: tcp_rto_synack_rto_max.pkt
  # TAP version 13
  # 1..2
  # ok 1 ipv4
  # ok 2 ipv6
  # # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0
  ok 49 selftests: net/packetdrill: tcp_rto_synack_rto_max.pkt

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106003357.273403-7-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:05:26 -08:00
Kuniyuki Iwashima
1e9d3005e0 tcp: Apply max RTO to non-TFO SYN+ACK.
Since commit 54a378f434 ("tcp: add the ability to control
max RTO"), TFO SYN+ACK RTO is capped by the TFO full sk's
inet_csk(sk)->icsk_rto_max.

The value is inherited from the parent listener.

Let's apply the same cap to non-TFO SYN+ACK.

Note that req->rsk_listener is always non-NULL when we call
tcp_reqsk_timeout() in reqsk_timer_handler() or tcp_check_req().

It could be NULL for SYN cookie req, but we do not use
req->timeout then.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106003357.273403-6-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:05:26 -08:00
Kuniyuki Iwashima
207ce0f6bc tcp: Remove timeout arg from reqsk_timeout().
reqsk_timeout() is always called with @timeout being TCP_RTO_MAX.

Let's remove the arg.

As a prep for the next patch, reqsk_timeout() is moved to tcp.h
and renamed to tcp_reqsk_timeout().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106003357.273403-5-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:05:26 -08:00
Kuniyuki Iwashima
6fbf648d5c tcp: Remove redundant init for req->num_timeout.
Commit 5903123f66 ("tcp: Use BPF timeout setting for SYN ACK
RTO") introduced req->timeout and initialised it in 3 places:

  1. reqsk_alloc() sets 0
  2. inet_reqsk_alloc() sets TCP_TIMEOUT_INIT
  3. tcp_conn_request() sets tcp_timeout_init()

1. has always been redundant as 2. overwrites it immediately.

2. was necessary for TFO SYN+ACK but is no longer needed after
commit 8ea731d4c2 ("tcp: Make SYN ACK RTO tunable by BPF
programs with TFO").

3. was moved to reqsk_queue_hash_req() in the previous patch.

Now, we always initialise req->timeout just before scheduling
the SYN+ACK timer:

  * For non-TFO SYN+ACK : reqsk_queue_hash_req()
  * For TFO SYN+ACK     : tcp_fastopen_create_child()

Let's remove the redundant initialisation of req->timeout in
reqsk_alloc() and inet_reqsk_alloc().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106003357.273403-4-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:05:25 -08:00
Kuniyuki Iwashima
3ce5dd8161 tcp: Remove timeout arg from reqsk_queue_hash_req().
inet_csk_reqsk_queue_hash_add() is no longer shared by DCCP.

We do not need to pass req->timeout down to reqsk_queue_hash_req().

Let's move tcp_timeout_init() from tcp_conn_request() to
reqsk_queue_hash_req().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106003357.273403-3-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:05:25 -08:00
Kuniyuki Iwashima
be88c549e9 tcp: Call tcp_syn_ack_timeout() directly.
Since DCCP has been removed, we do not need to use
request_sock_ops.syn_ack_timeout().

Let's call tcp_syn_ack_timeout() directly.

Now other function pointers of request_sock_ops are
protocol-dependent.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251106003357.273403-2-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 18:05:25 -08:00
Damien Le Moal
25976c314f block: introduce bdev_zone_start()
Introduce the function bdev_zone_start() as a more explicit (and clear)
replacement for ALIGN_DOWN() to get the start sector of a zone
containing a particular sector of a zoned block device.

Use this new helper in blkdev_get_zone_info() and
blkdev_report_zones_cached().
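
A minimal sketch of the idea, assuming the power-of-two zone size required
for zoned block devices (not necessarily the exact implementation):

  static inline sector_t bdev_zone_start(struct block_device *bdev,
                                         sector_t sector)
  {
          /* round down to the first sector of the zone containing 'sector' */
          return ALIGN_DOWN(sector, bdev_zone_sectors(bdev));
  }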

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-07 09:28:08 -07:00
Damien Le Moal
e2b0ec7761 block: refactor disk_zone_wplug_sync_wp_offset()
The helper function blk_zone_wp_offset() is called from
disk_zone_wplug_sync_wp_offset(), and again called from
blk_revalidate_seq_zone() right after the call to
disk_zone_wplug_sync_wp_offset().

Change disk_zone_wplug_sync_wp_offset() to return the value of obtained
with blk_zone_wp_offset() to avoid this double call, which simplifies a
little blk_revalidate_seq_zone().

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-07 09:28:08 -07:00
Damien Le Moal
bbac6e0fa5 block: improve blk_zone_wp_offset()
blk_zone_wp_offset() is always called with a struct blk_zone obtained
from the device, that is, it will never see the BLK_ZONE_COND_ACTIVE
condition. However, handling this condition makes this function more
solid and will also avoid issues when propagating cached report requests
to underlying stacked devices is implemented. Add BLK_ZONE_COND_ACTIVE
as a new case in blk_zone_wp_offset() switch.

Also while at it, change the handling of the full condition to return
UINT_MAX for the zone write pointer to reflect the fact that the write
pointer of a full zone is invalid.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-07 09:28:08 -07:00
Jakub Kicinski
c6934c4e04 netlink: specs: netdev add missing stats to qstat-get
Add missing entries in C attribute list.

Link: https://patch.msgid.link/20251104232348.1954349-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-07 08:00:23 -08:00
Christoph Hellwig
86a9ce21f5 block: don't return 1 for the fallback case in blkdev_get_zone_info
blkdev_do_report_zones returns the number of reported zones, but
blkdev_get_zone_info returns 0 or an errno.  Translate to the expected
return value in blkdev_report_zone_fallback.

Fixes: b037d41762fd ("block: introduce blkdev_get_zone_info()")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-07 04:38:28 -07:00
Jakub Kicinski
6fc33710cd Merge branch 'net-renesas-cleanup-usage-of-gptp-flags'
Niklas Söderlund says:

====================
net: renesas: Cleanup usage of gPTP flags

This series' aim is to prepare for future work that will enable the use
of gPTP on R-Car RAVB on Gen4. Currently RAVB has a dedicated gPTP
implementation supported on Gen2 and Gen3 (ravb_ptp.c). For Gen4 a new
implementation that is already upstream (rcar_gen4_ptp.c) and used by
other Gen4 devices such as RTSN and RSWITCH is needed.

Unfortunately the design of the Gen2/Gen3 RAVB driver, where driver
specific flags control gPTP behavior, has been mimicked in RTSN and
RSWITCH. This was OK as there was no overlap between the two gPTP
implementations. Now that RAVB needs to be able to use both, having to
translate between driver specific flags and common net code flags
becomes even more cumbersome as there are two sets of driver specific
flags to pick from.

This series cleans this up for all Renesas drivers using gPTP by
removing all driver specific flags and using the common flags directly.
This simplifies the drivers while at the same time preparing RAVB to be
extended with Gen4 support.

Patch 1/7 is a drive-by patch fixing an RSWITCH specific define that was
added in the wrong header. Patch 2/7 removes a short-cut used in RTSN and
RSWITCH that prevents extending Gen4 support to RAVB without fuss.
Patches 3/7 to 7/7 rework the Renesas drivers to use the common flags
instead of driver specific ones.

There is no intentional behavior change and only a small rework in logic
in the RAVB driver. Looking at patches 3/7, 4/7 and 7/7 one can clearly
see how the code has been copied from RAVB to the later implementations
in RTSN and RSWITCH.
====================

Link: https://patch.msgid.link/20251104222420.882731-1-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 17:38:27 -08:00
Niklas Söderlund
16e2e6cf75 net: ravb: Use common defines for time stamping control
Instead of translating to/from driver specific flags for packet time
stamp control use the common flags directly. This simplifies the driver
as the translating code can be removed while at the same time making it
clear the flags are not flags written to hardware registers.

The change from a device specific bit-field tracking variable to the
common enum datatypes forces us to touch ravb_rx_rcar_hwstamp() in a
non-trivial way. To make this cleaner and easier to understand, expand
the nested conditions.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://patch.msgid.link/20251104222420.882731-8-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 17:38:26 -08:00
Niklas Söderlund
5ce97b8d61 net: ravb: Break out Rx hardware timestamping
Prepare for moving away from device specific bit-fields, used to track
how to do hardware Rx timestamping, to the common net enums by breaking
out the timestamping into a helper function. This is done to create
cleaner code and prepare for easier changes improving the hardware
timestamping.

There is no functional change.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://patch.msgid.link/20251104222420.882731-7-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 17:38:26 -08:00
Niklas Söderlund
3614d249d1 net: rcar_gen4_ptp: Remove unused defines
The driver specific flags to control packet time stamps have all been
replaced by values from enum hwtstamp_tx_types and enum
hwtstamp_rx_filters. Remove the driver specific flags as there are no
more users.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251104222420.882731-6-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 17:38:25 -08:00
Niklas Söderlund
e43791f40b net: rtsn: Use common defines for time stamping control
Instead of translating to/from driver specific flags for packet time
stamp control use the common flags directly. This simplifies the driver
as the translating code can be removed while at the same time making it
clear the flags are not flags written to hardware registers.

One thing to note is that the bitwise AND check of
RCAR_GEN4_RXTSTAMP_TYPE_V2_L2_EVENT in rtsn_rx() is replaced with a check
that the filter is not HWTSTAMP_FILTER_NONE. This is okay as the replaced
device specific event bit was set for all modes except
HWTSTAMP_FILTER_NONE.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://patch.msgid.link/20251104222420.882731-5-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 17:38:25 -08:00
Niklas Söderlund
b314e4f7a9 net: rswitch: Use common defines for time stamping control
Instead of translating to/from driver specific flags for packet time
stamp control use the common flags directly. This simplifies the driver
as the translating code can be removed while at the same time making it
clear the flags are not flags written to hardware registers.

One thing to note is that the bitwise AND check of
RCAR_GEN4_RXTSTAMP_TYPE_V2_L2_EVENT in rswitch_rx() is replaced with a
check that the filter is not HWTSTAMP_FILTER_NONE. This is okay as the
replaced device specific event bit was set for all modes except
HWTSTAMP_FILTER_NONE.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://patch.msgid.link/20251104222420.882731-4-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 17:38:25 -08:00
Niklas Söderlund
50ab1c6bec net: rcar_gen4_ptp: Move control fields to users
The struct rcar_gen4_ptp_private provides two fields for convenience of
its users, tstamp_tx_ctrl and tstamp_rx_ctrl. These fields are not used
by the rcar_gen4_ptp driver itself but only by the drivers using it.

Upcoming work will enable the RAVB driver, which currently only supports
gPTP on pre-Gen4 SoCs, to use the Gen4 implementation as well. To
facilitate this, the convenience of having these fields in struct
rcar_gen4_ptp_private becomes a problem as the RAVB driver already has
its own driver specific fields for the same thing.

Move the fields from struct rcar_gen4_ptp_private to the private data
structures of each driver using the Gen4 gPTP clock. There is no
functional change.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251104222420.882731-3-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 17:38:25 -08:00
Niklas Söderlund
e98d879292 net: rswitch: Move definition of S4 gPTP offset
The files rcar_gen4_ptp.{c,h} implement an abstraction of the gPTP
support implemented together with various other IP blocks. The first
device added which supported this was RSWITCH on R-Car S4.

While doing so the RSWITCH R-Car S4 specific offset was added to the
generic Gen4 gPTP header file. Move it to the RSWITCH driver to make it
clear it only applies to this driver.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://patch.msgid.link/20251104222420.882731-2-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 17:38:25 -08:00
Keith Busch
bc840b21a2 nvme: remove virtual boundary for sgl capable devices
The nvme virtual boundary is only required for the PRP format. Devices
that can use SGL for DMA don't need it for IO queues. Drop reporting it
for such devices; rdma fabrics controllers will continue to use the
limit as they currently don't report any boundary requirements, but tcp
and fc never needed it in the first place so they get to report no
virtual boundary.

Applications may continue to align to the same virtual boundaries for
optimization purposes if they want, and the driver will continue to
decide whether to use the PRP format the same as before if the IO allows
it.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 18:11:58 -07:00
Keith Busch
2f6b2565d4 block: accumulate memory segment gaps per bio
The blk-mq dma iterator has an optimization for requests that align to
the device's iommu merge boundary. This boundary may be larger than the
device's virtual boundary, but the code had been depending on that queue
limit to know ahead of time if the request is guaranteed to align to
that optimization.

Rather than rely on that queue limit, which many devices may not report,
save the lowest set bit of any boundary gap between each segment in the
bio while checking the segments. The request stores the value for
merging and quickly checking per io if the request can use iova
optimizations.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 18:11:58 -07:00
Daniel Borkmann
25e63e559c netkit: Document fast vs slowpath members via macros
Instead of a comment, just use two cacheline groups to document the intent
for members often accessed in the fast or slow path.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Co-developed-by: David Wei <dw@davidwei.uk>
Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20251031212103.310683-11-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 16:46:11 -08:00
Daniel Borkmann
24ab8efb9a xsk: Move NETDEV_XDP_ACT_ZC into generic header
Move NETDEV_XDP_ACT_ZC into the xdp_sock_drv.h header such that external
code can reuse it, and rename it to the more generic NETDEV_XDP_ACT_XSK.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Co-developed-by: David Wei <dw@davidwei.uk>
Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20251031212103.310683-7-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 16:46:11 -08:00
Puranjay Mohan
f8c67d8550 bpf: Use kmalloc_nolock() in range tree
The range tree uses bpf_mem_alloc() that is safe to be called from all
contexts and uses a pre-allocated pool of memory to serve these
allocations.

Replace bpf_mem_alloc() with kmalloc_nolock() as it can be called safely
from all contexts and is more scalable than bpf_mem_alloc().

Remove the migrate_disable/enable pairs as they were only needed for
bpf_mem_alloc() as it does per-cpu operations, kmalloc_nolock() doesn't
need this.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20251106170608.4800-1-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-06 15:55:19 -08:00
Cong Zhang
0739c2c6a0 virtio_blk: NULL out vqs to avoid double free on failed resume
vblk->vqs is released during freeze. If resume fails before vblk->vqs
is allocated again, a later freeze/remove may attempt to free the vqs a
second time. Set vblk->vqs to NULL after freeing to avoid the double free.
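
The shape of the fix, as a sketch:

  kfree(vblk->vqs);
  vblk->vqs = NULL;  /* a later freeze/remove no longer sees a stale pointer */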

Signed-off-by: Cong Zhang <cong.zhang@oss.qualcomm.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:32:58 -07:00
Keith Busch
3451cf34f5 null_blk: allow byte aligned memory offsets
Allowing byte aligned memory provides a nice testing ground for
direct-io.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:28:55 -07:00
Keith Busch
262a3dd04e null_blk: single kmap per bio segment
Rather than kmap the request bio segment for each sector, do
the mapping just once.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:27:36 -07:00
Keith Busch
8459283819 null_blk: consistently use blk_status_t
No need to mix errno and blk_status_t error types. Just use the standard
block layer type.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:27:21 -07:00
Keith Busch
1165d20f4d null_blk: simplify copy_from_nullb
It always returns success, so the code that saves the error status but
proceeds without checking it looks a bit odd. Clean this up.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:27:21 -07:00
Caleb Sander Mateos
e87d66ab27 ublk: use rq_for_each_segment() for user copy
ublk_advance_io_iter() and ublk_copy_io_pages() currently open-code the
iteration over the request's bvecs. Switch to the rq_for_each_segment()
macro provided by blk-mq to avoid reaching into the bio internals and
simplify the code.
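
The general usage pattern of the macro, sketched (not the actual ublk
code):

  struct req_iterator iter;
  struct bio_vec bv;

  rq_for_each_segment(bv, rq, iter) {
          /* each iteration yields one segment of the request:
           * bv.bv_page, bv.bv_offset and bv.bv_len, across all bios */
  }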

Suggested-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:26:04 -07:00
Caleb Sander Mateos
2299ceec36 ublk: use copy_{to,from}_iter() for user copy
ublk_copy_user_pages()/ublk_copy_io_pages() currently uses
iov_iter_get_pages2() to extract the pages from the iov_iter and
memcpy()s between the bvec_iter and the iov_iter's pages one at a time.
Switch to using copy_to_iter()/copy_from_iter() instead. This avoids the
user page reference count increments and decrements and needing to split
the memcpy() at user page boundaries. It also simplifies the code
considerably.
Ming reports a 40% throughput improvement when issuing I/O to the
selftests null ublk server with zero-copy disabled.
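
A minimal sketch of copying one such segment into the user iov_iter,
assuming the page is temporarily mapped (illustrative only, not the actual
ublk code):

  void *kaddr = kmap_local_page(bv.bv_page);
  size_t done = copy_to_iter(kaddr + bv.bv_offset, bv.bv_len, iter);

  kunmap_local(kaddr);
  /* a real caller would check 'done' against bv.bv_len */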

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:26:04 -07:00
Pavel Begunkov
93e197e524 io_uring: use WRITE_ONCE for user shared memory
IORING_SETUP_NO_MMAP rings remain user accessible even before the ctx
setup is finalised, so use WRITE_ONCE consistently when initialising
rings.
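
The pattern, sketched with hypothetical field names (not the actual
io_uring code):

  /* the region may already be mapped by userspace, so publish initial
   * values with WRITE_ONCE() instead of plain stores */
  WRITE_ONCE(rings->sq_head, 0);
  WRITE_ONCE(rings->sq_tail, 0);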

Fixes: 03d89a2de2 ("io_uring: support for user allocated memory for rings/sqes")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:25:18 -07:00
David Wei
75c299a917 io_uring/zcrx: reverse ifq refcount
Add a refcount to struct io_zcrx_ifq to reverse the refcounting
relationship i.e. rings now reference ifqs instead. As a result of this,
remove ctx->refs that an ifq holds on a ring via the page pool memory
provider.

This ref, ifq->refs, is held by internal users of an ifq, namely rings and
the page pool memory provider associated with an ifq. This is needed to
keep the ifq around until the page pool is destroyed.

Since ifqs now no longer hold refs to ring ctx, there isn't a need to
split the cleanup of ifqs into two: io_shutdown_zcrx_ifqs() in
io_ring_exit_work() while waiting for ctx->refs to drop to 0, and
io_unregister_zcrx_ifqs() after. Remove io_shutdown_zcrx_ifqs().

Signed-off-by: David Wei <dw@davidwei.uk>
Co-developed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:23:21 -07:00
David Wei
1bd95163da io_uring/zcrx: move io_unregister_zcrx_ifqs() down
In preparation for removing the ref on ctx->refs held by an ifq and
removing io_shutdown_zcrx_ifqs(), move io_unregister_zcrx_ifqs() down
such that it can call io_zcrx_scrub().

Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:23:21 -07:00
David Wei
5c686456a4 io_uring/zcrx: add user_struct and mm_struct to io_zcrx_ifq
In preparation for removing ifq->ctx and making ifq lifetime independent
of ring ctx, add user_struct and mm_struct to io_zcrx_ifq.

In the ifq cleanup path, these are the only fields used from the main
ring ctx to do accounting. Taking a copy in the ifq allows ifq->ctx to
be removed later, including the ctx->refs held by the ifq.

Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:23:21 -07:00
David Wei
edd706ede8 io_uring/zcrx: add io_zcrx_ifq arg to io_zcrx_free_area()
Add io_zcrx_ifq arg to io_zcrx_free_area(). A QOL change to reduce line
widths.

Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:23:21 -07:00
David Wei
6ab39b392e io_uring/rsrc: refactor io_{un}account_mem() to take {user,mm}_struct param
Refactor io_{un}account_mem() to take user_struct and mm_struct
directly, instead of accessing it from the ring ctx.

Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:23:21 -07:00
David Wei
1fa7a34131 io_uring/memmap: refactor io_free_region() to take user_struct param
Refactor io_free_region() to take user_struct directly, instead of
accessing it from the ring ctx.

Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:23:21 -07:00
David Wei
a5af56a902 io_uring/memmap: remove unneeded io_ring_ctx arg
Remove io_ring_ctx arg from io_region_pin_pages() and
io_region_allocate_pages() that isn't used.

Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:23:20 -07:00
Martin KaFai Lau
6f1f4c1638 Merge branch 'selftests-bpf-enfoce-so_reuseaddr-in-basic-test-servers'
Alexis Lothoré says:

====================
This small series is another follow-up to [1], in which I misunderstood
Martin's initial feedback (see [2]). I proposed to make tc-tunnel apply
SO_REUSEPORT once the server is brought up. This series updates
start_server_addr to really apply Martin's proposal after his
clarification [3].

[1] https://lore.kernel.org/bpf/20251031-tc_tunnel_improv-v1-0-0ffe44d27eda@bootlin.com/
[2] https://lore.kernel.org/bpf/efa3540a-1f52-46ca-9f49-e631a5e3e48c@linux.dev/
[3] https://lore.kernel.org/bpf/4cbabdf1-af2c-490a-a41a-b40c1539c1cb@linux.dev/
====================

Link: https://patch.msgid.link/20251105-start-server-soreuseaddr-v1-0-1bbd9c1f8d65@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2025-11-06 15:23:04 -08:00
Alexis Lothoré (eBPF Foundation)
5b7d6c9198 selftests/bpf: Use start_server_str rather than start_reuseport_server in tc_tunnel
Now that start_server_str enforces SO_REUSEADDR, there's no need to keep
using start_reuseport_server in tc_tunnel, especially since it only uses
one server at a time.

Replace start_reuseport_server with start_server_str in tc_tunnel test.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-start-server-soreuseaddr-v1-2-1bbd9c1f8d65@bootlin.com
2025-11-06 15:23:04 -08:00
Alexis Lothoré (eBPF Foundation)
38e36514fc selftests/bpf: Systematically add SO_REUSEADDR in start_server_addr
Some tests have to stop/start a server multiple times with the same
listening address. Doing so without SO_REUSEADDR leads to failures due to
the socket still being in TIME_WAIT right after the first instance stops
and before the second instance starts. Instead of letting each test
manually set SO_REUSEADDR on its servers, it can be done automatically
by start_server_addr for all tests (and without any major downside).

Enforce SO_REUSEADDR in start_server_addr for all tests.
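
The pattern enforced by the helper, sketched (illustrative only):

  int one = 1;

  /* allow rebinding the address while the previous socket is in TIME_WAIT */
  setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));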

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251105-start-server-soreuseaddr-v1-1-1bbd9c1f8d65@bootlin.com
2025-11-06 15:23:00 -08:00
Christoph Hellwig
15638d52cb block: fix cached zone reporting after zone append was used
No zone plugs are allocated when a zone is opened by calling Zone Append
on it.  This makes the cached zone reporting report incorrectly empty
zones if the file system is unmounted and report zones is called after
that, e.g. by xfstests test cases using the scratch device.

Fix this by recording if zone append was used on a device, and disable
cached reporting for the device until a ZONE_RESET_ALL happens that
guarantees all zones are empty.

We could probably do even better using a per-zone flag, but the practical
use cases for cached zone reporting after the initial mount are rather limited,
so let's keep things simple for now.

Fixes: 31f0656a4a ("block: introduce blkdev_report_zones_cached()")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:15:27 -07:00
Christoph Hellwig
c6886cf610 block: don't leak disk->zones_cond for !disk_need_zone_resources
disk->zones_cond is allocated for all zoned devices, but
disk_free_zone_resources skips it when the zone write plug hash is not
allocated, leaking the allocation for non-mq devices that don't emulate
zone append.  This is reported by kmemleak-enabled xfstests for various
tests that use simple device mapper targets.

Fix this by moving all code that requires writes plugs from
disk_free_zone_resources into disk_destroy_zone_wplugs_hash_table
and executing the rest of the code, including the disk->zones_cond
freeing unconditionally.

Fixes: 6e945ffb65 ("block: use zone condition to determine conventional zones")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-06 16:15:27 -07:00
FUJITA Tomonori
8a25a2e341 net: phy: qt2025: Wait until PHY becomes ready
Wait until a PHY becomes ready in the probe callback by
using read_poll_timeout function.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Reviewed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: FUJITA Tomonori <fujita.tomonori@gmail.com>
Link: https://patch.msgid.link/20251105133126.3221948-1-fujita.tomonori@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:47:19 -08:00
Breno Leitao
c04956cccb tg3: extract GRXRINGS from .get_rxnfc
Commit 84eaf4359c ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

Given that tg3_get_rxnfc() only handles ETHTOOL_GRXRINGS, this
function becomes useless and is removed.

This also fixes the behavior for devices without MSIX support.
Previously, the function would return -EOPNOTSUPP, but now it correctly
returns 1.

The functionality remains the same: return the current queue count
if the device is running, otherwise return the minimum of online
CPUs and TG3_RSS_MAX_NUM_QS.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Michael Chan <michael.chan@broadcom.com>
Link: https://patch.msgid.link/20251105-grxrings_v1-v1-1-54c2caafa1fd@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:26:35 -08:00
Aleksandr Loktionov
3da28eb277 iavf: add RSS support for GTP protocol via ethtool
Extend the iavf driver to support Receive Side Scaling (RSS)
configuration for GTP (GPRS Tunneling Protocol) flows using ethtool.

The implementation introduces new RSS flow segment headers and hash field
definitions for various GTP encapsulations, including:

  - GTPC
  - GTPU (IP, Extension Header, Uplink, Downlink)
  - TEID-based hashing

The ethtool interface is updated to parse and apply these new flow types
and hash fields, enabling fine-grained traffic distribution for GTP-based
mobile workloads.

This enhancement improves performance and scalability for virtualized
network functions (VNFs) and user plane functions (UPFs) in 5G and LTE
deployments.

Reviewed-by: Jedrzej Jagielski <jedrzej.jagielski@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-11-06 14:19:43 -08:00
Przemek Kitszel
41e880eb84 ice: Extend PTYPE bitmap coverage for GTP encapsulated flows
Consolidate updates to the Protocol Type (PTYPE) bitmap definitions
across multiple flow types in the Intel ICE driver to support GTP
(GPRS Tunneling Protocol) encapsulated traffic.

Enable improved Receive Side Scaling (RSS) configuration for both user
and control plane GTP flows.

Cover a wide range of protocol and encapsulation scenarios, including:
 - MAC OFOS and IL
 - IPv4 and IPv6 (OFOS, IL, ALL, no-L4)
 - TCP, SCTP, ICMP
 - GRE OF
 - GTPC (control plane)

Expand the PTYPE bitmap entries to improve classification and
distribution of GTP traffic across multiple queues, enhancing
performance and scalability in mobile network environments.

Co-developed-by: Dan Nowlin <dan.nowlin@intel.com>
Signed-off-by: Dan Nowlin <dan.nowlin@intel.com>
Co-developed-by: Qi Zhang <qi.z.zhang@intel.com>
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
Co-developed-by: Jie Wang <jie1x.wang@intel.com>
Signed-off-by: Jie Wang <jie1x.wang@intel.com>
Co-developed-by: Junfeng Guo <junfeng.guo@intel.com>
Signed-off-by: Junfeng Guo <junfeng.guo@intel.com>
Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Jedrzej Jagielski <jedrzej.jagielski@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-11-06 14:19:43 -08:00
Aleksandr Loktionov
f89e4e1512 ice: improve TCAM priority handling for RSS profiles
Enhance TCAM priority logic to avoid conflicts between RSS profiles
with overlapping PTGs and attributes.

Track used PTG and attribute combinations.
Ensure higher-priority profiles override lower ones.
Add helper for setting TCAM flags and masks.

Ensure RSS rule consistency and prevent unintended matches.

Co-developed-by: Dan Nowlin <dan.nowlin@intel.com>
Signed-off-by: Dan Nowlin <dan.nowlin@intel.com>
Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-11-06 14:19:43 -08:00
Aleksandr Loktionov
3a6d87e2ea ice: implement GTP RSS context tracking and configuration
This commit implements the core RSS context management and configuration
logic for GTP (GTPU) protocol support in VF RSS operations.

Key implementation features:
- GTPU hash context management with pre/post processing functions
- Context index calculation and mapping for different GTPU scenarios
- Integration with main RSS configuration flow via wrapper functions
- Support for IPv4/IPv6 GTPU RSS configurations
- Rollback mechanism for handling RSS rule conflicts
- Hash context reset and cleanup functionality

The implementation provides comprehensive GTPU RSS support by:
1. Adding ice_add_rss_cfg_pre_gtpu() for preprocessing GTPU contexts
2. Adding ice_add_rss_cfg_post_gtpu() for postprocessing configurations
3. Adding ice_calc_gtpu_ctx_idx() for context index calculation
4. Integrating GTPU logic into ice_add_rss_cfg_wrap() and
   ice_rem_rss_cfg_wrap()
5. Supporting context tracking in VF hash_ctx structures

This completes the GTP RSS infrastructure enabling VFs to configure
RSS hashing on GTP-encapsulated traffic.

Co-developed-by: Dan Nowlin <dan.nowlin@intel.com>
Signed-off-by: Dan Nowlin <dan.nowlin@intel.com>
Co-developed-by: Jie Wang <jie1x.wang@intel.com>
Signed-off-by: Jie Wang <jie1x.wang@intel.com>
Co-developed-by: Junfeng Guo <junfeng.guo@intel.com>
Signed-off-by: Junfeng Guo <junfeng.guo@intel.com>
Co-developed-by: Qi Zhang <qi.z.zhang@intel.com>
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
Co-developed-by: Ting Xu <ting.xu@intel.com>
Signed-off-by: Ting Xu <ting.xu@intel.com>
Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-11-06 14:19:43 -08:00
Aleksandr Loktionov
38724a474c ice: add virtchnl definitions and static data for GTP RSS
Add virtchnl protocol header and field definitions for advanced RSS
configuration including GTPC, GTPU, L2TPv2, ECPRI, PPP, GRE, and IP
fragment headers.

- Define new virtchnl protocol header types
- Add RSS field selectors for tunnel protocols
- Extend static mapping arrays for protocol field matching
- Add L2TPv2 session ID and length+session ID field support

This provides the foundational definitions needed for VF RSS
configuration of tunnel protocols.

Co-developed-by: Dan Nowlin <dan.nowlin@intel.com>
Signed-off-by: Dan Nowlin <dan.nowlin@intel.com>
Co-developed-by: Jie Wang <jie1x.wang@intel.com>
Signed-off-by: Jie Wang <jie1x.wang@intel.com>
Co-developed-by: Junfeng Guo <junfeng.guo@intel.com>
Signed-off-by: Junfeng Guo <junfeng.guo@intel.com>
Co-developed-by: Qi Zhang <qi.z.zhang@intel.com>
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
Co-developed-by: Ting Xu <ting.xu@intel.com>
Signed-off-by: Ting Xu <ting.xu@intel.com>
Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-11-06 14:19:43 -08:00
Aleksandr Loktionov
12ed3e5a03 ice: add flow parsing for GTP and new protocol field support
Introduce new protocol header types and field sizes to support GTPU, GTPC
tunneling protocols.

 - Add field size macros for GTP TEID, QFI, and other headers
 - Extend ice_flow_field_info and enum definitions
 - Update hash macros for new protocols
 - Add support for IPv6 prefix matching and fragment headers

This patch lays the groundwork for enhanced RSS and flow classification
capabilities.

Co-developed-by: Dan Nowlin <dan.nowlin@intel.com>
Signed-off-by: Dan Nowlin <dan.nowlin@intel.com>
Co-developed-by: Junfeng Guo <junfeng.guo@intel.com>
Signed-off-by: Junfeng Guo <junfeng.guo@intel.com>
Co-developed-by: Ting Xu <ting.xu@intel.com>
Signed-off-by: Ting Xu <ting.xu@intel.com>
Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-11-06 14:19:39 -08:00
Jakub Kicinski
380e6f3c7b Merge branch 'net-dsa-lantiq_gswip-add-support-for-maxlinear-gsw1xx-switch-family'
Daniel Golle says:

====================
net: dsa: lantiq_gswip: Add support for MaxLinear GSW1xx switch family

This patch series extends the existing lantiq_gswip DSA driver to
support the MaxLinear GSW1xx family of dedicated Ethernet switch ICs.
These switches are based on the same IP as the Lantiq/Intel GSWIP found
in VR9 and xRX MIPS router SoCs which are currently supported by the
lantiq_gswip driver, but they are dedicated ICs connected via MDIO
rather than built-in components of a SoC accessible via memory-mapped
I/O.

The series includes several improvements and refactoring to implement
support for GSW1xx switch ICs by reusing the existing lantiq_gswip
driver.

The GSW1xx family includes several variants:
 - GSW120: 4 ports, 2 PHYs, RGMII & SGMII/2500Base-X
 - GSW125: 4 ports, 2 PHYs, RGMII & SGMII/2500Base-X, industrial temperature
 - GSW140: 6 ports, 4 PHYs, RGMII & SGMII/2500Base-X
 - GSW141: 6 ports, 4 PHYs, RGMII & SGMII
 - GSW145: 6 ports, 4 PHYs, RGMII & SGMII/2500Base-X, industrial temperature

Key features implemented:
 - MDIO-based register access using regmap
 - Support for SGMII/1000Base-X/2500Base-X SerDes interfaces
 - Configurable RGMII delays via device tree properties
 - Configurable RMII clock direction
 - Energy Efficient Ethernet (EEE) support
 - enabling/disabling learning
====================

Link: https://patch.msgid.link/cover.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:20 -08:00
Daniel Golle
22335939ec net: dsa: add driver for MaxLinear GSW1xx switch family
Add driver for the MaxLinear GSW1xx family of Ethernet switch ICs which
are based on the same IP as the Lantiq/Intel GSWIP found in the Lantiq VR9
and Intel GRX MIPS router SoCs. The main difference is that instead of
using memory-mapped I/O to communicate with the host CPU these ICs are
connected via MDIO (or SPI, which isn't supported by this driver).
Implement the regmap API to access the switch registers over MDIO to allow
reusing lantiq_gswip_common for all core functionality.

The GSW1xx also comes with a SerDes port capable of 1000Base-X, SGMII and
2500Base-X, which can either be used to connect an external PHY or SFP
cage, or as the CPU port. Support for the SerDes interface is implemented
in this driver using the phylink_pcs interface.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Tested-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Link: https://patch.msgid.link/b567ec1b4beb08fd37abf18b280c56d5d8253c26.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:18 -08:00
Daniel Golle
c6230446b1 net: dsa: add tagging driver for MaxLinear GSW1xx switch family
Add support for a new DSA tagging protocol driver for the MaxLinear
GSW1xx switch family. The GSW1xx switches use a proprietary 8-byte
special tag inserted between the source MAC address and the EtherType
field to indicate the source and destination ports for frames
traversing the CPU port.

Implement the tag handling logic to insert the special tag on transmit
and parse it on receive.
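
The resulting frame layout looks roughly like this (illustrative, sizes in
bytes):

  | DA (6) | SA (6) | special tag (8) | EtherType (2) | payload ... |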

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Tested-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Link: https://patch.msgid.link/0e973ebfd9433c30c96f50670da9e9449a0d98f2.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:17 -08:00
Daniel Golle
e1bb4b36a7 dt-bindings: net: dsa: lantiq,gswip: add support for MaxLinear GSW1xx switches
Extend the Lantiq GSWIP device tree binding to also cover MaxLinear
GSW1xx switches which are based on the same hardware IP but connected
via MDIO instead of being memory-mapped.

Add compatible strings for MaxLinear GSW120, GSW125, GSW140, GSW141,
and GSW145 switches and adjust the schema to handle the different
connection methods with conditional properties.

Add MaxLinear GSW125 example showing MDIO-connected configuration.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://patch.msgid.link/fc96f1dedb2b418a63e69960356dde7f6eb86424.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:17 -08:00
Daniel Golle
cdef8e47b6 net: dsa: lantiq_gswip: allow adjusting MII delays
Currently the MII clk vs. data delay is configured based on the PHY
interface mode.

In addition to that add support for setting up MII delays using the
standard Device Tree properties 'tx-internal-delay-ps' and
'rx-internal-delay-ps', using the values determined by the PHY interface
mode as default to maintain backward compatibility with legacy device
trees.
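
A hypothetical port node using the new properties (example values only,
within the 0-3500 ps range supported by the hardware):

  port@5 {
          reg = <5>;
          phy-mode = "rgmii";
          tx-internal-delay-ps = <2000>;
          rx-internal-delay-ps = <2000>;
  };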

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/37203e831cff87dc46e5ef9e8cbd68fb8689773d.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:17 -08:00
Daniel Golle
bea0c17786 dt-bindings: net: dsa: lantiq,gswip: add support for MII delay properties
Add support for standard tx-internal-delay-ps and rx-internal-delay-ps
properties on port nodes to allow fine-tuning of RGMII clock delays.

The GSWIP switch hardware supports delay values in 500 picosecond
increments from 0 to 3500 picoseconds, with a post-reset default of 2000
picoseconds for both TX and RX delays. The driver currently sets the
delay to 0 in case the PHY is set up to carry out the delay, as indicated
by the corresponding interface modes ("rgmii-id", "rgmii-rxid", "rgmii-txid").

This corresponds to the driver changes that allow adjusting MII delays
using Device Tree properties instead of relying solely on the PHY
interface mode.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://patch.msgid.link/9e007d4f85c2c6d69e0b91f3663d99e0f6fc8eac.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:17 -08:00
Daniel Golle
319fd7e9d4 net: dsa: lantiq_gswip: add vendor property to setup MII refclk output
Read boolean Device Tree property "maxlinear,rmii-refclk-out" and switch
the RMII reference clock to be a clock output rather than an input if it
is set.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Tested-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Link: https://patch.msgid.link/947d14970f74f760e4a60c777aabee64e7e4f356.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:17 -08:00
Daniel Golle
e836824116 dt-bindings: net: dsa: lantiq,gswip: add MaxLinear RMII refclk output property
Add support for the maxlinear,rmii-refclk-out boolean property on port
nodes to configure the RMII reference clock to be an output rather than
an input.

This property is only applicable for ports in RMII mode and allows the
switch to provide the reference clock for RMII-connected PHYs instead
of requiring an external clock source.

This corresponds to the driver changes that read this Device Tree
property to configure the RMII clock direction.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://patch.msgid.link/9813bb916ecce9bae366e6c50c081014fe5371ea.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:16 -08:00
Daniel Golle
0c56a98560 net: dsa: lantiq_gswip: define and use GSWIP_TABLE_MAC_BRIDGE_VAL1_VALID
When adding FDB entries to the MAC bridge table on GSWIP 2.2 or later it
is necessary to set an (undocumented) bit to mark the entry as valid. If
this bit isn't set for entries in the MAC bridge table, then those entries
won't be considered valid MAC addresses.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/e02fe0d946c98920bc55b5f389a8f56382aae7df.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:16 -08:00
Daniel Golle
3e5ef3b170 net: dsa: lantiq_gswip: set link parameters also for CPU port
On standalone switch ICs the link parameters of the CPU port need to
be set up just like those of user ports. The distinction in the driver of
not carrying out link parameter setup for the CPU port only makes sense
for in-SoC switches on which the CPU port is internally connected to the
SoC's Ethernet MAC.
Set link parameters also for the CPU port unless it is an internal
interface. Note that the internal TP PHYs cannot be used as CPU ports
anyway, hence it doesn't matter that they are now also covered by that
condition.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Tested-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://patch.msgid.link/07c6b8d3a12296123be5e5938b454fc620f819e6.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:16 -08:00
Daniel Golle
9ec1fc0bf2 net: dsa: lantiq_gswip: support Energy Efficient Ethernet
Introduce support for Energy Efficient Ethernet (EEE) on hardware
version 2.2 or later.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/753e45acb25e185689ca1afd8a9bd0c199d1c15b.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:16 -08:00
Daniel Golle
a7d4b05f9d net: dsa: lantiq_gswip: support enable/disable learning
Switch API 2.2 or later supports enabling or disabling learning on each
port. Implement support for the BR_LEARNING bridge flag and announce
support for BR_LEARNING on GSWIP 2.2 or later.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://patch.msgid.link/0aa4621e01c998378ad5812464bc17d23aa3bf62.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:16 -08:00
Daniel Golle
322a1e6f3d net: dsa: lantiq_gswip: split into common and MMIO parts
Move all parts specific to the MMIO/SoC driver into a module of its own
to prepare for supporting MDIO-connected switch ICs.
Split gswip_probe() into a common function gswip_probe_common(), which
covers allocating, initializing and registering the DSA switch, while
keeping transport-specific regmap initialization as well as PHY firmware
loading in the new MMIO/SoC-specific gswip_probe() function.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Tested-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://patch.msgid.link/dc7da5b65ec220ba8e9bc4bd04fe1ed7de046656.1762170107.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 14:16:16 -08:00
Mohammad Heib
2c031d4c77 i40e: support generic devlink param "max_mac_per_vf"
Currently the i40e driver enforces its own internally calculated per-VF MAC
filter limit, derived from the number of allocated VFs and available
hardware resources. This limit is not configurable by the administrator,
which makes it difficult to control how many MAC addresses each VF may
use.

This patch adds support for the new generic devlink runtime parameter
"max_mac_per_vf" which provides administrators with a way to cap the
number of MAC addresses a VF can use:

- When the parameter is set to 0 (default), the driver continues to use
  its internally calculated limit.

- When set to a non-zero value, the driver applies this value as a strict
  cap for VFs, overriding the internal calculation.

Important notes:

- The configured value is a theoretical maximum. Hardware limits may
  still prevent additional MAC addresses from being added, even if the
  parameter allows it.

- Since MAC filters are a shared hardware resource across all VFs,
  setting a high value may cause resource contention and starve other
  VFs.

- This change gives administrators predictable and flexible control over
  VF resource allocation, while still respecting hardware limitations.

- Previous discussion about this change:
  https://lore.kernel.org/netdev/20250805134042.2604897-2-dhill@redhat.com
  https://lore.kernel.org/netdev/20250823094952.182181-1-mheib@redhat.com

Signed-off-by: Mohammad Heib <mheib@redhat.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Tested-by: Rafal Romanowski <rafal.romanowski@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-11-06 12:57:31 -08:00
Mohammad Heib
9352d40c8b devlink: Add new "max_mac_per_vf" generic device param
Add a new generic device parameter that controls the maximum
number of MAC filters allowed per VF.

For example, to limit a VF to 3 MAC addresses:
 $ devlink dev param set pci/0000:3b:00.0 name max_mac_per_vf \
        value 3 \
        cmode runtime

Signed-off-by: Mohammad Heib <mheib@redhat.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-11-06 12:57:31 -08:00
Dan Carpenter
c105e76bb1 hfs: fix potential use after free in hfs_correct_next_unused_CNID()
This code calls hfs_bnode_put(node), which drops the refcount, and then
dereferences "node" on the next line.  It's only safe to use "node"
while we're holding a reference, so flip these two lines around.
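
The pattern of the fix, sketched with a stand-in use() for the actual
dereference (illustrative only):

    /* Buggy order: "node" is dereferenced after its reference is dropped. */
    hfs_bnode_put(node);
    use(node);            /* potential use-after-free */

    /* Fixed order: finish using "node" first, then drop the reference. */
    use(node);
    hfs_bnode_put(node);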

Fixes: a06ec283e1 ("hfs: add logic of correcting a next unused CNID")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Viacheslav Dubeyko <slava@dubeyko.com>
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
Link: https://lore.kernel.org/r/aN-Xw8KnbSnuIcLk@stanley.mountain
Signed-off-by: Viacheslav Dubeyko <slava@dubeyko.com>
2025-11-06 11:07:16 -08:00
Jakub Kicinski
1ec9871fbb Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.18-rc5).

Conflicts:

drivers/net/wireless/ath/ath12k/mac.c
  9222582ec5 ("Revert "wifi: ath12k: Fix missing station power save configuration"")
  6917e268c4 ("wifi: ath12k: Defer vdev bring-up until CSA finalize to avoid stale beacon")
https://lore.kernel.org/11cece9f7e36c12efd732baa5718239b1bf8c950.camel@sipsolutions.net

Adjacent changes:

drivers/net/ethernet/intel/Kconfig
  b1d16f7c00 ("libie: depend on DEBUG_FS when building LIBIE_FWLOG")
  93f53db9f9 ("ice: switch to Page Pool")

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-06 09:27:40 -08:00
Zhang Yi
9dbf945320 ext4: add two trace points for moving extents
To facilitate tracking the length, type, and outcome of the move extent,
add a trace point at both the entry and exit of mext_move_extent().

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-13-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Zhang Yi
65097262f5 ext4: add large folios support for moving extents
Pass the moving extent length into mext_folio_double_lock() so that it
can acquire a higher-order folio if the length exceeds PAGE_SIZE. This
can speed up extent moving when the extent is larger than one page.
Additionally, remove the unnecessary comments from
mext_folio_double_lock().

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-12-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Zhang Yi
4589c4518f ext4: switch to using the new extent movement method
Now that we have mext_move_extent(), we can switch to this new interface
and deprecate move_extent_per_page(). First, after acquiring the
i_rwsem, we can directly use ext4_map_blocks() to obtain a contiguous
extent from the original inode as the extent to be moved. It is safe to
get the mapping information from the extent status tree without needing
to access the on-disk extent tree, because mext_move_extent() will check
the sequence cookie under the folio lock. Then, after populating the
mext_data structure, we call mext_move_extent() to move the extent.
Finally, the length of the extent will be adjusted in
mext.orig_map.m_len and the actual length moved is returned through
m_len.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-11-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Zhang Yi
962e8a01ea ext4: introduce mext_move_extent()
When moving extents, the current move_extent_per_page() process can only
move extents of length PAGE_SIZE at a time, which is highly inefficient;
especially when the fragmentation of the file is not particularly severe,
this results in a large number of unnecessary extent split and merge
operations. Moreover, since the ext4 file system now supports large
folios, using PAGE_SIZE as the processing unit is no longer practical.

Therefore, introduce a new move extents method, mext_move_extent(). It
moves one extent of the origin inode at a time, but not exceeding the
size of a folio. The parameters for the move are passed through the new
mext_data data structure, which includes the origin inode, donor inode,
the mapping extent of the origin inode to be moved, and the starting
offset of the donor inode.

The move process is similar to move_extent_per_page() and can be
categorized into three types: MEXT_SKIP_EXTENT, MEXT_MOVE_EXTENT, and
MEXT_COPY_DATA. MEXT_SKIP_EXTENT indicates that the corresponding area
of the donor file is a hole, meaning no actual space is allocated, so
the move is skipped. MEXT_MOVE_EXTENT indicates that the corresponding
areas of both the origin and donor files are unwritten, so no data needs
to be copied; only the extents are swapped. MEXT_COPY_DATA indicates
that the corresponding areas of both the origin and donor files contain
data, so data must be copied. The data copying is performed in three
steps: first, the data from the original location is read into the page
cache; then, the extents are swapped, and the page cache is rebuilt to
reflect the index of the physical blocks; finally, the dirty page cache
is marked and written back to ensure that the data is written to disk
before the metadata is persisted.

One important point to note is that the folio lock and i_data_sem are
held only during the moving process. Therefore, before moving an extent,
it is necessary to check whether the sequence cookie of the area to be
moved has changed while holding the folio lock. If a change is detected,
it indicates that concurrent write-back operations may have occurred
during this period, and the type of the extent to be moved can no longer
be considered reliable. For example, it may have changed from unwritten
to written. In such cases, return -ESTALE, and the calling function
should reacquire the move extent of the original file and retry the
movement.
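
A rough sketch of the three cases, with hypothetical helper names standing
in for the actual implementation:

    switch (type) {
    case MEXT_SKIP_EXTENT:          /* donor range is a hole: nothing to move */
            break;
    case MEXT_MOVE_EXTENT:          /* both ranges unwritten: swap extents only */
            swap_extents(orig, donor);
            break;
    case MEXT_COPY_DATA:            /* both ranges hold data */
            read_origin_into_page_cache(orig);
            swap_extents(orig, donor);
            rebuild_and_dirty_folio(orig);     /* re-point folio at new blocks */
            writeback_before_metadata(orig);
            break;
    }
    /* If the seq cookie changed under the folio lock, -ESTALE is returned
     * and the caller re-queries the origin extent and retries the move.
     */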

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-10-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Zhang Yi
37cb211f97 ext4: rename mext_page_mkuptodate() to mext_folio_mkuptodate()
mext_page_mkuptodate() no longer works on a single page, so rename it to
mext_folio_mkuptodate().

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-9-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Zhang Yi
57c1df07f1 ext4: refactor mext_check_arguments()
When moving extents, mext_check_validity() performs some basic file
system and file checks. However, some essential checks that need to be
performed after acquiring the i_rwsem are still scattered in
mext_check_arguments(). Move those checks into mext_check_validity() and
make it execute entirely under the i_rwsem to make the checks clearer.
Furthermore, rename mext_check_arguments() to mext_check_adjust_range(),
as it only performs checks and length adjustments on the move extent
range. Finally, also change the print message for the non-existent file
check to be consistent with the other unsupported-case checks.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-8-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Zhang Yi
22218516e4 ext4: add mext_check_validity() to do basic check
Currently, the basic validation checks during the move extent operation
are scattered across __ext4_ioctl() and ext4_move_extents(), which makes
the code somewhat disorganized. Introduce a new helper,
mext_check_validity(), to handle these checks. This change involves only
code relocation without any logical modifications.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-7-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Zhang Yi
c9570b6634 ext4: use EXT4_B_TO_LBLK() in mext_check_arguments()
Switch to using EXT4_B_TO_LBLK() to calculate the EOF position of the
origin and donor inodes, instead of using open-coded calculations.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-6-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Zhang Yi
07c440e8da ext4: pass out extent seq counter when mapping blocks
When creating or querying mapping blocks using the ext4_map_blocks() and
ext4_map_{query|create}_blocks() helpers, also pass out the extent
sequence number of the block mapping info through the ext4_map_blocks
structure. This sequence number can later serve as a valid cookie within
the iomap infrastructure and the move extents procedure.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-5-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Zhang Yi
7da5565cab ext4: make ext4_es_lookup_extent() pass out the extent seq counter
When querying extents in the extent status tree, we should hold the
data_sem if we want to obtain the sequence number as a valid cookie
simultaneously. However, currently, ext4_map_blocks() calls
ext4_es_lookup_extent() without holding data_sem. Therefore, we should
acquire i_es_lock instead, which also ensures that the sequence cookie
and the extent remain consistent. Consequently, make
ext4_es_lookup_extent() pass out the sequence number when necessary.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-4-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Zhang Yi
dd064d5101 ext4: introduce seq counter for the extent status entry
In iomap_write_iter(), the iomap buffered write framework does not hold
any locks between querying the inode extent mapping info and performing
page cache writes. As a result, the extent mapping can be changed by
concurrent I/O in flight. Similarly, in iomap_writepage_map(), the
write-back process faces the same problem: concurrent changes can
invalidate the extent mapping before the I/O is submitted.

Therefore, both of these processes must recheck the mapping info after
acquiring the folio lock. To address this, similar to XFS, we propose
introducing an extent sequence number to serve as a validity cookie for
the extent. After commit 24b7a2331f ("ext4: clairfy the rules for
modifying extents"), we can ensure the extent information should always
be processed through the extent status tree, and the extent status tree
is always uptodate under i_rwsem or invalidate_lock or folio lock, so
it's safe to introduce this sequence number. The sequence number will be
increased whenever the extent status tree changes, preparing for the
buffered write iomap conversion.

Besides, this mechanism is also applicable for the moving extents case.
In move_extent_per_page(), it also needs to reacquire data_sem and check
the mapping info again under the folio lock.
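
The cookie pattern, sketched with an illustrative ext4_es_seq() accessor
and retry label (not the actual interface):

    retry:
            seq = ext4_es_seq(inode);        /* sample the cookie with the mapping */
            query_mapping(inode, &map);      /* lockless extent query */
            folio_lock(folio);
            if (ext4_es_seq(inode) != seq) { /* extent status tree changed meanwhile */
                    folio_unlock(folio);
                    goto retry;
            }
            /* the mapping is still valid here; proceed under the folio lock */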

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-3-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Zhang Yi
a2e5a3cea4 ext4: correct the checking of quota files before moving extents
The move extent operation should return -EOPNOTSUPP if any of the inodes
is a quota inode, rather than requiring both to be quota inodes.
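
The fix amounts to rejecting the operation when either inode is a quota
file; a minimal sketch (inode variable names assumed):

    if (ext4_is_quota_file(orig_inode) || ext4_is_quota_file(donor_inode))
            return -EOPNOTSUPP;     /* previously only hit when both were quota files */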

Fixes: 02749a4c20 ("ext4: add ext4_is_quota_file()")
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Message-ID: <20251013015128.499308-2-yi.zhang@huaweicloud.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:44:39 -05:00
Ranganath V N
6640d55218 fs: ext4: fix uninitialized symbols
Fix the issues detected by the smatch tool.

fs/ext4/inode.c:3583 ext4_map_blocks_atomic_write_slow() error: uninitialized symbol 'next_pblk'.
fs/ext4/namei.c:1776 ext4_lookup() error: uninitialized symbol 'de'.
fs/ext4/namei.c:1829 ext4_get_parent() error: uninitialized symbol 'de'.
fs/ext4/namei.c:3162 ext4_rmdir() error: uninitialized symbol 'de'.
fs/ext4/namei.c:3242 __ext4_unlink() error: uninitialized symbol 'de'.
fs/ext4/namei.c:3697 ext4_find_delete_entry() error: uninitialized symbol 'de'.

These changes enhance code clarity and address static analysis tool errors.

Signed-off-by: Ranganath V N <vnranganath.20@gmail.com>
Message-ID: <20251011063830.47485-1-vnranganath.20@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:34:20 -05:00
Manish Dharanenthiran
2977567b24 wifi: ath12k: Fix timeout error during beacon stats retrieval
Currently, for beacon_stats, ath12k_mac_get_fw_stats() is called
for each started BSS on the specified hardware.
ath12k_mac_get_fw_stats() will wait for the fw_stats_done completion
after fetching the requested data from firmware. For the beacon_stats,
fw_stats_done completion will be set only when stats are received for
all BSSes. However, for other stats like vdev_stats or pdev_stats, there
is one request to the firmware for all enabled BSSes. Since beacon_stats
is fetched individually for all BSSes enabled in that pdev, waiting for
the completion event results in a timeout error when multiple BSSes are
enabled.

Avoid this by completing the fw_stats_done immediately after
updating the requested BSS's beacon stats in the list. Subsequently,
this list will be used to display the beacon stats for all enabled
BSSes in the requested pdev.

Additionally, remove 'num_bcn_recvd' from the ath12k_fw_stats struct
as it is no longer needed.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1

Fixes: 9fe4669ae9 ("wifi: ath12k: Request beacon stats from firmware")
Signed-off-by: Manish Dharanenthiran <manish.dharanenthiran@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251031-beacon_stats-v1-2-f52fce7b03ac@qti.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-11-06 07:33:31 -08:00
Manish Dharanenthiran
bd6ec8111e wifi: ath12k: Make firmware stats reset caller-driven
Currently, ath12k_fw_stats_reset() is called in
ath12k_mac_get_fw_stats() before fetching the required stats from the
firmware. However, ath12k_open_bcn_stats() requests firmware stats for
each enabled BSS individually. Since the firmware stats are reset before
fetching, only the last BSS's data is displayed.

Also, in ath12k_mac_op_get_txpower(), ath12k_mac_op_sta_statistics(), and
ath12k_mac_op_link_sta_statistics(), after getting the stats from the
firmware, the reset function is not called until the next firmware
stats are requested or while unloading the module. Hence, the stats buffer
will not be freed until one of the above sequences is executed. However,
in ath12k_open_vdev_stats(), ath12k_open_bcn_stats() and
ath12k_open_pdev_stats(), firmware stats are reset after copying the
necessary data in ath12k_wmi_fw_stats_dump().

This leads to inconsistent usage of ath12k_fw_stats_reset() for
freeing the firmware stats.

Avoid these discrepancies by making it the caller's responsibility to
free the stats buffer, thereby removing the need to free the stats buffer
in ath12k_mac_get_fw_stats() and ath12k_wmi_fw_stats_dump().

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1

Signed-off-by: Manish Dharanenthiran <manish.dharanenthiran@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251031-beacon_stats-v1-1-f52fce7b03ac@qti.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-11-06 07:33:31 -08:00
Julian Sun
ce3236a3c7 ext4: make error code in __ext4fs_dirhash() consistent.
Currently __ext4fs_dirhash() returns -1 (-EPERM) if fscrypt doesn't
have the encryption key, which may confuse users. Make the error code
here consistent with the existing error codes.

Signed-off-by: Julian Sun <sunjunchao@bytedance.com>
Message-ID: <20251010095257.3008275-1-sunjunchao@bytedance.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2025-11-06 10:32:33 -05:00
Pavan Kumar Linga
13068e9d57 idpf: add support for IDPF PCI programming interface
At present IDPF supports only 0x1452 and 0x145C as PF and VF device IDs
on our current generation hardware. Future hardware exposes a new set of
device IDs for each generation. To avoid adding a new device ID for each
generation and to make the driver forward and backward compatible,
make use of the IDPF PCI programming interface to load the driver.

Write and read the VF_ARQBAL mailbox register to find if the current
device is a PF or a VF.
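
One way such a probe-time check could look, sketched with illustrative
names and an assumed detection direction (the real register offset, test
pattern, and logic live in the driver):

    static bool idpf_dev_is_vf(void __iomem *hw_addr)
    {
            u32 val;

            /* VF_ARQBAL_OFFSET and IDPF_TEST_PATTERN are placeholders here. */
            writel(IDPF_TEST_PATTERN, hw_addr + VF_ARQBAL_OFFSET);
            val = readl(hw_addr + VF_ARQBAL_OFFSET);

            return val == IDPF_TEST_PATTERN;    /* assumed: register present => VF */
    }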

PCI SIG allocated a new programming interface for the IDPF compliant
ethernet network controller devices. It can be found at:
https://members.pcisig.com/wg/PCI-SIG/document/20113
in the document titled 'PCI Code and ID Assignment Revision 1.16'
or any later revision.

Tested this patch by doing a simple driver load/unload on Intel IPU E2000
hardware which supports 0x1452 and 0x145C device IDs and new hardware
which supports the IDPF PCI programming interface.

Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Pavan Kumar Linga <pavan.kumar.linga@intel.com>
Signed-off-by: Madhu Chittim <madhu.chittim@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Marek Landowski <marek.landowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://patch.msgid.link/20251103224631.595527-1-anthony.l.nguyen@intel.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-06 15:41:05 +01:00
Aswin Karuvally
0cc4b84615 s390/ctcm: Use info level for handshake UC_RCRESET
CTC adapter throws CTC_EVENT_UC_RCRESET (Unit check remote reset event)
during initial handshake, if the peer is not ready yet. This causes the
ctcm driver to re-attempt the handshake.

As it is normal for the event to occur during initialization, use info
instead of warn level in kernel log and NOTICE instead of ERROR level
in s390 debug feature. Also reword the log message for clarity.

Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
Signed-off-by: Aswin Karuvally <aswin@linux.ibm.com>
Link: https://patch.msgid.link/20251103101652.2349855-1-aswin@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-06 15:24:13 +01:00
Paolo Abeni
f47b0c1182 Merge branch 'amd-xgbe-introduce-support-for-ethtool-selftests'
Raju Rangoju says:

====================
amd-xgbe: introduce support for ethtool selftests

This patch series introduces support for ethtool selftests, which helps
find HW misconfiguration. It makes use of the network selftest packet
creation infrastructure.

Supports the following tests:

 - MAC loopback selftest
 - PHY loopback selftest
 - Split header selftest
 - Jumbo frame selftest
====================

Link: https://patch.msgid.link/20251031111555.774425-1-Raju.Rangoju@amd.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-06 13:38:13 +01:00
Raju Rangoju
9c11b6b1ab amd-xgbe: add ethtool jumbo frame selftest
Add support for the jumbo frame selftest. It works only for
MTU sizes greater than 1500.

Signed-off-by: Raju Rangoju <Raju.Rangoju@amd.com>
Link: https://patch.msgid.link/20251031111555.774425-5-Raju.Rangoju@amd.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-06 13:38:11 +01:00
Raju Rangoju
d7735c6bb2 amd-xgbe: add ethtool split header selftest
Add support for the ethtool split header selftest. Perform
UDP and TCP checks to ensure the split header selftest works
for both packet types.

Signed-off-by: Raju Rangoju <Raju.Rangoju@amd.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251031111555.774425-4-Raju.Rangoju@amd.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-06 13:38:11 +01:00
Raju Rangoju
42b06fcc87 amd-xgbe: add ethtool phy loopback selftest
Add support for PHY loopback testing via ethtool self-test.
The test uses phy_loopback(), which enables PHY-level loopback
through the PHY driver's set_loopback callback if provided,
and otherwise falls back to genphy_loopback().

Signed-off-by: Raju Rangoju <Raju.Rangoju@amd.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251031111555.774425-3-Raju.Rangoju@amd.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-06 13:38:11 +01:00
Raju Rangoju
862a64c83f amd-xgbe: introduce support ethtool selftest
Add support for an ethtool selftest for MAC loopback. This includes a
sanity check and helps find HW misconfiguration. It uses the existing
selftest infrastructure to create test packets.

Signed-off-by: Raju Rangoju <Raju.Rangoju@amd.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251031111555.774425-2-Raju.Rangoju@amd.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-06 13:38:11 +01:00
Raju Rangoju
6b47af35a6 net: selftests: export packet creation helpers for driver use
Export the network selftest packet creation infrastructure to allow
network drivers to reuse the existing selftest framework instead of
duplicating packet creation code.

Signed-off-by: Raju Rangoju <Raju.Rangoju@amd.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251031111811.775434-1-Raju.Rangoju@amd.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-06 13:38:11 +01:00
Fedor Pchelkin
d5da3d9fb0 wifi: rtw89: process TX wait skbs for USB via C2H handler
TX wait skbs need to be completed when they are done.  The PCIe part does
this inside rtw89_pci_tx_status() during RPP processing.  Other HCIs use a
mechanism based on C2H firmware messages.

Store TX wait skbs inside the TX report queue so that it'll be possible to
identify completed items inside the C2H handler.  Try to do this as
similarly to the PCIe path as possible.  When the corresponding TX wait
skb is found inside the TX report queue, unlink it from there and call
rtw89_core_tx_wait_complete() to mark the completion.

If the callee waiting for the completion has already timed out, the TX
wait skb is placed into TX wait list (like PCIe part does).

It's important that during HCI reset all pending TX wait frames should be
completed inside hci.ops->reset method before calling
rtw89_tx_wait_list_clear().

Found by Linux Verification Center (linuxtesting.org).

Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251104135720.321110-11-pchelkin@ispras.ru
2025-11-06 14:34:11 +08:00
Fedor Pchelkin
cc7070e417 wifi: rtw89: provide TX reports for management frames
In order to provide TX reports for the management queue, rtw89 should
configure the firmware.  Do this with SET_CMC_TBL_MGQ_RPT_EN() for the
WiFi6 chips and with the CCTLINFO_G7_W0_MGQ_RPT_EN flag for the WiFi7
ones.

Suggested-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251104135720.321110-10-pchelkin@ispras.ru
2025-11-06 14:33:56 +08:00
Fedor Pchelkin
816e849ef8 wifi: rtw89: handle IEEE80211_TX_CTL_REQ_TX_STATUS frames for USB
Frames flagged with IEEE80211_TX_CTL_REQ_TX_STATUS mean the driver has to
report to the mac80211 stack whether the AP sent an ACK for the null
frame/probe request or not.  It's not implemented in the USB part of the
driver yet.

PCIe HCI has its own way of getting TX status, incorporated into the RPP
feature, and it's always enabled there.  Other HCIs need a different
scheme based on processing C2H messages.

Thus define a .tx_rpt_enabled flag indicating which HCIs need to enable a
TX report feature.  Currently it is USB only.

Toggle a bit in the TX descriptor and place flagged skbs in a fixed-size
queue to wait for a message from the firmware.  The firmware maintains a
4-bit sequence number for the required frames, hence the queue can contain
just 16 elements simultaneously.  That's enough for normal driver /
firmware communication.  If the firmware crashes for any reason and
doesn't provide TX reports in time, the driver will handle this and report
the obsolete frames as dropped.

rtw89 also has a new feature providing a TX report for each transmission
attempt.  Ignore a failed TX status reported by the firmware until retry
limit is reached or successful status appears.  When there is no success
and the retry limit is reached, report the frame up to the wireless stack
as failed eventually.

HCI reset should stop all pending TX activity so forcefully flush the
queue there.

Found by Linux Verification Center (linuxtesting.org).

Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251104135720.321110-9-pchelkin@ispras.ru
2025-11-06 14:32:14 +08:00
Fedor Pchelkin
26a42d804a wifi: rtw89: usb: anchor TX URBs
During HCI reset all pending TX URBs should be canceled.  Use an anchor
to keep track of them and gain the ability to cancel them synchronously.
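
A minimal sketch of the anchor usage (field names assumed; the USB anchor
API itself is standard):

    init_usb_anchor(&rtwusb->tx_anchor);            /* once, during setup */

    usb_anchor_urb(urb, &rtwusb->tx_anchor);        /* before submission */
    ret = usb_submit_urb(urb, GFP_ATOMIC);
    if (ret)
            usb_unanchor_urb(urb);

    usb_kill_anchored_urbs(&rtwusb->tx_anchor);     /* in the HCI reset path */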

Note however that canceling RX URBs can't be done here in
rtw89_usb_ops_reset() as it breaks driver initialization.

Found by Linux Verification Center (linuxtesting.org).

Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251104135720.321110-8-pchelkin@ispras.ru
2025-11-06 14:32:03 +08:00
Ping-Ke Shih
21b9461040 wifi: rtw89: fill TX descriptor of FWCMD in shortcut
TX type FWCMD is used to download firmware and send H2C commands, and
it only fills a few fields of the TX descriptor, such as
desc_info->pkt_size. Therefore, return early for the FWCMD TX type.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Link: https://patch.msgid.link/20251104135720.321110-7-pchelkin@ispras.ru
2025-11-06 14:31:50 +08:00
Fedor Pchelkin
c33c6a1b6f wifi: rtw89: implement C2H TX report handler
rtw89 has several ways of handling TX status report events.  The first one
is based on RPP feature which is used by PCIe HCI.  The other one depends
on firmware sending a corresponding C2H message, quite similar to what
rtw88 has.

Toggle a bit in the TX descriptor to indicate to the firmware that a TX
report for the frame is expected.  This will allow handling TX wait skbs
and the ones flagged with IEEE80211_TX_CTL_REQ_TX_STATUS correctly.

Do the bulk of the patch according to the vendor driver for RTL8851BU.
However, there are slight differences in C2H message format between
different types of chips.  RTL885xB ones follow format V0.  RTL8852C has
format V1, and RTL8922AU has format V2.

Found by Linux Verification Center (linuxtesting.org).

Suggested-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251104135720.321110-6-pchelkin@ispras.ru
2025-11-06 14:30:13 +08:00
Fedor Pchelkin
8986bafb09 wifi: rtw89: refine rtw89_core_tx_wait_complete()
Pass the TX status value directly into rtw89_core_tx_wait_complete().
This keeps it in sync with further patches and gives flexibility for
future work.  Also use scope-based RCU locking, which simplifies the
code of the function.
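
Scope-based RCU locking here refers to the guard() helpers from
<linux/cleanup.h> / <linux/rcupdate.h>; an illustrative example of the
pattern (not the rtw89 code):

    struct item { int active; };
    static struct item __rcu *items[16];

    static bool item_is_active(int idx)
    {
            guard(rcu)();   /* rcu_read_lock(), dropped automatically on return */

            struct item *it = rcu_dereference(items[idx]);

            return it && READ_ONCE(it->active);
    }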

Found by Linux Verification Center (linuxtesting.org).

Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251104135720.321110-5-pchelkin@ispras.ru
2025-11-06 14:28:42 +08:00
Fedor Pchelkin
45a6a88f01 wifi: rtw89: usb: use ieee80211_free_txskb() where appropriate
rtw89_usb_ops_tx_kick_off() may need to release skb if a failure occurs.
It operates mainly on skbs coming from the core wireless stack and the
ones containing firmware commands.

Use ieee80211_free_txskb() for the former case.

Suggested-by: Ping-Ke Shih <pkshih@realtek.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251104135720.321110-4-pchelkin@ispras.ru
2025-11-06 14:28:34 +08:00
Fedor Pchelkin
7543818e97 wifi: rtw89: usb: fix leak in rtw89_usb_write_port()
When there is an attempt to write data and RTW89_FLAG_UNPLUGGED is set,
the device is disconnected and no URB is submitted.  Return an
appropriate error code to the caller so that it can properly free the
allocated resources.

Found by Linux Verification Center (linuxtesting.org).

Fixes: 2135c28be6 ("wifi: rtw89: Add usb.{c,h}")
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251104135720.321110-3-pchelkin@ispras.ru
2025-11-06 14:28:25 +08:00
Fedor Pchelkin
28a4557528 wifi: rtw89: usb: use common error path for skbs in rtw89_usb_rx_handler()
Allow adding rx_skb to rx_free_queue for later reuse on the common error
handling path, otherwise free it.

Found by Linux Verification Center (linuxtesting.org).

Fixes: 2135c28be6 ("wifi: rtw89: Add usb.{c,h}")
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251104135720.321110-2-pchelkin@ispras.ru
2025-11-06 14:26:48 +08:00
Shangjuan Wei
0567c84d68 dt-bindings: ethernet: eswin: fix yaml schema issues
The eswin,hsp-sp-csr attribute is a single phandle with multiple
arguments, so the syntax should be of the form:
 items:
   - items:
       - description: ...
       - description: ...
       - description: ...
       - description: ...

To align with the description of the 'eswin-sp-csr'
attribute in the mmc and usb modules, the description
of the 'eswin,hsp-sp-csr' attribute has been modified.

Fixes: 888bd0eca9 ("dt-bindings: ethernet: eswin: Document for EIC7700 SoC")
Reported-by: Rob Herring (Arm) <robh@kernel.org>
Closes: https://lore.kernel.org/all/176096011380.22917.1988679321096076522.robh@kernel.org/
Signed-off-by: Shangjuan Wei <weishangjuan@eswincomputing.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://patch.msgid.link/20251104073305.299-1-weishangjuan@eswincomputing.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-05 20:00:29 -08:00
Jakub Kicinski
9158447f09 Merge branch 'net-stmmac-socfpga-add-agilex5-platform-support-and-enhancements'
Rohan G Thomas says:

====================
net: stmmac: socfpga: Add Agilex5 platform support and enhancements

This patch series adds support for the Agilex5 EMAC platform to the
dwmac-socfpga driver.

The series includes:
   - Platform configuration for Agilex5 EMAC
   - Enabling Time-Based Scheduling (TBS) for Tx queues 6 and 7
   - Enabling TCP Segmentation Offload (TSO)
   - Adding hardware-supported cross timestamping using the SMTG IP,
     allowing precise synchronization between MAC and system time via
     PTP_SYS_OFFSET_PRECISE.
====================

Link: https://patch.msgid.link/20251101-agilex5_ext-v2-0-a6b51b4dca4d@altera.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-05 18:35:16 -08:00
Rohan G Thomas
fd8c4f6454 net: stmmac: socfpga: Add hardware supported cross-timestamp
Cross timestamping is supported on the Agilex5 platform with the
Synchronized Multidrop Timestamp Gathering (SMTG) IP. The hardware
cross-timestamp result is made available to applications through the
PTP_SYS_OFFSET_PRECISE ioctl, which in turn calls stmmac_getcrosststamp().

Device time is stored in the MAC Auxiliary register. The 64-bit system
time (ARM_ARCH_COUNTER) is stored in the SMTG IP, which is an MDIO device
whose MDIO register space 0xC - 0xF holds the 64-bit system time.
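
A small userspace sketch of retrieving the cross-timestamp through that
ioctl (standard PTP UAPI; the /dev/ptp0 path is just an example):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/ptp_clock.h>

    int main(void)
    {
            struct ptp_sys_offset_precise off = {};
            int fd = open("/dev/ptp0", O_RDWR);

            if (fd < 0 || ioctl(fd, PTP_SYS_OFFSET_PRECISE, &off) < 0)
                    return 1;

            printf("device %lld.%09u  system %lld.%09u\n",
                   (long long)off.device.sec, off.device.nsec,
                   (long long)off.sys_realtime.sec, off.sys_realtime.nsec);
            return 0;
    }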

This commit is similar to following commit for Intel platforms:
Commit 341f67e424 ("net: stmmac: Add hardware supported cross-timestamp")

Signed-off-by: Rohan G Thomas <rohan.g.thomas@altera.com>
Link: https://patch.msgid.link/20251101-agilex5_ext-v2-4-a6b51b4dca4d@altera.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-05 18:35:14 -08:00
Rohan G Thomas
e28988aef7 net: stmmac: socfpga: Enable TSO for Agilex5 platform
Agilex5 supports TCP Segmentation Offload (TSO). This commit enables
TSO for Agilex5 socfpga platforms.

Signed-off-by: Rohan G Thomas <rohan.g.thomas@altera.com>
Link: https://patch.msgid.link/20251101-agilex5_ext-v2-3-a6b51b4dca4d@altera.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-05 18:35:14 -08:00
Rohan G Thomas
4c00476d44 net: stmmac: socfpga: Enable TBS support for Agilex5
Agilex5 supports Time-Based Scheduling (TBS) for Tx queue 6 and Tx
queue 7. This commit enables TBS support for these queues.

Signed-off-by: Rohan G Thomas <rohan.g.thomas@altera.com>
Link: https://patch.msgid.link/20251101-agilex5_ext-v2-2-a6b51b4dca4d@altera.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-05 18:35:14 -08:00
Rohan G Thomas
93d46ea3e9 net: stmmac: socfpga: Agilex5 EMAC platform configuration
Agilex5 HPS EMAC uses the dwxgmac-3.10a IP, unlike previous socfpga
platforms which use dwmac1000 IP. Due to differences in platform
configuration, Agilex5 requires a distinct setup.

Introduce a setup_plat_dat() callback in socfpga_dwmac_ops to handle
platform-specific setup. This callback is invoked before
stmmac_dvr_probe() to ensure the platform data is correctly
configured. Also, implement separate setup_plat_dat() callbacks for the
current socfpga platforms and for Agilex5.

Signed-off-by: Rohan G Thomas <rohan.g.thomas@altera.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251101-agilex5_ext-v2-1-a6b51b4dca4d@altera.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-05 18:35:14 -08:00
Jakub Kicinski
9b73cdad58 Merge tag 'wireless-next-2025-11-05' of https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next
Johannes Berg says:

====================
More changes from drivers are coming in, notably:

 - ath10k: factory test support
 - ath11k: TX power insertion support
 - ath12k: BSS color change support
 - iwlwifi: new sniffer API support

* tag 'wireless-next-2025-11-05' of https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next: (63 commits)
  wifi: ath10k: use = {} to initialize bmi_target_info instead of memset
  wifi: ath10k: use = {} to initialize pm_qos_request instead of memset
  wifi: ath12k: unassign arvif on scan vdev create failure
  wifi: ath12k: enforce vdev limit in ath12k_mac_vdev_create()
  wifi: ath12k: Set EHT fixed rates for associated STAs
  wifi: ath12k: add EHT rates to ath12k_mac_op_set_bitrate_mask()
  wifi: ath12k: Add EHT fixed GI/LTF
  wifi: ath12k: Add EHT MCS/NSS rates to Peer Assoc
  wifi: ath12k: add EHT rate handling to existing set rate functions
  wifi: ath12k: generalize GI and LTF fixed rate functions
  wifi: ath12k: fix error handling in creating hardware group
  wifi: ath12k: fix reusing m3 memory
  wifi: ath12k: fix potential memory leak in ath12k_wow_arp_ns_offload()
  wifi: iwlwifi: mld: add null check for kzalloc() in iwl_mld_send_proto_offload()
  wifi: iwlwifi: mld: check for NULL pointer after kmalloc
  wifi: iwlwifi: cfg: fix a few device names
  wifi: iwlwifi: mld: Move EMLSR prints to IWL_DL_EHT
  wifi: iwlwifi: disable EHT if the device doesn't allow it
  wifi: iwlwifi: bump core version for BZ/SC/DR
  wifi: iwlwifi: mld: use FW_CHECK on bad ROC notification
  ...
====================

Link: https://patch.msgid.link/20251105153537.54096-38-johannes@sipsolutions.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-05 18:09:23 -08:00
Dan Carpenter
c79a022524 net: dsa: microchip: Fix a link check in ksz9477_pcs_read()
The BMSR_LSTATUS define is 0x4 but the "p->phydev.link" variable
is a 1-bit bitfield in a u32.  Since 4 doesn't fit in the 0-1 range,
".link" is always set to false.  Add a !! to fix this.
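
The described fix boils down to normalizing the masked value before
assigning it to the 1-bit field (the local variable name is assumed):

    /* Before: BMSR_LSTATUS (0x4) gets truncated to 0 in the 1-bit field. */
    p->phydev.link = val & BMSR_LSTATUS;

    /* After: normalize to 0/1 first. */
    p->phydev.link = !!(val & BMSR_LSTATUS);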

[Jakub: According to Maxime the phydev struct isn't really
used and we should consider removing it completely. So not
treating this as a fix.]

Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Link: https://patch.msgid.link/aQSz_euUg0Ja8ZaH@stanley.mountain
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-05 17:58:51 -08:00
Alexei Starovoitov
b54a8e130c Merge branch 'bpf-indirect-jumps'
Anton Protopopov says:

====================
BPF indirect jumps

This patchset implements a new type of map, instruction set, and uses
it to build support for indirect branches in BPF (on x86). (The same
map will be later used to provide support for indirect calls and static
keys.) See [1], [2] for more context.

Short table of contents:

  * Patches 1-6 implement the new map of type
    BPF_MAP_TYPE_INSN_SET and corresponding selftests. This map can
    be used to track the "original -> xlated -> jitted mapping" for
    a given program.

  * Patches 7-12 implement the support for indirect jumps on x86 and add libbpf
    support for LLVM-compiled programs containing indirect jumps, and selftests.

The jump table support was merged into LLVM and can now be
enabled with -mcpu=v4, see [3]. The __BPF_FEATURE_GOTOX
macro can be used to check whether the compiler supports the
feature.

See individual patches for more details on the implementation details.

v10 -> v11 (this series):

  * rearranged patches and split libbpf patch such that first 6 patches
    implementing instruction arrays can be applied independently

  * instruction arrays:
    * move [fake] aux->used_maps assignment in this patch

  * indirect jumps:
    * call clear_insn_aux_data before bpf_remove_insns (AI)

  * libbpf:
    * remove the relocations check after the new LLVM is released (Eduard, Yonghong)
    * libbpf: fix an index printed in pr_warn (AI)

  * selftests:
    * protect programs triggered by nanosleep from fake runs (Eduard)
    * patch verifier_gotox to not emit .rel.jumptables

v9 -> v10 (https://lore.kernel.org/bpf/20251102205722.3266908-1-a.s.protopopov@gmail.com/T/#t):

  * Three bugs were noticed by AI in v9 (two old, one introduced by v9):
    * [new] insn_array_alloc_size could overflow u32, switched to u64 (AI)
    * map_ptr should be compared in regsafe for PTR_TO_INSN (AI)
    * duplicate elements were copied in jt_from_map (AI)

  * added a selftest in verifier_gotox with a jump table containing non-unique entries

v8 -> v9 (https://lore.kernel.org/bpf/20251101110717.2860949-1-a.s.protopopov@gmail.com/T/#t):

  * instruction arrays:
    * remove the size restriction of 256 elements
    * add a comments about addrs usage, old and new (Alexei)

  * libbpf:
    * properly prefix warnings (Andrii)
    * cast j[t] to long long for printf and some other minor cleanups (Andrii)

  * selftests:
    * use __BPF_FEATURE_GOTOX in selftests and skip tests if it's not set (Eduard)
    * fix a typo in a selftest assembly (AI)

v7 -> v8 (https://lore.kernel.org/bpf/20251028142049.1324520-1-a.s.protopopov@gmail.com/T/#u):

  * instruction arrays:
    * simplify the bpf_prog_update_insn_ptrs function (Eduard)
    * remove a semicolon after a function definition (AI)

  * libbpf:
    * add a proper error path in libbpf patch (AI)
    * re-re-factor the create_jt_map & find_subprog_idx (Eduard)

  * selftests:
    * verifier_gotox: add a test for a jump table pointing to outside of a subprog (Eduard)
    * used test__skip instead of just running an empty test
    * split tests in bpf_gotox into subtests for convenience

  * random:
    * drop the docs commit for now

v6 -> v7 (https://lore.kernel.org/bpf/20251026192709.1964787-1-a.s.protopopov@gmail.com/T/#t):

  * rebased and dropped already merged commits

  * instruction arrays
    * use jit_data to find mappings from insn to jit (Alexei)
    * alloc `ips` as part of the main allocation (Eduard)
    * the `jitted_ip` member wasn't actually used (Eduard)
    * remove the bpf_insn_ptr structure, which is not needed for this patch

  * indirect jumps, kernel:
    * fix a memory leak in `create_jt` (AI)
    * use proper reg+8*ereg in `its_static_thunk` (AI)
    * some minor cleanups (Eduard)

  * indirect jumps, libbpf:
    * refactor the `jt_adjust_off()` piece (Edurad)
    * move "JUMPTABLES_SEC" into libbpf_internal.h (Eduard)
    * remove an unnecessary if (Eduard)

  * verifier_gotox: add tests to verify that `gotox rX` works with all registers

v5 -> v6 (https://lore.kernel.org/bpf/20251019202145.3944697-1-a.s.protopopov@gmail.com/T/#u):

  * instruction arrays:
    * better document `struct bpf_insn_array_value` (Eduard)
    * remove a condition in `bpf_insn_array_adjust_after_remove` (Eduard)
    * make userspace see original, xlated, and jitted indexes (+original) (Eduard)

  * indirect jumps, kernel:
    * reject writes to the map
    * reject unaligned ops
    * add a check what `w` is not outside the program in check_config for `gotox` (Eduard)
    * do not introduce unneeded `bpf_find_containing_subprog_idx`
    * simplify error processing for `bpf_find_containing_subprog` (Eduard)
    * add `insn_state |= DISCOVERED` when it's discovered (Eduard)
    * support SUB operations on PTR_TO_INSN (Eduard)
    * make `gotox_tmp_buf` a bpf_iarray and use helper to relocate it (Eduard)
    * rename fields of `bpf_iarray` to more generic (Eduard)
    * re-implement `visit_gotox_insn` in a loop (Eduard)
    * some minor cleanups (Eduard)

  * libbpf:
    * `struct reloc_desc`: add a comment about `union` (Eduard)
    * rename parameters of (and one other place in code) `{create,add}_jt_map` to `sym_off` (Eduard)
    * `create_jt_map`: check that size/off are 8-byte aligned (Eduard)

  * Selftests:
    * instruction array selftests:
      * only run tests on x86_64
      * write a more generic function to test things to reduce code (Eduard)
      * errno wasn't used in checks, so don't reset it (Eduard)
      * print `i`, `xlated_off` and `map_out[i]` here (Eduard)
    * added `verifier_gotox` selftests which do not depend on LLVM:
    * disabled `bpf_gotox` tests by default

  * other changes:
    * remove an extra function in bpf disasm (Eduard)
    * some minor cleanups in the insn_successors patch (Eduard)
    * update documentation in `Documentation/bpf/linux-notes.html` about jumps, now it is supported :)

v3 -> v4 -> v5 (https://lore.kernel.org/bpf/20250930125111.1269861-1-a.s.protopopov@gmail.com/):

  * [v4 -> v5] rebased on top of the last bpf-next/master

  * instruction arrays:
    * add copyright (Alexei)
    * remove mutexes, add frozen back (Alexei)
    * setup 1:1 prog-map correspondence using atomic_xchg
    * do not copy/paste array_map_get_next_key, add a common helper (Alexei)
    * misc minor code cleanups (Alexei)

  * indirect jumps, kernel side:
    * remove jt_allocated, just check if insn is gotox (Eduard)
    * use copy_register_state instead of individual copies (Eduard)
    * in push_stack is_speculative should be inherited (Eduard)
    * a few cleanups for insn_successors, including omitting error path (Eduard)
    * check if reserved fields are used when considering `gotox` instruction (Eduard)
    * read size and alignment of read from insn_array should be 8 (Eduard)
    * put buffer for sorting in subfun info and realloc to grow as needed (Eduard)
    * properly do `jump_point` / `prune_point` from `push_gotox_edge` (Eduard)
    * use range_within to check states (Eduard)
    * some minor cleanups and fix commit message (Eduard)

  * indirect jumps, libbpf side:
    * close map_fd in some error paths in create_jt_map (Andrii)
    * maps for jump tables are actually not closed at all, fix this (Andrii)
    * rename map from `jt` to `.jumptables` (Andrii)
    * use `errstr` in an error message (Andrii)
    * rephrase error message to look more standard (Andrii)
    * misc other minor renames and cleanups (Andrii)

  * selftests:
    * add the frozen selftest back
    * add a selftest for two jumps loading same table

  * some other changes:
    * rebase and split insn_successor changes into separate patch
    * use PTR_ERR_OR_ZERO in the push stack patch (Eduard)
    * indirect jumps on x86: properly re-read *pprog (Eduard)

v2 -> v3 (https://lore.kernel.org/bpf/20250918093850.455051-1-a.s.protopopov@gmail.com/):

  * fix build failure when CONFIG_BPF_SYSCALL is not set (kbuild-bot)
  * reformat bpftool help messages (Quentin)

v1 -> v2 (https://lore.kernel.org/bpf/20250913193922.1910480-1-a.s.protopopov@gmail.com/):

  * push_stack changes:
    * sanitize_speculative_path should just return int (Eduard)
    * return code from sanitize_speculative_path, not EFAULT (Eduard)
    * when BPF_COMPLEXITY_LIMIT_JMP_SEQ is reached, return E2BIG (Eduard)

  * indirect jumps:
    * omit support for .imm=fd in gotox, as we're not using it for now (Eduard)
    * struct jt -> struct bpf_iarray (Eduard)
    * insn_successors: rewrite the interface to just return a pointer (Eduard)
    * remove min_index/max_index, use umin_value/umax_value instead (Alexei, Eduard)
    * move emit_indirect_jump args change to the previous patch (Eduard)
    * add a comment to map_mem_size() (Eduard)
    * use verifier_bug for some error cases in check_indirect_jump (Eduard)
    * clear_insn_aux_data: use start,len instead of start,end (Eduard)
    * make regs[insn->dst_reg].type = PTR_TO_INSN part of check_mem_access (Eduard)

  * constant blinding changes:
    * make subprog_start adjustment better readable (Eduard)
    * do not set subprog len, it is already set (Eduard)

  * libbpf:
    * remove check that relocations from .rodata are ok (Anton)
    * do not freeze the map, it is not necessary anymore (Anton)
    * rename the goto_x -> gotox everywhere (Anton)
    * use u64 when parsing LLVM jump tables (Eduard)
    * split patch in two due to spaces->tabs change (Eduard)
    * split bpftool changes to bpftool patch (Andrii)
    * make sym_size it a union with ext_idx (Andrii)
    * properly copy/free the jumptables_data section from elf (Andrii)
    * a few cosmetic changes around create_jt_map (Andrii)
    * fix some comments + rewrite patch description (Andrii)
    * inline bpf_prog__append_subprog_offsets (Andrii)
    * subprog_sec_offst -> subprog_sec_off (Andrii)
    * !strcmp -> strcmp() == 0 (Andrii)
    * make some function names more readable (Andrii)
    * allocate table of subfunc offsets via libbpf_reallocarray (Andrii)

  * selftests:
    * squash insn_array* tests together (Anton)

  * fixed build warnings (kernel test robot)

RFC -> v1 (https://lore.kernel.org/bpf/20250816180631.952085-1-a.s.protopopov@gmail.com/):

  * I've tried to address all the comments provided by Alexei and
    Eduard in RFC. Will try to list the most important of them below.
  * One big change: move from older LLVM version [5] to newer [4].
    Now LLVM generates jump tables as symbols in the new special
    section ".jumptables". Another part of this change is that
    libbpf now doesn't try to link map load and goto *rX, as
    1) this is absolutely not reliable 2) for some use cases this
    is impossible (namely, when more than one jump table can be used
    in the same gotox instruction).
  * Added insn_successors() support (Alexei, Eduard). This includes
    getting rid of the ugly bpf_insn_set_iter_xlated_offset()
    interface (Eduard).
  * Removed hack for the unreachable instruction, as new LLVM thank to
    Eduard doesn't generate it.
  * Set mem_size for direct map access properly instead of hacking.
    Remove off>0 check. (Alexei)
  * Do not allocate new memory for min_index/max_index (Alexei, Eduard)
  * Information required during check_cfg is now cached to be reused
    later (Alexei + general logic for supporting multiple JT per jump)
  * Properly compare registers in regsafe (Alexei, Eduard)
  * Remove support for JMP32 (Eduard)
  * Better checks in adjust_ptr_min_max_vals (Eduard)
  * More selftests were added (but still there's room for more) which
    directly use gotox (Alexei)
  * More checks and verbose messages added
  * "unique pointers" are no more in the map

Links:
  1. https://lpc.events/event/18/contributions/1941/
  2. https://lwn.net/Articles/1017439/
  3. https://github.com/llvm/llvm-project/pull/149715
  4. https://github.com/llvm/llvm-project/pull/149715#issuecomment-3274833753
  6. rfc: https://lore.kernel.org/bpf/20250615085943.3871208-1-a.s.protopopov@gmail.com/
====================

Link: https://patch.msgid.link/20251105090410.1250500-1-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:54:06 -08:00
Anton Protopopov
ac4d838ce1 selftests/bpf: add C-level selftests for indirect jumps
Add C-level selftests for indirect jumps to validate LLVM and libbpf
functionality. The tests are intentionally disabled; they are meant to be
run locally by developers and will not make the CI red.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-13-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:53:56 -08:00
Anton Protopopov
ccbdb48ce5 selftests/bpf: add new verifier_gotox test
Add a set of tests to validate core gotox functionality
without the need to rely on compilers.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-12-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:53:23 -08:00
Anton Protopopov
dd3fd3c965 libbpf: support llvm-generated indirect jumps
For the v4 instruction set, LLVM is allowed to generate indirect jumps
for switch statements and for 'goto *rX' assembly. Every such jump will
be accompanied by the necessary metadata, e.g. (`llvm-objdump -Sr ...`):

       0:       r2 = 0x0 ll
                0000000000000030:  R_BPF_64_64  BPF.JT.0.0

Here BPF.JT.0.0 is a symbol residing in the .jumptables section:

    Symbol table:
       4: 0000000000000000   240 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.0

The -bpf-min-jump-table-entries llvm option may be used to control the
minimal size of a switch which will be converted to an indirect jump.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-11-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:53:23 -08:00
Anton Protopopov
bc414d3583 bpf: disasm: add support for BPF_JMP|BPF_JA|BPF_X
Add support for indirect jump instruction.

Example output from bpftool:

   0: (79) r3 = *(u64 *)(r1 +0)
   1: (25) if r3 > 0x4 goto pc+666
   2: (67) r3 <<= 3
   3: (18) r1 = 0xffffbeefspameggs
   5: (0f) r1 += r3
   6: (79) r1 = *(u64 *)(r1 +0)
   7: (0d) gotox r1

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-10-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:53:23 -08:00
Anton Protopopov
493d9e0d60 bpf, x86: add support for indirect jumps
Add support for a new instruction

    BPF_JMP|BPF_X|BPF_JA, SRC=0, DST=Rx, off=0, imm=0

which does an indirect jump to a location stored in Rx.  The register
Rx should have type PTR_TO_INSN. This new type assures that the Rx
register contains a value (or a range of values) loaded from a
correct jump table – map of type instruction array.

For example, for a C switch LLVM will generate the following code:

    0:   r3 = r1                    # "switch (r3)"
    1:   if r3 > 0x13 goto +0x666   # check r3 boundaries
    2:   r3 <<= 0x3                 # adjust to an index in array of addresses
    3:   r1 = 0xbeef ll             # r1 is PTR_TO_MAP_VALUE, r1->map_ptr=M
    5:   r1 += r3                   # r1 inherits boundaries from r3
    6:   r1 = *(u64 *)(r1 + 0x0)    # r1 now has type PTR_TO_INSN
    7:   gotox r1                   # jit will generate proper code

Here the gotox instruction corresponds to one particular map. It is,
however, possible to have a gotox instruction whose target can be loaded
from different maps, e.g.

    0:   r1 &= 0x1
    1:   r2 <<= 0x3
    2:   r3 = 0x0 ll                # load from map M_1
    4:   r3 += r2
    5:   if r1 == 0x0 goto +0x4
    6:   r1 <<= 0x3
    7:   r3 = 0x0 ll                # load from map M_2
    9:   r3 += r1
    A:   r1 = *(u64 *)(r3 + 0x0)
    B:   gotox r1                   # jump to target loaded from M_1 or M_2

During the check_cfg stage the verifier will collect all the maps which
point inside the subprog being verified. When building the CFG,
the high 16 bits of insn_state are used, so this patch
(theoretically) supports jump tables of up to 2^16 slots.

During the later stage, in check_indirect_jump, it is checked that
the register Rx was loaded from a particular instruction array.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-9-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:53:23 -08:00
Anton Protopopov
5bef46ac9c bpf, x86: allow indirect jumps to r8...r15
Currently the emit_indirect_jump() function only accepts one of the
RAX, RCX, ..., RBP registers as the destination. Make it accept
R8, R9, ..., R15 as well, and make callers pass BPF registers, not
native registers. This is required to enable indirect jump support
in eBPF.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-8-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:53:22 -08:00
Anton Protopopov
ae48162a66 selftests/bpf: test instructions arrays with blinding
Add a specific test for instructions arrays with blinding enabled.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-7-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:53:22 -08:00
Anton Protopopov
30ec0ec09b bpf: support instructions arrays with constants blinding
When bpf_jit_harden is enabled, all constants in the BPF code are
blinded to prevent JIT spraying attacks. This happens during the JIT
phase. Adjust all the related instruction arrays accordingly.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-6-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:53:22 -08:00
Anton Protopopov
218edd6db6 selftests/bpf: add selftests for new insn_array map
Add the following selftests for new insn_array map:

  * Incorrect instruction indexes are rejected
  * Two programs can't use the same map
  * BPF progs can't operate the map
  * no changes to code => map is the same
  * expected changes when instructions are added
  * expected changes when instructions are deleted
  * expected changes when multiple functions are present

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-5-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:53:18 -08:00
Tim Hostetler
dfb073d32c ptp: Return -EINVAL on ptp_clock_register if required ops are NULL
A ptp_clock should never be registered unless it implements at least one
of gettimex64() or gettime64(), as well as settime64(). WARN_ON_ONCE()
and error out if either set of function pointers is NULL.

For consistency, n_alarm validation is also folded into the
WARN_ON_ONCE.
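
A minimal sketch of the check described above, assuming the existing
struct ptp_clock_info callbacks (gettimex64, gettime64, settime64) and
the PTP core's internal PTP_MAX_ALARMS limit; this is illustrative, not
the exact diff:

    /* Reject a ptp_clock_info that cannot read or set the clock, and
     * fold the n_alarm check into the same WARN_ON_ONCE().
     */
    static int ptp_info_is_valid(const struct ptp_clock_info *info)
    {
            bool can_get = info->gettimex64 || info->gettime64;
            bool can_set = info->settime64;

            if (WARN_ON_ONCE(!can_get || !can_set ||
                             info->n_alarm > PTP_MAX_ALARMS))
                    return -EINVAL;
            return 0;
    }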

Suggested-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Signed-off-by: Tim Hostetler <thostet@google.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Link: https://patch.msgid.link/20251104225915.2040080-1-thostet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-05 17:39:17 -08:00
Anton Protopopov
cbef91de02 libbpf: Recognize insn_array map type
Teach libbpf about the existence of the new instruction array map.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-4-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:31:25 -08:00
Anton Protopopov
18a187bf25 bpftool: Recognize insn_array map type
Teach bpftool to recognize instruction array map type.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Quentin Monnet <qmo@kernel.org>
Link: https://lore.kernel.org/r/20251105090410.1250500-3-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:31:25 -08:00
Anton Protopopov
b4ce5923e7 bpf, x86: add new map type: instructions array
On bpf(BPF_PROG_LOAD) syscall user-supplied BPF programs are
translated by the verifier into "xlated" BPF programs. During this
process the original instruction offsets might be adjusted and/or
individual instructions might be replaced by new sets of instructions,
or deleted.

Add a new BPF map type which is aimed at keeping track of how, for a
given program, the original instructions were relocated during
verification. Also, besides keeping track of the original -> xlated
mapping, make the x86 JIT build the xlated -> jitted mapping for every
instruction listed in an instruction array. This is required for every
future application of instruction arrays: static keys, indirect jumps
and indirect calls.

A map of the BPF_MAP_TYPE_INSN_ARRAY type must be created with u32
keys and 8-byte values. The values have different semantics for
userspace and for the BPF side. For userspace a value consists of two
u32 values – the xlated and jitted offsets. For the BPF side the value is
a real pointer to a jitted instruction.

On map creation/initialization, before loading the program, each
element of the map should be initialized to point to an instruction
offset within the program. Before the program load such maps should
be frozen. After the program verification the xlated and jitted
offsets can be read via the bpf(2) syscall.

If a tracked instruction is removed by the verifier, then the xlated
offset is set to (u32)-1 which is considered to be too big for a valid
BPF program offset.

Such a map can, obviously, be used to track one and only one BPF
program. If the verification process was unsuccessful, then the same
map can be re-used to verify the program with a different log level.
However, if the program was loaded fine, then such a map, being
frozen in any case, can't be reused by other programs even after the
program is released.

Example. Consider the following original and xlated programs:

    Original prog:                      Xlated prog:

     0:  r1 = 0x0                        0: r1 = 0
     1:  *(u32 *)(r10 - 0x4) = r1        1: *(u32 *)(r10 -4) = r1
     2:  r2 = r10                        2: r2 = r10
     3:  r2 += -0x4                      3: r2 += -4
     4:  r1 = 0x0 ll                     4: r1 = map[id:88]
     6:  call 0x1                        6: r1 += 272
                                         7: r0 = *(u32 *)(r2 +0)
                                         8: if r0 >= 0x1 goto pc+3
                                         9: r0 <<= 3
                                        10: r0 += r1
                                        11: goto pc+1
                                        12: r0 = 0
     7:  r6 = r0                        13: r6 = r0
     8:  if r6 == 0x0 goto +0x2         14: if r6 == 0x0 goto pc+4
     9:  call 0x76                      15: r0 = 0xffffffff8d2079c0
                                        17: r0 = *(u64 *)(r0 +0)
    10:  *(u64 *)(r6 + 0x0) = r0        18: *(u64 *)(r6 +0) = r0
    11:  r0 = 0x0                       19: r0 = 0x0
    12:  exit                           20: exit

An instruction array map containing, e.g., instructions [0,4,7,12]
will be translated by the verifier to [0,4,13,20]. A map with
index 5 (the middle of a 16-byte instruction) or indexes greater than 12
(outside the program boundaries) would be rejected.

The functionality provided by this patch will be extended in subsequent
patches to implement BPF Static Keys, indirect jumps, and indirect calls.
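
A hypothetical userspace usage sketch based on the description above.
BPF_MAP_TYPE_INSN_ARRAY, the two-u32 value layout and the freeze
requirement are taken from this commit message; the value struct name
below is illustrative, not a uapi definition:

    #include <bpf/bpf.h>

    struct insn_array_value {               /* userspace view of a value */
            __u32 xlated_off;
            __u32 jitted_off;
    };

    static int create_insn_array(const __u32 *orig_offs, __u32 n)
    {
            int fd = bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, NULL,
                                    sizeof(__u32),
                                    sizeof(struct insn_array_value), n, NULL);
            if (fd < 0)
                    return fd;

            for (__u32 i = 0; i < n; i++) {
                    struct insn_array_value v = { .xlated_off = orig_offs[i] };

                    bpf_map_update_elem(fd, &i, &v, BPF_ANY);
            }

            bpf_map_freeze(fd);     /* must be frozen before the prog load */
            return fd;              /* re-read the values after the load */
    }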

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251105090410.1250500-2-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-05 17:31:25 -08:00
Shankari Anand
ba13710ddd rust: block: update ARef and AlwaysRefCounted imports from sync::aref
Update call sites in the block subsystem to import `ARef` and
`AlwaysRefCounted` from `sync::aref` instead of `types`.

This aligns with the ongoing effort to move `ARef` and
`AlwaysRefCounted` to sync.

Suggested-by: Benno Lossin <lossin@kernel.org>
Link: https://github.com/Rust-for-Linux/linux/issues/1173
Signed-off-by: Shankari Anand <shankari.ak0208@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 18:24:10 -07:00
Jens Axboe
9246979536 io_uring/futex: move futexv owned status to struct io_futexv_data
Free up a bit of space in the shared futex opcode private data, by
moving the futexv specific futexv_owned out of there and into the struct
specific to vectored futexes.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 12:55:07 -07:00
Jens Axboe
88559f8b2a io_uring/futex: move futexv async data handling to struct io_futexv_data
Rather than alloc an array of struct futex_vector for the futexv wait
handling, wrap it in a struct io_futexv_data struct, similar to what
the non-vectored futex wait handling does.

No functional changes in this patch.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 12:54:31 -07:00
Johannes Berg
2f6adeaf92 Merge tag 'ath-next-20251103' of git://git.kernel.org/pub/scm/linux/kernel/git/ath/ath into wireless-next
Jeff Johnson says:
==================
ath.git patches for v6.19

Highlights for some specific drivers include:

ath10k:
Add support for Factory Test TLV commands

ath11k:
Add support for Tx Power insertion

ath12k:
Add support for BSS color change

And of course there is the usual set of cleanups and bug fixes across
the entire family of "ath" drivers.

We do expect to have one more pull request before the v6.19 merge
window to pull in the refactored ath12k driver from the ath12k-ng
branch.
==================

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-05 16:29:11 +01:00
Alok Tiwari
59f44afbe8 io_uring: fix typos and comment wording
Corrected spelling mistakes in comments
 ("reuqests" -> "requests", "noifications" -> "notifications",
 "seperately" -> "separately").

Fixed a small grammar issue ("then" -> "than").
Updated "flag" -> "flags" in fdinfo.c

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:21:43 -07:00
Jens Axboe
55de535e07 Merge branch 'cached-zones' into for-6.19/block
This patch series implements cached zone reports using information
from the block layer zone write plugs and new zone condition tracking.
This avoids having to execute slow report zones commands on the device
when, for instance, mounting file systems, which can significantly speed
things up, especially in setups with multiple SMR HDDs (e.g. a BTRFS
RAID volume).

The first patch improves handling of zone management commands. Patch 2
fixes zone resource updates and the following 3 patches clean up the zone
code in preparation for introducing cached zone report support.
From patches 6 to 13, cached zone reporting is implemented and made
available to users with a new ioctl() command.

Finally, patches 14 and 15 introduce the use of cached report zones in
the mount operation of XFS and BTRFS.

Link: https://lore.kernel.org/linux-block/20251104212249.1075412-1-dlemoal@kernel.org/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* cached-zones:
  xfs: use blkdev_report_zones_cached()
  btrfs: use blkdev_report_zones_cached()
  block: add zone write plug condition to debugfs zone_wplugs
  block: improve zone_wplugs debugfs attribute output
  block: introduce BLKREPORTZONESV2 ioctl
  block: introduce blkdev_report_zones_cached()
  block: introduce blkdev_get_zone_info()
  block: refactor blkdev_report_zones() code
  block: track zone conditions
  block: use zone condition to determine conventional zones
  block: reorganize struct blk_zone_wplug
  block: introduce disk_report_zone()
  block: cleanup blkdev_report_zones()
  block: freeze queue when updating zone resources
  block: handle zone management operations completions
2025-11-05 08:07:41 -07:00
Damien Le Moal
e04ccfc282 xfs: use blkdev_report_zones_cached()
Modify xfs_mount_zones() to replace the call to blkdev_report_zones()
with blkdev_report_zones_cached() to speed-up mount operations.
Since this causes xfs_zone_validate_seq() to see zones with the
BLK_ZONE_COND_ACTIVE condition, this function is also modified to accept
this condition as valid.

With this change, mounting a freshly formatted large capacity (30 TB)
SMR HDD completes in under 2s, compared to over 4.7s before.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
ad3c1188b4 btrfs: use blkdev_report_zones_cached()
Modify btrfs_get_dev_zones() and btrfs_sb_log_location_bdev() to replace
the call to blkdev_report_zones() with blkdev_report_zones_cached() to
speed-up mount operations. btrfs_get_dev_zone_info() is also modified to
take into account the BLK_ZONE_COND_ACTIVE condition, which is
equivalent to either BLK_ZONE_COND_IMP_OPEN, BLK_ZONE_COND_EXP_OPEN or
BLK_ZONE_COND_CLOSED.

With this change, mounting a freshly formatted large capacity (30 TB)
SMR HDD completes in under 100ms, compared to over 1.8s before.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Acked-by: David Sterba <dsterba@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
1efbbc641e block: add zone write plug condition to debugfs zone_wplugs
Modify queue_zone_wplug_show() to include the condition of a zone write
plug in the zone_wplugs debugfs attribute of a zoned block device.
To improve readability and ease of use, rather than the zone condition
raw value, the zone condition name is given using blk_zone_cond_str().

Suggested-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
2b39d4a6c6 block: improve zone_wplugs debugfs attribute output
Make the output of the zone_wplugs debugfs attribute file more easily
readable by adding the names of the zone write plug fields to the
output.

No functional changes.

Suggested-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
b30ffcdc0c block: introduce BLKREPORTZONESV2 ioctl
Introduce the new BLKREPORTZONESV2 ioctl command to allow user
applications access to the fast zone report implemented by
blkdev_report_zones_cached(). This new ioctl is defined as number 142
and is documented in include/uapi/linux/fs.h.

Unlike the existing BLKREPORTZONES ioctl, this new ioctl uses the flags
field of struct blk_zone_report also as an input. If the user sets the
BLK_ZONE_REP_CACHED flag as an input, then blkdev_report_zones_cached()
is used to generate the zone report using cached zone information. If
this flag is not set, then BLKREPORTZONESV2 behaves in the same manner
as BLKREPORTZONES and the zone report is generated by accessing the
zoned device.
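
A hedged usage sketch from userspace; BLKREPORTZONESV2 and
BLK_ZONE_REP_CACHED are the names given in this commit message and are
assumed to come from the updated uapi headers:

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/blkzoned.h>

    static int report_zones_cached(int fd, __u64 sector, __u32 nr_zones)
    {
            struct blk_zone_report *rep;
            int ret;

            rep = calloc(1, sizeof(*rep) + nr_zones * sizeof(struct blk_zone));
            if (!rep)
                    return -1;

            rep->sector = sector;
            rep->nr_zones = nr_zones;
            rep->flags = BLK_ZONE_REP_CACHED;  /* ask for the fast cached report */

            /* without the flag this behaves like the existing BLKREPORTZONES */
            ret = ioctl(fd, BLKREPORTZONESV2, rep);
            free(rep);
            return ret;
    }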

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
31f0656a4a block: introduce blkdev_report_zones_cached()
Introduce the function blkdev_report_zones_cached() to provide a fast
zone report built using the blkdev_get_zone_info() function, which gets
zone information from a disk's zones_cond array or zone write plugs.
For a large capacity SMR drive, such a fast zone report can complete
in a few milliseconds, compared to completion times of several seconds
when the report is obtained from the device.

The zone report is built in the same manner as with the regular
blkdev_report_zones() function, that is, the first zone reported is the
one containing the specified start sector and the report is limited to
the specified number of zones (nr_zones argument). The information for
each zone in the report is obtained using blkdev_get_zone_info().
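
A hedged kernel-side usage sketch, assuming the new helper mirrors the
existing blkdev_report_zones() signature (as the XFS and BTRFS
conversions in this series suggest); the callback and counting logic are
purely illustrative:

    static int count_seq_zones_cb(struct blk_zone *zone, unsigned int idx,
                                  void *data)
    {
            unsigned int *nr_seq = data;

            if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
                    (*nr_seq)++;
            return 0;
    }

    static int count_seq_zones(struct block_device *bdev,
                               unsigned int nr_zones, unsigned int *nr_seq)
    {
            *nr_seq = 0;

            /* built from cached zone state, no report zones device command */
            return blkdev_report_zones_cached(bdev, 0, nr_zones,
                                              count_seq_zones_cb, nr_seq);
    }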

For zoned devices that do not use zone write plug resources,
using blkdev_get_zone_info() is inefficient as the zone report would
be very slow, generated one zone at a time. To avoid this,
blkdev_report_zones_cached() falls back to calling
blkdev_do_report_zones() to execute a regular zone report. In this case,
the .report_active field of struct blk_report_zones_args is set to true
to report zone conditions using the BLK_ZONE_COND_ACTIVE condition in
place of the implicit open, explicit open and closed conditions.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
f2284eec50 block: introduce blkdev_get_zone_info()
Introduce the function blkdev_get_zone_info() to obtain a single zone's
information from cached zone data, that is, either from the zone write
plug for the target zone if it exists, or from the disk zones_cond
array otherwise.

Since sequential zones that do not have a zone write plug are either
full, empty or in a bad state (read-only or offline), the zone write
pointer can be inferred from the zone condition cached in the disk
zones_cond array. For sequential zones that have a zone write plug, the
zone condition and zone write pointer are obtained from the condition
and write pointer offset managed with the zone write plug. This allows
obtaining the information for a zone much more quickly than having to
execute a report zones command on the device.

blkdev_get_zone_info() falls back to using a regular zone report if the
target zone is flagged as needing an update with the
BLK_ZONE_WPLUG_NEED_WP_UPDATE flag, or if the target device does not
use zone write plugs (i.e. a device mapper device). In this case, the
new function blkdev_report_zone_fallback() is used and the zone
condition is reported consistently with the cached report, that is, the
BLK_ZONE_COND_ACTIVE condition is used in place of the implicit open,
explicit open and closed conditions. This is achieved by adding the
.report_active field to struct blk_report_zones_args and by having
disk_report_zone() set the correct zone condition if .report_active is
true.

In preparation for using blkdev_get_zone_info() in upcoming file systems
changes, also export this function as a GPL symbol.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
1af3f4e0c4 block: refactor blkdev_report_zones() code
In preparation for implementing cached report zone, split the main part
of the code of blkdev_report_zones() into the helper function
blkdev_do_report_zones(), with this new helper taking as argument a
struct blk_report_zones_args pointer instead of a report callback
function and its private argument.

No functional changes.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
0bf0e2e466 block: track zone conditions
The function blk_revalidate_zone_cond() already caches the condition of
all zones of a zoned block device in the zones_cond array of a gendisk.
However, the zone conditions are updated only when the device is scanned
or revalidated.

Implement tracking of the runtime changes to zone conditions using
the new cond field in struct blk_zone_wplug. The size of this structure
remains 112 bytes as the new field replaces the 4 bytes of padding at the
end of the structure.

Because zones that do not have a zone write plug can be in the empty,
implicit open, explicit open or full condition, the zones_cond array of
a disk is used to track the conditions of zones that do not have a zone
write plug. The condition of such a zone is updated in the disk zones_cond
array when a zone reset, reset all or finish operation is executed, and
also when a zone write plug is removed from the disk hash table as the
zone becomes full.

Since a device may automatically close an implicitly open zone when
writing to an empty or closed zone, if the total number of open zones
has reached the device limit, the BLK_ZONE_COND_IMP_OPEN and
BLK_ZONE_COND_CLOSED zone conditions cannot be precisely tracked. To
overcome this, the zone condition BLK_ZONE_COND_ACTIVE is introduced to
represent a zone that has the condition BLK_ZONE_COND_IMP_OPEN,
BLK_ZONE_COND_EXP_OPEN or BLK_ZONE_COND_CLOSED.  This follows the
definition of an active zone as defined in the NVMe Zoned Namespace
specifications. As such, for a zoned device that has a limit on the
maximum number of open zones, we will never have more zones in the
BLK_ZONE_COND_ACTIVE condition than the device limit. This is compatible
with the SCSI ZBC and ATA ZAC specifications for SMR HDDs as these
devices do not have a limit on the number of active zones.

The function disk_zone_wplug_set_wp_offset() is modified to use the new
helper disk_zone_wplug_update_cond() to update a zone write plug
condition whenever a zone write plug write offset is updated on
submission or merging of write BIOs to a zone.

The functions blk_zone_reset_bio_endio(), blk_zone_reset_all_bio_endio()
and blk_zone_finish_bio_endio() are modified to update the condition of
the zones targeted by reset, reset_all and finish operations, either
using disk_zone_wplug_set_wp_offset() for zones that have a
zone write plug, or using the disk_zone_set_cond() helper to update the
zones_cond array of the disk for zones that do not have a zone write
plug.

When a zone write plug is removed from the disk hash table (when the
zone becomes empty or full), the condition of struct blk_zone_wplug is
used to update the disk zones_cond array. Conversely, when a zone write
plug is added to the disk hash table, the zones_cond array is used to
initialize the zone write plug condition.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
6e945ffb65 block: use zone condition to determine conventional zones
The conv_zones_bitmap field of struct gendisk is used to define a bitmap
to identify the conventional zones of a zoned block device. The bit for
a zone is set in this bitmap if the zone is a conventional one, that is,
if the zone type is BLK_ZONE_TYPE_CONVENTIONAL. For such a zone, this
always corresponds to the zone condition BLK_ZONE_COND_NOT_WP.
In other words, conv_zones_bitmap tracks a single condition of the
zones of a zoned block device.

In preparation for tracking more zone conditions, change
conv_zones_bitmap into an array of zone conditions, using 1 byte per
zone. This increases the memory usage from 1 bit per zone to 1 byte per
zone, that is, from 16 KiB to about 100 KiB for a 30 TB SMR HDD with 256
MiB zones. This is a trade-off to allow fast cached report zones later
on top of this change.

Rename the conv_zones_bitmap field of struct gendisk to zones_cond. Add
a blk_revalidate_zone_cond() function to initialize the zones_cond array
of a disk during device scan and to update it on device revalidation.
Move the allocation of the zones_cond array to
disk_revalidate_zone_resources(), making sure that this array is always
allocated, even for devices that do not need zone write plugs (zone
resources), to ensure that bdev_zone_is_seq() can be re-implemented to
use the zone condition array in place of the conv zones bitmap.

Finally, the function bdev_zone_is_seq() is rewritten to use a test on
the condition of the target zone.
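
A rough sketch of the rewritten helper described above; the zones_cond
array and its access rules come from this commit message, so the details
below are an approximation rather than the final code:

    static inline bool bdev_zone_is_seq(struct block_device *bdev,
                                        sector_t sector)
    {
            struct gendisk *disk = bdev->bd_disk;

            if (!bdev_is_zoned(bdev) || !disk->zones_cond)
                    return false;

            /* any condition other than "not write pointer" is sequential */
            return disk->zones_cond[disk_zone_no(disk, sector)] !=
                   BLK_ZONE_COND_NOT_WP;
    }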

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
ca1a897fb2 block: reorganize struct blk_zone_wplug
Reorganize the fields of struct blk_zone_wplug to remove a hole after
the wp_offset field and avoid having the bio_work structure split
between 2 cache lines.

No functional changes.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
fdb9aed869 block: introduce disk_report_zone()
Commit b76b840fd9 ("dm: Fix dm-zoned-reclaim zone write pointer
alignment") introduced an indirect call for the callback function of a
report zones executed with blkdev_report_zones(). This is necessary so
that the function disk_zone_wplug_sync_wp_offset() can be called to
refresh a zone write plug zone write pointer offset after a write error.
However, this solution makes following the path of a zone information
harder to understand.

Clean this up by introducing the new blk_report_zones_args structure to
define a zone report callback and its private data and introduce the
helper function disk_report_zone() which calls both
disk_zone_wplug_sync_wp_offset() and the zone report user callback
function for all zones of a zone report. This helper function must be
called by all block device drivers that implement the report zones
block operation in order to correctly report zone information.

All block device drivers supporting the report_zones block operation are
updated to use this new scheme.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
e8ecb21f08 block: cleanup blkdev_report_zones()
The variable capacity is used only in one place and so can be removed
and get_capacity(disk) used directly instead.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
bba4322e3f block: freeze queue when updating zone resources
Modify disk_update_zone_resources() to freeze the device queue before
updating the number of zones, zone capacity and other zone related
resources. The locking order resulting from the call to
queue_limits_commit_update_frozen() is preserved, that is, the queue
limits lock is first taken by calling queue_limits_start_update() before
freezing the queue, and the queue is unfrozen after executing
queue_limits_commit_update(), which replaces the call to
queue_limits_commit_update_frozen().

This change ensures that there are no in-flight I/Os when the zone
resources are updated due to a zone revalidation. In case of error when
the limits are applied, directly call disk_free_zone_resources() from
disk_update_zone_resources() while the disk queue is still frozen to
avoid needing to freeze & unfreeze the queue again in
blk_revalidate_disk_zones(), thus simplifying that function code a
little.

Fixes: 0b83c86b44 ("block: Prevent potential deadlock in blk_revalidate_disk_zones()")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Damien Le Moal
efae226c2e block: handle zone management operations completions
The functions blk_zone_wplug_handle_reset_or_finish() and
blk_zone_wplug_handle_reset_all() both modify the zone write pointer
offset of zone write plugs that are the target of a reset, reset all or
finish zone management operation. However, these functions do this
modification before the BIO is executed. So if the zone operation fails,
the modified zone write pointer offsets become invalid.

Avoid this by modifying the zone write pointer offset of a zone write
plug that is the target of a zone management operation when the
operation completes. To do so, modify blk_zone_bio_endio() to call the
new function blk_zone_mgmt_bio_endio() which in turn calls the functions
blk_zone_reset_all_bio_endio(), blk_zone_reset_bio_endio() or
blk_zone_finish_bio_endio() depending on the operation of the completed
BIO, to modify a zone write plug write pointer offset accordingly.
These functions are called only if the BIO execution was successful.

Fixes: dd291d77cc ("block: Introduce zone write plugging")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-05 08:07:21 -07:00
Jakub Kicinski
89aec171d9 Merge branch 'net-introduce-struct-sockaddr_unsized'
Kees Cook says:

====================
net: Introduce struct sockaddr_unsized

The historically fixed-size struct sockaddr is part of UAPI and embedded
in many existing structures. The kernel uses struct sockaddr extensively
internally to represent arbitrarily sized sockaddr structures,
which caused problems with the compiler's ability to determine object
sizes correctly. The "temporary" solution was to make sockaddr explicitly
use a flexible array, but this causes problems for embedding struct
sockaddr in structures, where once again the compiler has to guess about
the size of such objects, and causes thousands of warnings under the
coming -Wflex-array-member-not-at-end warning.

Switching to sockaddr_storage internally everywhere wastes a lot of memory,
so we are left with needing two changes:
- introduction of an explicitly arbitrarily sized sockaddr struct
- switch struct sockaddr back to being fixed size

Doing the latter step requires all "arbitrarily sized" uses of struct
sockaddr to be replaced with the new struct from the first step.

So, introduce the new struct and do enough conversions that we can
switch sockaddr back to a fixed-size sa_data.
====================

Link: https://patch.msgid.link/20251104002608.do.383-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 19:10:36 -08:00
Kees Cook
2b5e9f9b7e net: Convert struct sockaddr to fixed-size "sa_data[14]"
Revert struct sockaddr from flexible array to fixed 14-byte "sa_data",
to solve over 36,000 -Wflex-array-member-not-at-end warnings, since
struct sockaddr is embedded within many network structs.

With socket/proto sockaddr-based internal APIs switched to use struct
sockaddr_unsized, there should be no more uses of struct sockaddr that
depend on reading beyond the end of struct sockaddr::sa_data that might
trigger bounds checking.

Comparing an x86_64 "allyesconfig" vmlinux build before and after this
patch showed no new "ud1" instructions from CONFIG_UBSAN_BOUNDS nor any
new "field-spanning" memcpy CONFIG_FORTIFY_SOURCE instrumentations.

Cc: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-8-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 19:10:33 -08:00
Kees Cook
c1a799eef6 bpf: Convert bpf_sock_addr_kern "uaddr" to sockaddr_unsized
Change struct bpf_sock_addr_kern to use sockaddr_unsized for the "uaddr"
field instead of sockaddr. This improves type safety in the BPF cgroup
socket address filtering code.

The casting in __cgroup_bpf_run_filter_sock_addr() is updated to match the
new type, removing an unnecessary cast in the initialization and updating
the conditional assignment to use the appropriate sockaddr_unsized cast.

Additionally rename the "unspec" variable to "storage" to better align
with its usage.

No binary changes expected.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-7-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 19:10:33 -08:00
Kees Cook
8116d803e7 bpf: Convert cgroup sockaddr filters to use sockaddr_unsized consistently
Update BPF cgroup sockaddr filtering infrastructure to use sockaddr_unsized
consistently throughout the call chain, removing redundant explicit casts
from callers.

No binary changes expected.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-6-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 19:10:33 -08:00
Kees Cook
449f68f8ff net: Convert proto callbacks from sockaddr to sockaddr_unsized
Convert struct proto pre_connect(), connect(), bind(), and bind_add()
callback function prototypes from struct sockaddr to struct sockaddr_unsized.
This does not change per-implementation use of sockaddr for passing around
an arbitrarily sized sockaddr struct. Those will be addressed in future
patches.

Additionally removes the no longer referenced struct sockaddr from
include/net/inet_common.h.

No binary changes expected.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-5-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 19:10:33 -08:00
Kees Cook
3d39d34146 net: Remove struct sockaddr from net.h
Now that struct sockaddr is no longer used by net.h, remove it.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-4-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 19:10:33 -08:00
Kees Cook
85cb0757d7 net: Convert proto_ops connect() callbacks to use sockaddr_unsized
Update all struct proto_ops connect() callback function prototypes from
"struct sockaddr *" to "struct sockaddr_unsized *" to avoid lying to the
compiler about object sizes. Calls into struct proto handlers gain casts
that will be removed in the struct proto conversion patch.

No binary changes expected.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-3-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 19:10:32 -08:00
Kees Cook
0e50474fa5 net: Convert proto_ops bind() callbacks to use sockaddr_unsized
Update all struct proto_ops bind() callback function prototypes from
"struct sockaddr *" to "struct sockaddr_unsized *" to avoid lying to the
compiler about object sizes. Calls into struct proto handlers gain casts
that will be removed in the struct proto conversion patch.

No binary changes expected.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-2-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 19:10:32 -08:00
Kees Cook
bf33247a90 net: Add struct sockaddr_unsized for sockaddr of unknown length
Add a flexible sockaddr structure to support addresses longer than the
traditional 14-byte struct sockaddr::sa_data limitation without
requiring the full 128-byte sa_data of struct sockaddr_storage. This
allows the network APIs to pass around a pointer to an object that
isn't lying to the compiler about how big it is, but must be accompanied
by its actual size as an additional parameter.
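
The adopted definition is not quoted here; based on the description
above (a family plus unbounded data, with the size carried separately),
a plausible minimal form would be:

    struct sockaddr_unsized {
            __kernel_sa_family_t    sa_family;      /* address family */
            char                    sa_data[];      /* arbitrarily long data */
    };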

It's possible we may want to migrate to including the size with the
struct in the future, e.g.:

struct sockaddr_unsized {
	u16 sa_data_len;
	u16 sa_family;
	u8  sa_data[] __counted_by(sa_data_len);
};

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20251104002617.2752303-1-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 19:10:32 -08:00
Jakub Kicinski
bd0fa86073 Merge branch 'net-phy-remove-fixed_phy_add-and-first-its-users'
Heiner Kallweit says:

====================
net: phy: remove fixed_phy_add and first its users

fixed_phy_add() has a number of problems/disadvantages:
- It uses phy address 0 w/o checking whether a fixed phy with this
  address exists already.
- A subsequent call to fixed_phy_register() would also use phy address 0,
  because fixed_phy_add() doesn't mark it as used.
- fixed_phy_add() is used from platform code, therefore requires that
  fixed phy code is built-in.

fixed_phy_add() has only two users
- coldfire/5272, using fec
- bcm47xx, using b44

So migrate fec and b44 to use fixed_phy_register_100fd(), afterwards
remove usage of fixed_phy_add() from the two platforms, and eventually
remove fixed_phy_add().
====================

Link: https://patch.msgid.link/0285fcb0-0fb5-4f6f-823c-7b6e85e28ba3@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:46:15 -08:00
Heiner Kallweit
5de9ea1c50 net: phy: fixed_phy: remove fixed_phy_add
fixed_phy_add() has a number of problems/disadvantages:
- It uses phy address 0 w/o checking whether a fixed phy with this
  address exists already.
- A subsequent call to fixed_phy_register() would also use phy address 0,
  because fixed_phy_add() doesn't mark it as used.
- fixed_phy_add() is used from platform code, therefore requires that
  fixed_phy code is built-in.

Now that for the only two users (coldfire/5272 and bcm47xx) fixed_phy
creation has been moved to the respective ethernet driver (fec, b44),
we can remove fixed_phy_add().

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/bee046a1-1e77-4057-8b04-fdb2a1bbbd08@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:46:14 -08:00
Heiner Kallweit
458639c42b MIPS: BCM47XX: remove creating a fixed phy
Now that the b44 ethernet driver creates a fixed phy if needed, we can
remove this here.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/8983b705-6bca-4728-9283-7aa60f49340f@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:46:13 -08:00
Heiner Kallweit
10d2f15afb net: b44: register a fixed phy using fixed_phy_register_100fd if needed
In case of bcm47xx a fixed phy is used, which so far has been created
by platform code using fixed_phy_add(). This function has a number of
problems, therefore create the potentially needed fixed phy here, using
fixed_phy_register_100fd().

Due to lack of hardware, this is compile-tested only.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/53e4e74d-a49e-4f37-b970-5543a35041db@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:46:13 -08:00
Heiner Kallweit
0ee21f39c5 m68k: coldfire: remove creating a fixed phy
Now that the fec ethernet driver creates a fixed phy if needed,
we can remove this here.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/212e0cb5-a2f5-460f-8e03-3c3369d0acf1@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:46:13 -08:00
Heiner Kallweit
dc86b621e1 net: fec: register a fixed phy using fixed_phy_register_100fd if needed
In case of coldfire/5272 a fixed phy is used, which so far has been created
by platform code using fixed_phy_add(). This function has a number of
problems, therefore create the potentially needed fixed phy here, using
fixed_phy_register_100fd().

Note 1: This includes a small functional change, as coldfire/5272
created a fixed phy in half-duplex mode. Likely this was by mistake,
because the fec MAC is 100FD-capable and the connection is to a switch.

Note 2: Usage of phy_find_next() makes use of the fact that dev_id can
only be 0 or 1.

Due to lack of hardware, this is compile-tested only.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/adf4dc5c-5fa3-4ae6-a75c-a73954dede73@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:46:13 -08:00
Heiner Kallweit
c9445e3c08 net: phy: fixed_phy: add helper fixed_phy_register_100fd
In a few places a 100FD fixed PHY is used. Create a helper so that users
don't have to define a struct fixed_phy_status themselves.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/bf564b19-e9bc-4896-aeae-9f721cc4fecd@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:46:13 -08:00
Jakub Kicinski
907c46ae20 Merge branch 'net-altera-tse-cleanup-init-sequence'
Maxime Chevallier says:

====================
net: altera-tse: Cleanup init sequence

Altera TSE cleanup to make sure everything is properly initialized
before registering the netdev.

When Altera TSE was converted to phylink, the PCS and phylink creation
were added after register_netdev(), which is wrong as this may race
with .ndo_open() once the netdev is registered.

This series makes it so that we register the netdev once all resources
are cleanly initialised; that includes PCS and phylink creation as well
as a few other operations such as reading the IP version.

No errors were found in the wild, so this series doesn't target net, but
given that we fix some raciness, a point could be made to send this to
net.

This series doesn't introduce functional changes, however the internal
mii_bus for PCS configuration is renamed.

v1: https://lore.kernel.org/20251030102418.114518-1-maxime.chevallier@bootlin.com
====================

Link: https://patch.msgid.link/20251103104928.58461-1-maxime.chevallier@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:15:18 -08:00
Maxime Chevallier
055e554b8f net: altera-tse: Init PCS and phylink before registering netdev
register_netdev() must be done only once all resources are ready, as
they may be used in .ndo_open() immediately upon registration.

Move the lynx PCS and phylink initialisation before registering the
netdevice. We also remove the call to netif_carrier_off(), as phylink
takes care of that.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251103104928.58461-5-maxime.chevallier@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:15:15 -08:00
Maxime Chevallier
9350ea63fe net: altera-tse: Don't use netdev name for the PCS mdio bus
The PCS mdio bus must be created before registering the net_device. To
do that, we mustn't depend on the netdev name to create the mdio bus
name. Let's use the device's name instead.

Note that this changes the bus name in /sys/bus/mdiobus

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251103104928.58461-4-maxime.chevallier@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:15:15 -08:00
Maxime Chevallier
dd2619d38d net: altera-tse: Warn on bad revision at probe time
Instead of reading the core revision at probe time and printing a warning
for an unexpected version at .ndo_open() time, let's print that
warning directly in .probe().

This allows getting rid of the "revision" private field, and also
prevents a potential race between reading the revision in .probe() after
netdev registration, and accessing that revision in .ndo_open().

By printing the warning after register_netdev(), we are sure that we
have a netdev name, and that we try to print the revision after having
read it from the internal registers.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251103104928.58461-3-maxime.chevallier@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:15:15 -08:00
Maxime Chevallier
6874520518 net: altera-tse: Set platform drvdata before registering netdev
We don't have to wait until netdev is registered before setting it as the
pdev's drvdata. Move it to netdev alloc time.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251103104928.58461-2-maxime.chevallier@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:15:14 -08:00
Heiner Kallweit
617a0dd24e net: phy: make phy_device members pause and asym_pause bitfield bits
We can reduce the size of struct phy_device a little by switching
the type of members pause and asym_pause from int to a single bit.
As C99 is supported now, we can use type bool for the bitfield members,
which provides us with the benefit of the usual implicit bool conversions.
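
A minimal sketch of the resulting members (only the two fields are
shown; their exact position within struct phy_device is omitted):

    struct phy_device {
            /* ... */
            bool pause:1;           /* was: int pause; */
            bool asym_pause:1;      /* was: int asym_pause; */
            /* ... */
    };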

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/764e9a31-b40b-4dc9-b808-118192a16d87@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:14:35 -08:00
Jakub Kicinski
f005b348d3 Merge branch 'add-driver-for-1gbe-network-chips-from-mucse'
Dong Yibo says:

====================
Add driver for 1Gbe network chips from MUCSE

This patch series adds support for MUCSE RNPGBE 1Gbps PCIe Ethernet controllers
(N500/N210 series), including build infrastructure, hardware initialization,
mailbox (MBX) communication with firmware, and basic netdev registration
(it can show the MAC address obtained from firmware; TX/RX support will be added later).

Series breakdown (5 patches):
 01/05: net: ethernet/mucse: Add build support for rnpgbe
       - Kconfig/Makefile for MUCSE vendor, basic PCI probe (no netdev)
 02/05: net: ethernet/mucse: Add N500/N210 chip support
       - netdev allocation, BAR mapping
 03/05: net: ethernet/mucse: Add basic MBX ops for PF-FW communication
       - base read/write, write with poll ack, poll and read data
 04/05: net: ethernet/mucse: Add FW commands (sync, reset, MAC query)
       - FW sync retry logic, MAC address retrieval, hw reset built on
         the base mbx ops
 05/05: net: ethernet/mucse: Complete netdev registration
       - HW reset, MAC setup, netdev_ops registration
====================

Link: https://patch.msgid.link/20251101013849.120565-1-dong100@mucse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:11:39 -08:00
Dong Yibo
2ee95ec17e net: rnpgbe: Add register_netdev
Complete the network device (netdev) registration flow for Mucse Gbe
Ethernet chips, including:
1. Hardware state initialization:
   - Send powerup notification to firmware (via echo_fw_status)
   - Sync with firmware
   - Reset hardware
2. MAC address handling:
   - Retrieve permanent MAC from firmware (via mucse_mbx_get_macaddr)
   - Fall back to a random valid MAC (eth_random_addr) if no valid MAC
     is provided by FW

Signed-off-by: Dong Yibo <dong100@mucse.com>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251101013849.120565-6-dong100@mucse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:11:37 -08:00
Dong Yibo
c6d3f0198e net: rnpgbe: Add basic mbx_fw support
Add fundamental firmware (FW) communication operations via PF-FW
mailbox, including:
- FW sync (via HW info query with retries)
- HW reset (post FW command to reset hardware)
- MAC address retrieval (request FW for port-specific MAC)
- Power management (powerup/powerdown notification to FW)

Signed-off-by: Dong Yibo <dong100@mucse.com>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251101013849.120565-5-dong100@mucse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:11:36 -08:00
Dong Yibo
4543534c3e net: rnpgbe: Add basic mbx ops support
Add fundamental mailbox (MBX) communication operations between PF
(Physical Function) and firmware for n500/n210 chips.

Signed-off-by: Dong Yibo <dong100@mucse.com>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251101013849.120565-4-dong100@mucse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:11:36 -08:00
Dong Yibo
1b7f85f733 net: rnpgbe: Add n500/n210 chip support with BAR2 mapping
Add hardware initialization foundation for MUCSE 1Gbe controller,
including:
1. Map PCI BAR2 as hardware register base;
2. Bind PCI device to driver private data (struct mucse) and
   initialize hardware context (struct mucse_hw);
3. Reserve board-specific init framework via rnpgbe_init_hw.

Signed-off-by: Dong Yibo <dong100@mucse.com>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: MD Danish Anwar <danishanwar@ti.com>
Link: https://patch.msgid.link/20251101013849.120565-3-dong100@mucse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:11:36 -08:00
Dong Yibo
ee61c10cd4 net: rnpgbe: Add build support for rnpgbe
Add build options and documentation for MUCSE.
Initialize PCI device access for MUCSE devices.

Signed-off-by: Dong Yibo <dong100@mucse.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: MD Danish Anwar <danishanwar@ti.com>
Link: https://patch.msgid.link/20251101013849.120565-2-dong100@mucse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 18:11:36 -08:00
Vadim Fedorenko
3f02b82725 ti: netcp: convert to ndo_hwtstamp callbacks
Convert TI NetCP driver to use ndo_hwtstamp_get()/ndo_hwtstamp_set()
callbacks. The logic is slightly changed, because I believe the original
logic was not really correct. The config reading part now uses the very
first module to get the configuration, instead of iterating over all of
them and keeping the last one, as the configuration is supposed to be
identical for all modules. The HW timestamp config set path now tries to
configure all modules, but in case of an error from one module it adds an
extack message. This way the configuration will be as synchronized as
possible.

There are only 2 modules using netcp core infrastructure, and both use
the very same function to configure HW timestamping, so no actual
difference in behavior is expected.

Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://patch.msgid.link/20251103172902.3538392-1-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:50:30 -08:00
Jakub Kicinski
bdf27b5447 Merge branch 'convert-drivers-to-use-ndo_hwtstamp-callbacks-part-3'
Vadim Fedorenko says:

====================
convert drivers to use ndo_hwtstamp callbacks part 3 [part]

This patchset converts the rest of the ethernet drivers to use ndo
callbacks instead of ioctl to configure and report time stamping. The
drivers in part 3 originally implemented only the SIOCSHWTSTAMP command,
but are converted to also provide the configuration back to users.
====================

Link: https://patch.msgid.link/20251103150952.3538205-1-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:44:13 -08:00
Vadim Fedorenko
d8fdc70694 net: pch_gbe: convert to use ndo_hwtstamp callbacks
The driver implemented only the SIOCSHWTSTAMP ioctl command, but it stores
the configuration in its private data, so it is possible to report it back
to users. Implement both ndo_hwtstamp_set and ndo_hwtstamp_get
callbacks. To properly report the RX filter type, store it in hwts_rx_en
instead of using this field as a simple flag. The logic didn't change
because the receive path used this field as a boolean flag.
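
A minimal sketch of the callback pair described above, assuming the
driver keeps the last applied configuration in its private data (the
struct and field names here are illustrative, not the pch_gbe ones):

    #include <linux/net_tstamp.h>
    #include <linux/netdevice.h>

    struct sketch_priv {
            struct kernel_hwtstamp_config hwts_config;
    };

    static int sketch_hwtstamp_get(struct net_device *dev,
                                   struct kernel_hwtstamp_config *config)
    {
            struct sketch_priv *priv = netdev_priv(dev);

            *config = priv->hwts_config;    /* report what was last applied */
            return 0;
    }

    static int sketch_hwtstamp_set(struct net_device *dev,
                                   struct kernel_hwtstamp_config *config,
                                   struct netlink_ext_ack *extack)
    {
            struct sketch_priv *priv = netdev_priv(dev);

            /* program the hardware here; reject unsupported filters via extack */
            priv->hwts_config = *config;
            return 0;
    }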

Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://patch.msgid.link/20251103150952.3538205-7-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:43:52 -08:00
Vadim Fedorenko
a23d0486d0 net: thunderx: convert to use ndo_hwtstamp callbacks
The driver implemented SIOCSHWTSTAMP ioctl command only, but it also
stores configuration in private data, so it's possible to report it back
to users. Implement both ndo_hwtstamp_set and ndo_hwtstamp_get
callbacks.

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251103150952.3538205-6-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:43:52 -08:00
Vadim Fedorenko
72c35e3a95 net: octeon: mgmt: convert to use ndo_hwtstamp callbacks
The driver implemented SIOCSHWTSTAMP ioctl command only. But it stores
timestamping configuration, so it is possible to report it to users.
Implement both ndo_hwtstamp_set and ndo_hwtstamp_get callbacks. After
this the ndo_eth_ioctl effectively becomes phy_do_ioctl - adjust
callback accordingly.

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251103150952.3538205-5-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:43:52 -08:00
Vadim Fedorenko
94037a0e18 net: liquidio_vf: convert to use ndo_hwtstamp callbacks
The driver implemented only the SIOCSHWTSTAMP ioctl command, but there is a
way to get the configuration back. Implement both ndo_hwtstamp_set and
ndo_hwtstamp_get callbacks.

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251103150952.3538205-4-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:43:51 -08:00
Vadim Fedorenko
2b38447548 net: liquidio: convert to use ndo_hwtstamp callbacks
The driver implemented only the SIOCSHWTSTAMP ioctl command, but there is a
way to get the configured status. Implement both ndo_hwtstamp_set and
ndo_hwtstamp_get callbacks.

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251103150952.3538205-3-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:43:51 -08:00
Buday Csaba
e0c78fcad2 dt-bindings: net: ethernet-phy: clarify when compatible must specify PHY ID
Change the PHY ID description in ethernet-phy.yaml to clarify that a
PHY ID is required (may -> must) when the PHY requires a special
initialization sequence.

Link: https://lore.kernel.org/netdev/20251026212026.GA2959311-robh@kernel.org/
Link: https://lore.kernel.org/netdev/aQIZvDt5gooZSTcp@debianbuilder/

Signed-off-by: Buday Csaba <buday.csaba@prolan.hu>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/64c52d1a726944a68a308355433e8ef0f82c4240.1762157515.git.buday.csaba@prolan.hu
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:30:46 -08:00
Yue Haibing
f2143e283c net: devmem: Remove unused declaration net_devmem_bind_tx_release()
Commit bd61848900 ("net: devmem: Implement TX path") declared this
but never implemented it.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Link: https://patch.msgid.link/20251103072046.1670574-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:29:19 -08:00
Jakub Kicinski
f102600ebe Merge branch 'mptcp-pm-in-kernel-fullmesh-endp-nb-bind-cases'
Matthieu Baerts says:

====================
mptcp: pm: in-kernel: fullmesh endp nb + bind cases

Here is a small optimisation for the in-kernel PM, joined by a small
behavioural change to avoid confusion, and followed by a few more
tests.

- Patch 1: record the number of fullmesh endpoints, so there is no need
  to iterate over all endpoints to check whether one is marked as fullmesh.

- Patch 2: when at least one endpoint is marked as fullmesh, only use
  these endpoints when reacting to an ADD_ADDR, even if there are no
  endpoints for this IP family: this is less confusing.

- Patch 3: reduce duplicated code to prepare for the next patch.

- Patch 4: extra "bind" cases: the listening socket restricts the bind to
  one IP address, not allowing MP_JOIN to extra IP addresses, except if
  another listening socket accepts them.
====================

Link: https://patch.msgid.link/20251101-net-next-mptcp-fm-endp-nb-bind-v1-0-b4166772d6bb@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:16:06 -08:00
Matthieu Baerts (NGI0)
5c59df126b selftests: mptcp: join: validate extra bind cases
By design, an MPTCP connection will not accept extra subflows when no
MPTCP listening socket can accept such requests.

In other words, if the 'server' listens on a specific address / device,
it cannot accept an MP_JOIN sent to a different address / device, except
if there is another MPTCP listening socket accepting them.

This is what the new tests are validating:

 - Forcing a bind on the main v4/v6 address, and checking that MP_JOIN
   to announced addresses are not accepted.

 - Also forcing a bind on the main v4/v6 address, but before that, another
   listening socket is created to accept additional subflows. Note that
   'mptcpize run nc -l' -- or something else only doing: socket(MPTCP),
   bind(<IP>), listen(0) -- would be enough, but here mptcp_connect is
   reused so as not to depend on another tool just for that.

 - Same as the previous one, but using v6 link-local addresses: this is
   a bit particular because it is required to specify the outgoing
   network interface when connecting to a link-local address announced
   by the other peer. When using the routing rules, this doesn't work
   (the outgoing interface is not known); but it does work with a
   'laminar' endpoint having a specified interface.

Note that extra small modifications are needed for these tests to work:

 - mptcp_connect's check_getpeername_connect() check should strip the
   specified interface when comparing addresses.

 - With IPv6 link-local addresses, it is required to wait for them to
   be ready (no longer in 'tentative' mode) before using them, otherwise
   the bind() will not be allowed.

Link: https://github.com/multipath-tcp/mptcp_net-next/issues/591
Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251101-net-next-mptcp-fm-endp-nb-bind-v1-4-b4166772d6bb@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:15:07 -08:00
Matthieu Baerts (NGI0)
4a6220a453 selftests: mptcp: join: do_transfer: reduce code dup
The same extra long command is present twice, with one small difference:
the variable used for the stdin file.

Use new dedicated variables in one command to avoid this code
duplication.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251101-net-next-mptcp-fm-endp-nb-bind-v1-3-b4166772d6bb@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:15:06 -08:00
Matthieu Baerts (NGI0)
e461e8a799 mptcp: pm: in kernel: only use fullmesh endp if any
Our documentation says that the in-kernel PM only uses fullmesh
endpoints to establish subflows to announced addresses when at least one
endpoint has the fullmesh flag. But this was not totally correct: only
fullmesh endpoints were used if at least one endpoint *from the same
address family as the received ADD_ADDR* had the fullmesh flag.

This is confusing, and it seems clearer not to have differences
depending on the address family.

So, now, when at least one MPTCP endpoint has a fullmesh flag, the local
addresses are picked from all fullmesh endpoints, which might be 0 if
there are no endpoints for the correct address family.

One selftest needs to be adapted for this behaviour change.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251101-net-next-mptcp-fm-endp-nb-bind-v1-2-b4166772d6bb@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:15:06 -08:00
Matthieu Baerts (NGI0)
f88191c7f3 mptcp: pm: in-kernel: record fullmesh endp nb
Instead of iterating over all endpoints, under the RCU read lock, just to
check if one of them has the fullmesh flag, we can keep a counter of
fullmesh endpoints, similar to what is done with the other flags.

This counter is now checked before iterating over all endpoints.

Similar to the other counters, this new one is also exposed. A userspace
app can then know when it is being used in fullmesh mode, with
potentially (too) many subflows.
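
A hedged sketch of the counter-before-iteration pattern (struct and field
names here are illustrative, not the exact in-kernel PM code):

```
/* Illustrative only; pm_pernet and its fields are assumed names. */
struct pm_pernet {
        unsigned int fullmesh_endpoints;        /* number of fullmesh endpoints */
};

static bool pm_has_fullmesh_endpoint(const struct pm_pernet *pernet)
{
        /* Cheap counter check first: skip the endpoint walk when zero. */
        return READ_ONCE(pernet->fullmesh_endpoints) != 0;
}

static void pm_endpoint_added(struct pm_pernet *pernet, unsigned int flags)
{
        /* Counter maintained where endpoints are added (and, symmetrically,
         * decremented where they are removed).
         */
        if (flags & MPTCP_PM_ADDR_FLAG_FULLMESH)
                WRITE_ONCE(pernet->fullmesh_endpoints,
                           READ_ONCE(pernet->fullmesh_endpoints) + 1);
}
```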

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251101-net-next-mptcp-fm-endp-nb-bind-v1-1-b4166772d6bb@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:15:06 -08:00
Jakub Kicinski
b117befe8a Merge branch 'net-mlx5e-reduce-interface-downtime-on-configuration-change'
Tariq Toukan says:

====================
net/mlx5e: Reduce interface downtime on configuration change

This series significantly reduces the interface downtime while swapping
channels during a configuration change, on capable devices.

Here we remove an old requirement on operation ordering that became
obsolete on recent capable devices. This helps cut the downtime
significantly, by ~80% in our example.

Perf numbers:
Measured the number of dropped packets in a simple ping flood test,
during a configuration change operation that switches the number of
channels from 247 to 248.

Before: 71 packets lost
After:  15 packets lost, ~80% saving.
====================

Link: https://patch.msgid.link/1761831159-1013140-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:04:54 -08:00
Tariq Toukan
3b88a535a8 net/mlx5e: Defer channels closure to reduce interface down time
The cap bit tis_tir_td_order=1 indicates that an old firmware requirement /
limitation no longer exists. When the bit is unset, the latency of several
firmware commands increases significantly in the presence of a high number
of co-existing channels (both the old and new sets). Hence, we used to close
the unneeded old channels before invoking those firmware commands.

Today, on capable devices, this is no longer the case. Minimize the
interface down time by deferring the closure of the old channels until
after the activation of the new ones.
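
A simplified sketch of the ordering change (all names here are illustrative,
not the mlx5e symbols):

```
/* Illustrative ordering only; types and helpers do not match the driver. */
struct channels_sketch;
struct priv_sketch {
        struct channels_sketch *channels;
        bool has_tis_tir_td_order;      /* FW capability bit */
};

void deactivate_channels(struct channels_sketch *chs);
void close_channels(struct channels_sketch *chs);
void activate_channels(struct priv_sketch *priv, struct channels_sketch *chs);

static void switch_channels(struct priv_sketch *priv,
                            struct channels_sketch *new_chs)
{
        struct channels_sketch *old_chs = priv->channels;

        deactivate_channels(old_chs);
        if (!priv->has_tis_tir_td_order) {
                /* Old FW: the old channels must be closed before the FW
                 * commands issued while activating the new set.
                 */
                close_channels(old_chs);
                activate_channels(priv, new_chs);
                return;
        }

        /* Capable FW: activate the new set first, then close the old one,
         * shrinking the window in which the interface is down.
         */
        activate_channels(priv, new_chs);
        close_channels(old_chs);
}
```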

Perf numbers:
Measured the number of dropped packets in a simple ping flood test,
during a configuration change operation that switches the number of
channels from 247 to 248.

Before: 71 packets lost
After:  15 packets lost, ~80% saving.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/1761831159-1013140-8-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:04:36 -08:00
Tariq Toukan
911e3a37b0 net/mlx5e: Pass old channels as argument to mlx5e_switch_priv_channels
Let the caller function mlx5e_safe_switch_params() maintain a copy
of the old channels, and pass it to mlx5e_switch_priv_channels().

This is in preparation for the next patch.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/1761831159-1013140-7-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:04:36 -08:00
Tariq Toukan
477c352add net/mlx5e: Do not re-apply TIR loopback configuration if not necessary
On old firmware (tis_tir_td_order=0), a TIR of a transport domain should
either be created after all SQs of the same domain, or TIR.self_lb_en
should be reapplied using MODIFY_TIR, for self loopback filtering to
function correctly.

This is not necessary anymore on new FW (tis_tir_td_order=1), thus
there's no need to call modify_tir operations after creating a new
set of SQs to keep self loopback prevention functional.

Skip these operations.

This saves O(max_num_channels) MODIFY_TIR firmware commands in
operations like interface up or channels configuration change.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/1761831159-1013140-6-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:04:36 -08:00
Tariq Toukan
a4c81e72f1 net/mlx5: IPoIB, set self loopback prevention in TIR init
In IPoIB, the self loopback prevention configuration applied in the
activation stage has two roles: fulfill a firmware requirement of old
firmware (tis_tir_td_order=0), and update the proper configuration, as it
was not set in init.

Here we set the proper configuration in init, to allow skipping the
modify_tirs commands on new firmware in a downstream patch.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/1761831159-1013140-5-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:04:36 -08:00
Tariq Toukan
99b002018f net/mlx5e: Allow setting self loopback prevention bits on TIR init
Until now, IPoIB was creating TIRs without setting self loopback
prevention, then modifying them in the activation stage.

This is a preparation patch that will be used by IPoIB to init TIRs
properly without the need for subsequent calls to modify_tir.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/1761831159-1013140-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:04:36 -08:00
Tariq Toukan
5c51a86122 net/mlx5e: Use TIR API in mlx5e_modify_tirs_lb()
Extend the TIR API and use it in mlx5e_modify_tirs_lb() instead of the
explicit modify_tir code.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/1761831159-1013140-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:04:35 -08:00
Tariq Toukan
091400a5d4 net/mlx5e: Enhance function structures for self loopback prevention application
The re-application of self loopback prevention attributes in TIRs is
necessary on old firmware (where the tis_tir_td_order cap is cleared) after
recreation of SQs.

However, this is not needed in new firmware with tis_tir_td_order=1.

As a preparation patch, enhance the function structures to differentiate
between an explicit loopback prevention configuration apply, and the
re-apply operation required by old firmware.

Loopback selftests should now call mlx5e_modify_tirs_lb() directly, as
their use case is not related to the firmware limitation.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/1761831159-1013140-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:04:35 -08:00
Chu Guangqing
52665fcc22 xen/netfront: Comment Correction: Fix Spelling Error and Description of Queue Quantity Rules
The original comments contained spelling errors and incomplete logical
descriptions, which could easily lead to misunderstandings of the code
logic. The specific modifications are as follows:

Correct the spelling error by changing "inut max" to "but not exceed the
maximum limit";

Add the note "If the user has not specified a value, the default maximum
limit is 8" to clarify the default value logic;

Improve the coherence of the statement to make the queue quantity rules
clearer.

After the modification, the comments can accurately reflect the code
behavior of "taking the smaller value between the number of CPUs and the
default maximum limit of 8 for the number of queues", enhancing code
maintainability.

Signed-off-by: Chu Guangqing <chuguangqing@inspur.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Link: https://patch.msgid.link/20251103032212.2462-1-chuguangqing@inspur.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:01:01 -08:00
Chu Guangqing
96c68954cd net: sungem_phy: Fix a typo error in sungem_phy
Fix a spelling mistake in the word "regularly".

Signed-off-by: Chu Guangqing <chuguangqing@inspur.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251103054443.2878-1-chuguangqing@inspur.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:00:55 -08:00
Chu Guangqing
9781642e58 veth: Fix a typo error in veth
Fix a spelling error in the word "resources".

Signed-off-by: Chu Guangqing <chuguangqing@inspur.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251103055351.3150-1-chuguangqing@inspur.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:00:50 -08:00
Chu Guangqing
2428803d5e gtp: Fix a typo error for size
Fix the spelling error of "size".

Signed-off-by: Chu Guangqing <chuguangqing@inspur.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251103060504.3524-1-chuguangqing@inspur.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:00:43 -08:00
Chu Guangqing
f4b2786fb1 virtio_net: Fix a typo error in virtio_net
Fix the spelling error of "separate".

Signed-off-by: Chu Guangqing <chuguangqing@inspur.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://patch.msgid.link/20251103074305.4727-1-chuguangqing@inspur.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 17:00:37 -08:00
Jakub Kicinski
31113a452a Merge branch 'net-stmmac-multi-interface-stmmac'
Russell King says:

====================
net: stmmac: multi-interface stmmac

This series adds a callback for platform glue to configure the stmmac
core interface mode depending on the PHY interface mode that is being
used. This is currently only called just before the dwmac core is reset
since these signals are latched on reset.

Included in this series are changes to s32 to move its PHY_INTF_SEL_x
definitions out of the way of the dwmac core's signals which has more
entitlement to use this name. We convert dwmac-imx as an example.

Including other platform glue would make this series excessively large,
but once this core code is merged, the individual platform glue updates
can be posted one after another as they will be independent of each
other.

It is hoped that this callback can be used in future to reconfigure the
dwmac core when the interface mode changes to support PHYs that change
their interface mode, but we're nowhere near being able to do that yet.
====================

Link: https://patch.msgid.link/aQiWzyrXU_2hGJ4j@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:26 -08:00
Russell King (Oracle)
eaca1a4dc5 net: stmmac: imx: use ->set_phy_intf_sel()
Rather than placing the phy_intf_sel() setup in the ->init() method,
move it to the new ->set_phy_intf_sel() method.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vFt5C-0000000ChpR-2kAB@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:26 -08:00
Russell King (Oracle)
38cd4e84b3 net: stmmac: imx: cleanup arguments for set_intf_mode() method
Pass the imx_priv_data instead of the plat_stmmacenet_data into the
set_intf_mode() SoC specific methods.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vFt57-0000000ChpL-25kS@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:26 -08:00
Russell King (Oracle)
35103babce net: stmmac: imx: simplify set_intf_mode() implementations
Simplify the set_intf_mode() implementations, testing the phy_intf_sel
value rather than the PHY interface mode.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vFt52-0000000ChpG-1bsd@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:25 -08:00
Russell King (Oracle)
c012710c14 net: stmmac: imx: use stmmac_get_phy_intf_sel()
i.MX implementations other than IMX8DXL involve setting the dwmac core
phy_intf_sel input. Use stmmac_get_phy_intf_sel() to decode the PHY
interface mode to the phy_intf_sel value, validating the result, and
passing it into the implementation specific .set_intf_mode() method
rather than each .set_intf_mode() method doing this.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vFt4x-0000000ChpA-1Edr@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:25 -08:00
Russell King (Oracle)
d73c1dccfb net: stmmac: imx: use FIELD_PREP()/FIELD_GET() for PHY_INTF_SEL_x
Use FIELD_PREP()/FIELD_GET() in the functions to construct the PHY
interface selection bitfield or to extract its value.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vFt4s-0000000Chp4-0kwf@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:25 -08:00
Russell King (Oracle)
8233cc4397 net: stmmac: imx: convert to PHY_INTF_SEL_xxx
Convert dwmac-imx to use the PHY_INTF_SEL_xxx definitions rather than
constants via:
- ensuring that the prefix for the MASK and value definitions is the
  same.
- using FIELD_PREP() to shift the PHY_INTF_SEL_xxx definition to the
  appropriate bitfield.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vFt4n-0000000Choy-0IeG@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:25 -08:00
Russell King (Oracle)
1b6aa81c85 net: stmmac: add support for configuring the phy_intf_sel inputs
When dwmac is synthesised with support for multiple PHY interfaces, the
core provides phy_intf_sel inputs, sampled on reset, to configure the
PHY facing interface. Use stmmac_get_phy_intf_sel() in core code to
determine the dwmac phy_intf_sel input value, and provide a new
platform method called with this value just before we issue a soft
reset to the dwmac core.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vFt4h-0000000Chos-3wxX@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:25 -08:00
Russell King (Oracle)
b459790d3f net: stmmac: add stmmac_get_phy_intf_sel()
Provide a function to translate the PHY interface mode to the
phy_intf_sel pin configuration for dwmac1000 and dwmac4 cores that
support multiple interfaces. We currently handle MII, GMII, RGMII,
SGMII, RMII and REVMII, but not TBI, RTBI nor SMII as drivers do not
appear to use these three and the driver doesn't currently support
these.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vFt4c-0000000Choe-3SII@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:25 -08:00
Russell King (Oracle)
4a4692e909 net: stmmac: add phy_intf_sel and ACTPHYIF definitions
Add definitions for the active PHY interface found in DMA hardware
feature register 0, and also used to configure the core in multi-
interface designs via phy_intf_sel.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1vFt4X-0000000ChoY-30p9@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:25 -08:00
Russell King (Oracle)
553f23d195 net: stmmac: s32: move PHY_INTF_SEL_x definitions out of the way
S32's PHY_INTF_SEL_x definitions conflict with those for the dwmac
cores as they use a different bitmapping. Add a S32 prefix so that
they are unique.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Jan Petrous (OSS) <jan.petrous@oss.nxp.com>
Link: https://patch.msgid.link/E1vFt4S-0000000ChoS-2Ahi@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:25 -08:00
Russell King (Oracle)
dec568a36f net: stmmac: imx: use phylink's interface mode for set_clk_tx_rate()
imx_dwmac_set_clk_tx_rate() is passed the interface mode from phylink
which will be the same as plat_dat->phy_interface. Use the passed-in
interface mode rather than plat_dat->phy_interface.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vFt4N-0000000ChoM-1llp@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:21:17 -08:00
Eric Dumazet
46173144e0 net: mark deliver_skb() as unlikely and not inlined
deliver_skb() should not be inlined as it is not called
in the fast path.

Add unlikely() clauses to give the compiler hints about this fact.

Before this patch:

size net/core/dev.o
   text	   data	    bss	    dec	    hex	filename
 121794	  13330	    176	 135300	  21084	net/core/dev.o

__netif_receive_skb_core() size on x86_64 : 4080 bytes.

After:

size net/core/dev.o
  text	   data	    bss	    dec	    hex	filename
 120330	  13338	    176	 133844	  20ad4	net/core/dev.o

__netif_receive_skb_core() size on x86_64 : 2781 bytes.
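
For reference, a minimal sketch of the pattern (not the exact
net/core/dev.c code):

```
/* Sketch of the noinline + unlikely() pattern; simplified from dev.c. */
static noinline int deliver_skb(struct sk_buff *skb,
                                struct packet_type *pt_prev,
                                struct net_device *orig_dev)
{
        refcount_inc(&skb->users);
        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static int rx_core_sketch(struct sk_buff *skb, struct packet_type *pt_prev,
                          struct net_device *orig_dev)
{
        int ret = 0;

        /* The extra delivery is the cold path; keep the hot loop compact. */
        if (unlikely(pt_prev))
                ret = deliver_skb(skb, pt_prev, orig_dev);
        return ret;
}
```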

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251103165256.1712169-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:08:25 -08:00
Adrian Moreno
105bae3218 rtnetlink: honor RTEXT_FILTER_SKIP_STATS in IFLA_STATS
Gathering interface statistics can be a relatively expensive operation
on certain systems as it requires iterating over all the cpus.

RTEXT_FILTER_SKIP_STATS was first introduced [1] to skip AF_INET6
statistics from interface dumps and it was then extended [2] to
also exclude IFLA_VF_INFO.

The semantics of the flag do not seem to be limited to AF_INET6
or VF statistics, and having a way to query the interface status
(e.g. carrier, address) without retrieving its statistics seems
reasonable. So this patch extends the use of RTEXT_FILTER_SKIP_STATS
to also affect IFLA_STATS.

[1] https://lore.kernel.org/all/20150911204848.GC9687@oracle.com/
[2] https://lore.kernel.org/all/20230611105108.122586-1-gal@nvidia.com/
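
For context, a minimal userspace sketch of a link dump request that sets
this flag (RTM_GETLINK, IFLA_EXT_MASK and RTEXT_FILTER_SKIP_STATS are the
existing netlink pieces; the surrounding struct is just an example):

```
#include <string.h>
#include <sys/socket.h>
#include <linux/rtnetlink.h>

struct link_dump_req {
        struct nlmsghdr nlh;
        struct ifinfomsg ifm;
        struct rtattr ext_req;
        __u32 ext_filter_mask;
};

static void build_req(struct link_dump_req *req)
{
        memset(req, 0, sizeof(*req));
        req->nlh.nlmsg_len = sizeof(*req);
        req->nlh.nlmsg_type = RTM_GETLINK;
        req->nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
        req->ifm.ifi_family = AF_UNSPEC;
        req->ext_req.rta_type = IFLA_EXT_MASK;
        req->ext_req.rta_len = RTA_LENGTH(sizeof(__u32));
        /* With this patch, the kernel also omits IFLA_STATS from the reply. */
        req->ext_filter_mask = RTEXT_FILTER_SKIP_STATS;
}
```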

Signed-off-by: Adrian Moreno <amorenoz@redhat.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Link: https://patch.msgid.link/20251103154006.1189707-1-amorenoz@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-04 16:07:37 -08:00
Andrii Nakryiko
4cb4897bb4 Merge branch 'multi-split-btf-fixes-and-test'
Alan Maguire says:

====================
Multi-split BTF fixes and test

This small series consists of a fix to multi-split BTF parsing
(patch 1) and a test which exercises (multi-)split BTF parsing
(patch 2).

Changes since v3 [1]
- add asserts to ensure number of types in original and parsed
  BTF are identical, and the calls to btf__type_by_id() return
  valid pointers (code review bot, patch 2)

Changes since v2 [2]

- fix Fixes: tag formatting (Andrii, patch 1)
- BPF code-review bot saw we were doing ASSERT_OK_PTR() on wrong
  BTF (not multisplit) in patch 2
- ensure cleanup is correctly handled for BTF, unlink in split
  tests (Andrii, patch 2)

Changes since v1 [3]

- BPF code-review bot spotted another place where the string offset
  needed to be adjusted based upon the base start string offset + header
  string offset.
- added selftests to extend split BTF testing to parsing

[1] https://lore.kernel.org/bpf/20251028225544.1312356-1-alan.maguire@oracle.com/
[2] https://lore.kernel.org/bpf/20251028155709.1265445-1-alan.maguire@oracle.com/
[3] https://lore.kernel.org/bpf/20251023142812.258870-1-alan.maguire@oracle.com/
====================

Link: https://patch.msgid.link/20251104203309.318429-1-alan.maguire@oracle.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
2025-11-04 13:44:13 -08:00
Alan Maguire
cc77a20389 selftests/bpf: Test parsing of (multi-)split BTF
Write raw BTF to files, parse it and compare to original;
this allows us to test parsing of (multi-)split BTF code.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251104203309.318429-3-alan.maguire@oracle.com
2025-11-04 13:44:12 -08:00
Alan Maguire
4f596acc26 libbpf: Fix parsing of multi-split BTF
When creating multi-split BTF, we correctly set the start string offset
to be the size of the base string section plus the base BTF start
string offset; the latter is needed for multi-split BTF since the
offset is non-zero there.

Unfortunately the BTF parsing case also needed that logic, and it was
missed.
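
The offset rule, as a small self-contained sketch (names are approximations
of libbpf internals, not the exact fields):

```
/* Sketch of the start string offset rule for (multi-)split BTF. */
static unsigned int split_btf_start_str_off(unsigned int base_str_sec_len,
                                            unsigned int base_start_str_off)
{
        /* String offsets in split BTF continue after the base string
         * section.  For multi-split BTF the base itself already starts at
         * a non-zero offset, so both terms are needed -- on the parsing
         * path as well as when creating split BTF.
         */
        return base_str_sec_len + base_start_str_off;
}
```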

Fixes: 4e29128a9a ("libbpf/btf: Fix string handling to support multi-split BTF")
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251104203309.318429-2-alan.maguire@oracle.com
2025-11-04 13:44:12 -08:00
Jens Axboe
f68ff6bc0d Merge branch 'autopi-deadlock' into for-6.19/block
Currently the automatic block layer PI generation allocates the integrity
buffer using kmalloc, and thus could deadlock or fail an I/O request due
to memory pressure.

Fix this by adding a mempool, and capping the maximum I/O size on PI
capable devices to not exceed the allocation size of the mempool.

Link: https://lore.kernel.org/linux-block/20251103101653.2083310-1-hch@lst.de/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* autopi-deadlock:
  block: make bio auto-integrity deadlock safe
  block: blocking mempool_alloc doesn't fail
2025-11-04 12:43:02 -07:00
Christoph Hellwig
ec7f31b2a2 block: make bio auto-integrity deadlock safe
The current block layer automatic integrity protection allocates the
actual integrity buffer, which has three problems:

 - because it happens at the bottom of the I/O stack and doesn't use a
   mempool, it can deadlock under load
 - because the data size in a bio is almost unbounded when using large
   folios, it can relatively easily exceed the maximum kmalloc size
 - even when it does not exceed the maximum kmalloc size, it could
   exceed the maximum segment size of the device

Fix this by limiting the I/O size so that we can allocate at least a
2MiB integrity buffer, i.e. 128MiB for 8 byte PI and 512 byte integrity
intervals, and create a mempool as a last resort for this maximum size,
mirroring the scheme used for bvecs.  As a nice upside none of this
can fail now, so we remove the error handling and open code the
trivial addition of the bip vec.

The new allocation helpers sit outside of bio-integrity-auto.c because
I plan to reuse them for file system based PI in the near future.
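
A rough sketch of the allocation scheme described above (names are
illustrative, not the block layer symbols):

```
/* Illustrative last-resort mempool pattern, mirroring the bvec scheme. */
static void *pi_alloc_buf(unsigned int len, gfp_t gfp, mempool_t *pool,
                          bool *from_pool)
{
        void *buf;

        /* Try a regular allocation first... */
        buf = kmalloc(len, gfp | __GFP_NOWARN);
        if (buf) {
                *from_pool = false;
                return buf;
        }

        /* ...and fall back to the mempool.  The maximum I/O size is capped
         * so 'len' never exceeds the pool's element size, and a blocking
         * mempool_alloc() cannot fail, so callers need no error handling.
         */
        *from_pool = true;
        return mempool_alloc(pool, gfp);
}
```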

Fixes: 7ba1ba12ee ("block: Block layer data integrity support")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-04 12:41:50 -07:00
Christoph Hellwig
eef09f742b block: blocking mempool_alloc doesn't fail
So remove the error check for it in bio_integrity_prep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-04 12:40:46 -07:00
Donald Hunter
b3387b3122 docs/bpf: Add missing BPF k/uprobe program types to docs
Update the table of program types in the libbpf docs with the missing
k/uprobe multi and session program types.

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251029180932.98038-1-donald.hunter@gmail.com
2025-11-04 10:24:43 -08:00
Jianyun Gao
efa47566ad libbpf: Update the comment to remove the reference to the deprecated interface bpf_program__load().
Commit be2f2d1680 ("libbpf: Deprecate bpf_program__load() API") marked
bpf_program__load() as deprecated starting with libbpf v0.6. Later,
commit 146bf811f5 ("libbpf: remove most other deprecated high-level
APIs") actually removed the bpf_program__load() implementation and
related old high-level APIs.

This patch updates the comment in bpf_program__set_attach_target() to
remove the reference to the deprecated interface bpf_program__load().

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251103120727.145965-1-jianyungao89@gmail.com
2025-11-04 10:22:40 -08:00
Jianyun Gao
74bd7bc068 libbpf: Complete the missing @param and @return tags in btf.h
Complete the missing @param and @return tags in the Doxygen comments of
the btf.h file.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251103115836.144339-1-jianyungao89@gmail.com
2025-11-04 10:19:57 -08:00
Caleb Sander Mateos
4b25b75c30 io_uring/memmap: return bool from io_mem_alloc_compound()
io_mem_alloc_compound() returns either ERR_PTR(-ENOMEM) or a virtual
address for the allocated memory, but its caller just checks whether the
result is an error. Return a bool success value instead.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-04 09:32:09 -07:00
Jens Axboe
ffce324364 io_uring/cancel: move cancelation code from io_uring.c to cancel.c
There's a bunch of code strictly dealing with cancelations, and that
code really belongs in cancel.c rather than in the core io_uring.c file.
Move the code there. Mostly mechanical, only real oddity here is that
struct io_defer_entry now needs to be visible across both io_uring.c
and cancel.c.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-04 09:32:09 -07:00
Jens Axboe
01e019b2a3 io_uring/cancel: move __io_uring_cancel() into cancel.c
Yet another function that should be in cancel.c, move it over.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-04 09:32:08 -07:00
Jens Axboe
0d677936d6 io_uring/cancel: move request/task cancelation logic into cancel.c
Move io_match_task_safe() and helpers into cancel.c, where it belongs.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-04 09:32:08 -07:00
Jens Axboe
bc82b02218 io_uring/memmap: remove dead io_create_region_mmap_safe() declaration
No longer used and doesn't even exist, kill it from the memmap header
file.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-04 09:32:08 -07:00
Paolo Abeni
255d75ef02 Merge branch 'xsk-minor-optimizations-around-locks'
Jason Xing says:

====================
xsk: minor optimizations around locks

Two optimizations regarding xsk_tx_list_lock and cq_lock can yield a
performance increase by avoiding frequently disabling and enabling
interrupts.
====================

Link: https://patch.msgid.link/20251030000646.18859-1-kerneljasonxing@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-04 16:10:55 +01:00
Jason Xing
30ed05adca xsk: use a smaller new lock for shared pool case
- Split cq_lock into two smaller locks: cq_prod_lock and
  cq_cached_prod_lock
- Avoid disabling/enabling interrupts in the hot xmit path

In both the xsk_cq_cancel_locked() and xsk_cq_reserve_locked() functions,
the race condition is only between multiple xsks sharing the same
pool. They all run in process context rather than interrupt context,
so the new small lock named cq_cached_prod_lock can be used without
disabling interrupts.

While cq_cached_prod_lock ensures the exclusive modification of
@cached_prod, cq_prod_lock in xsk_cq_submit_addr_locked() only cares
about @producer and the corresponding @desc. Neither of them needs to
be consistent with @cached_prod protected by cq_cached_prod_lock.
That's the reason why the previous big lock can be split into two
smaller ones. Please note that the SPSC rule is about the global state
of producer and consumer, which can affect both layers, rather than the
local or cached ones.

Frequently disabling and enabling interrupts is very time consuming
in some cases, especially at a per-descriptor granularity, which now
can be avoided after this optimization, even when the pool is shared by
multiple xsks.

With this patch, the performance number[1] goes from 1,872,565 pps
to 1,961,009 pps, a minor rise of around 5%.

[1]: taskset -c 1 ./xdpsock -i enp2s0f1 -q 0 -t -S -s 64
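
A simplified sketch of the resulting two-lock split (only the lock names
come from this description; the structure and fields are illustrative):

```
/* Illustrative only: a pared-down completion queue with the two locks. */
struct cq_sketch {
        spinlock_t cq_prod_lock;        /* protects producer + descriptors */
        spinlock_t cq_cached_prod_lock; /* protects cached_prod only */
        u32 cached_prod, cached_cons, size, prod;
        u64 *ring;
};

static int cq_reserve(struct cq_sketch *cq)
{
        int ret = 0;

        /* Process context only: plain spin_lock, no irqsave/irqrestore. */
        spin_lock(&cq->cq_cached_prod_lock);
        if (cq->cached_prod - cq->cached_cons >= cq->size)
                ret = -ENOSPC;
        else
                cq->cached_prod++;
        spin_unlock(&cq->cq_cached_prod_lock);
        return ret;
}

static void cq_submit(struct cq_sketch *cq, u64 addr)
{
        unsigned long flags;

        /* Completion may run from softirq context, so keep irqsave here. */
        spin_lock_irqsave(&cq->cq_prod_lock, flags);
        cq->ring[cq->prod++ & (cq->size - 1)] = addr;
        spin_unlock_irqrestore(&cq->cq_prod_lock, flags);
}
```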

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://patch.msgid.link/20251030000646.18859-3-kerneljasonxing@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-04 16:10:53 +01:00
Jason Xing
4622800434 xsk: do not enable/disable irq when grabbing/releasing xsk_tx_list_lock
Commit ac98d8aab6 ("xsk: wire upp Tx zero-copy functions"), which
originally introduced this lock, put the deletion process in
sk_destruct, which can obviously run in irq context, so the
xxx_irqsave()/xxx_irqrestore() pair was used. But later another
commit, 541d7fdd76 ("xsk: proper AF_XDP socket teardown ordering"),
moved the deletion into xsk_release(), which only happens in process
context. It means that since that commit, the pair is no longer
necessary.

Now, there are two places that use this xsk_tx_list_lock and both only
run in process context. So avoid manipulating the irq state there.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://patch.msgid.link/20251030000646.18859-2-kerneljasonxing@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-04 16:10:52 +01:00
Tonghao Zhang
27cb3de7f4 net: add net cookie for net device trace events
In a multi-network card or container environment, this is needed in order
to differentiate between trace events relating to net devices that exist
in different network namespaces and share the same name.

for xmit_timeout trace events:
[002] ..s1.  1838.311662: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=3
[007] ..s1.  1839.335650: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=4100
[007] ..s1.  1844.455659: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=3
[002] ..s1.  1850.087647: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=3

Cc: Eran Ben Elisha <eranbe@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Simon Horman <horms@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Suggested-by: Ido Schimmel <idosch@idosch.org>
Signed-off-by: Tonghao Zhang <tonghao@bamaicloud.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20251028043244.82288-1-tonghao@bamaicloud.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-11-04 09:59:19 +01:00
Jakub Kicinski
9e8a443401 Merge branch 'ethtool-introduce-phy-mse-diagnostics-uapi-and-drivers'
Oleksij Rempel says:

====================
ethtool: introduce PHY MSE diagnostics UAPI and drivers

This series introduces a generic kernel-userspace API for retrieving PHY
Mean Square Error (MSE) diagnostics, together with netlink integration,
a fast-path reporting hook in LINKSTATE_GET, and initial driver
implementations for the KSZ9477 and DP83TD510E PHYs.

MSE is defined by the OPEN Alliance "Advanced diagnostic features for
100BASE-T1 automotive Ethernet PHYs" specification [1] as a measure of
slicer error rate, typically used internally to derive the Signal
Quality Indicator (SQI). While SQI is useful as a normalized quality
index, it hides raw measurement data, varies in scaling and thresholds
between vendors, and may not indicate certain failure modes - for
example, cases where autonegotiation would fail even though SQI reports
a good link. In practice, such scenarios can only be investigated in
fixed-link mode; here, MSE can provide an empirically estimated value
indicating conditions under which autonegotiation would not succeed.

Example output with current implementation:
root@DistroKit:~ ethtool lan1
Settings for lan1:
...
        Speed: 1000Mb/s
        Duplex: Full
...
        Link detected: yes
        SQI: 5/7
        MSE: 3/127 (channel: worst)

root@DistroKit:~ ethtool --show-mse lan1
MSE diagnostics for lan1:
MSE Configuration:
        Max Average MSE: 127
        Refresh Rate: 2000000 ps
        Symbols per Sample: 250
        Supported capabilities: average channel-a channel-b channel-c
                                channel-d worst

MSE Snapshot (Channel: a):
        Average MSE: 4

MSE Snapshot (Channel: b):
        Average MSE: 3

MSE Snapshot (Channel: c):
        Average MSE: 2

MSE Snapshot (Channel: d):
        Average MSE: 3

[1] https://opensig.org/wp-content/uploads/2024/01/Advanced_PHY_features_for_automotive_Ethernet_V1.0.pdf
====================

Link: https://patch.msgid.link/20251027122801.982364-1-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 18:32:33 -08:00
Oleksij Rempel
fd93ed77ef net: phy: dp83td510: add MSE interface support for 10BASE-T1L
Implement get_mse_capability() and get_mse_snapshot() for the DP83TD510E
to expose its Mean Square Error (MSE) register via the new PHY MSE
UAPI.

The DP83TD510E does not document any peak MSE values; it only exposes
a single average MSE register used internally to derive SQI. This
implementation therefore advertises only PHY_MSE_CAP_AVG, along with
LINK and channel-A selectors. Scaling is fixed to 0xFFFF, and the
refresh interval/number of symbols are estimated from 10BASE-T1L
symbol rate (7.5 MBd) and typical diagnostic intervals (~1 ms).

For 10BASE-T1L deployments, SQI is a reliable indicator of link
modulation quality once the link is established, but it does not
indicate whether autonegotiation pulses will be correctly received
in marginal conditions. MSE provides a direct measurement of slicer
error rate that can be used to evaluate if autonegotiation is likely
to succeed under a given cable length and condition. In practice,
testing such scenarios often requires forcing a fixed-link setup to
isolate MSE behaviour from the autonegotiation process.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251027122801.982364-5-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 18:32:27 -08:00
Oleksij Rempel
335a9660e1 net: phy: micrel: add MSE interface support for KSZ9477 family
Implement the get_mse_capability() and get_mse_snapshot() PHY driver ops
for KSZ9477-series integrated PHYs to demonstrate the new PHY MSE
UAPI.

These PHYs do not expose a documented direct MSE register, but the
Signal Quality Indicator (SQI) registers are derived from the
internal MSE computation. This hook maps SQI readings into the MSE
interface so that tooling can retrieve the raw value together with
metadata for correct interpretation in userspace.

Behaviour:
  - For 1000BASE-T, report per-channel (A–D) values and support a
    WORST channel selector.
  - For 100BASE-TX, only LINK-wide measurements are available.
  - Report average MSE only, with a max scale based on
    KSZ9477_MMD_SQI_MASK and a fixed refresh rate of 2 µs.

This mapping differs from the OPEN Alliance SQI definition, which
assigns thresholds such as pre-fail indices; the MSE interface
instead provides the raw measurement, leaving interpretation to
userspace.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251027122801.982364-4-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 18:32:27 -08:00
Oleksij Rempel
e6e93fb013 ethtool: netlink: add ETHTOOL_MSG_MSE_GET and wire up PHY MSE access
Introduce the userspace entry point for PHY MSE diagnostics via
ethtool netlink. This exposes the core API added previously and
returns both capability information and one or more snapshots.

Userspace sends ETHTOOL_MSG_MSE_GET. The reply carries:
- ETHTOOL_A_MSE_CAPABILITIES: scale limits and timing information
- ETHTOOL_A_MSE_CHANNEL_* nests: one or more snapshots (per-channel
  if available, otherwise WORST, otherwise LINK)

Link down returns -ENETDOWN.

Changes:
  - YAML: add attribute sets (mse, mse-capabilities, mse-snapshot)
    and the mse-get operation
  - UAPI (generated): add ETHTOOL_A_MSE_* enums and message IDs,
    ETHTOOL_MSG_MSE_GET/REPLY
  - ethtool core: add net/ethtool/mse.c implementing the request,
    register genl op, and hook into ethnl dispatch
  - docs: document MSE_GET in ethtool-netlink.rst

The include/uapi/linux/ethtool_netlink_generated.h is generated
from Documentation/netlink/specs/ethtool.yaml.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20251027122801.982364-3-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 18:32:27 -08:00
Oleksij Rempel
abcf6eef90 net: phy: introduce internal API for PHY MSE diagnostics
Add the base infrastructure for Mean Square Error (MSE) diagnostics,
as proposed by the OPEN Alliance "Advanced diagnostic features for
100BASE-T1 automotive Ethernet PHYs" [1] specification.

The OPEN Alliance spec defines only average MSE and average peak MSE
over a fixed number of symbols. However, other PHYs, such as the
KSZ9131, additionally expose a worst-peak MSE value latched since the
last channel capture. This API accounts for such vendor extensions by
adding a distinct capability bit and snapshot field.

Channel-to-pair mapping is normally straightforward, but in some cases
(e.g. 100BASE-TX with MDI-X resolution unknown) the mapping is ambiguous.
If hardware does not expose MDI-X status, the exact pair cannot be
determined. To avoid returning misleading per-channel data in this case,
a LINK selector is defined for aggregate MSE measurements.

All investigated devices differ in MSE capabilities, such
as sample rate, number of analyzed symbols, and scaling factors.
For example, the KSZ9131 uses different scaling for MSE and pMSE.
To make this visible to callers, scale limits and timing information
are returned via get_mse_capability().

Some PHYs sample very few symbols at high frequency (e.g. 2 us update
rate). To cover such cases and allow for future high-speed PHYs with
even shorter intervals, the refresh rate is reported as u64 in
picoseconds.

This patch introduces the internal PHY API for Mean Square Error
diagnostics. It defines new kernel-side data types and driver hooks:

  - struct phy_mse_capability: describes supported metrics, scale
    limits, update interval, and sampling length.
  - struct phy_mse_snapshot: holds one correlated measurement set.
  - New phy_driver ops: `get_mse_capability()` and `get_mse_snapshot()`.

These definitions form the core kernel API. No user-visible interfaces
are added in this commit.
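
Roughly, the new kernel-side pieces could be pictured like this (a sketch
based on the names in this description; the exact members and types are
assumptions):

```
/* Sketch of the API shape; field names are assumed, not quoted. */
struct phy_mse_capability {
        u32 supported;          /* PHY_MSE_CAP_* bits (average, peak, worst, ...) */
        u32 max_average_mse;    /* scale limit for the average MSE value */
        u64 refresh_rate_ps;    /* update interval in picoseconds (hence u64) */
        u32 num_symbols;        /* symbols analysed per sample */
};

struct phy_mse_snapshot {       /* one correlated measurement set */
        u32 average_mse;
        u32 peak_mse;
        u32 worst_peak_mse;
};

/* New phy_driver ops, as named above (signatures are an approximation). */
struct phy_mse_ops_sketch {
        int (*get_mse_capability)(struct phy_device *phydev,
                                  struct phy_mse_capability *cap);
        int (*get_mse_snapshot)(struct phy_device *phydev,
                                int channel,    /* A..D, WORST or LINK selector */
                                struct phy_mse_snapshot *snapshot);
};
```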

Standardization notes:
OPEN Alliance defines presence and interpretation of some metrics but does
not fix numeric scales or sampling internals:

- SQI (3-bit, 0..7) is mandatory; correlation to SNR/BER is informative
  (OA 100BASE-T1 TC1 v1.0 6.1.2; OA 1000BASE-T1 TC12 v2.2 6.1.2).
- MSE is optional; OA recommends 2^16 symbols and scaling to 0..511,
  with a worst-case latch since last read (OA 100BASE-T1 TC1 v1.0 6.1.1; OA
  1000BASE-T1 TC12 v2.2 6.1.1). Refresh is recommended (~0.8-2.0 ms for
  100BASE-T1; ~80-200 us for 1000BASE-T1). Exact scaling/time windows
  are vendor-specific.
- Peak MSE (pMSE) is defined only for 100BASE-T1 as optional, e.g.
  128-symbol sliding window with 8-bit range and worst-case latch (OA
  100BASE-T1 TC1 v1.0 6.1.3).

Therefore this API exposes which measures and selectors a PHY supports,
and documents where behavior is standard-referenced vs vendor-specific.

[1] <https://opensig.org/wp-content/uploads/2024/01/
     Advanced_PHY_features_for_automotive_Ethernet_V1.0.pdf>

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251027122801.982364-2-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 18:32:27 -08:00
Jakub Kicinski
ff371a7e73 Merge branch 'add-support-to-do-threaded-napi-busy-poll'
Samiullah Khawaja says:

====================
Add support to do threaded napi busy poll

Extend the already existing support of threaded napi poll to do continuous
busy polling.

This is used to continuously poll a napi to fetch descriptors
from the backing RX/TX queues for low latency applications. Allow enabling
threaded busypoll using netlink so it can be turned on for a set of
dedicated napis.

Once enabled user can fetch the PID of the kthread doing NAPI polling
and set affinity, priority and scheduler for it depending on the
low-latency requirements.

Extend the netlink interface to allow enabling/disabling threaded
busypolling at individual napi level.

We use this for our AF_XDP based hard low-latency usecase with usecs
level latency requirement. For our usecase we want low jitter and stable
latency at P99.

Following is an analysis and comparison of available (and compatible)
busy poll interfaces for a low latency usecase with stable P99. This can
be suitable for applications that want very low latency at the expense
of cpu usage and efficiency.

Already existing APIs (SO_BUSYPOLL and epoll) allow busy polling a NAPI
backing a socket, but the missing piece is a mechanism to busy poll a
NAPI instance in a dedicated thread while ignoring available events or
packets, regardless of the userspace API. Most existing mechanisms are
designed to work in a pattern where you poll until new packets or events
are received, after which userspace is expected to handle them.

As a result, one has to hack together a solution using a mechanism
intended to receive packets or events, not to simply NAPI poll. NAPI
threaded busy polling, on the other hand, provides this capability
natively, independent of any userspace API. This makes it really easy to
set up and manage.
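
For contrast, the per-socket knob that the existing mechanisms build on
looks roughly like this (a minimal userspace sketch; the 400 usec budget is
just an example value):

```
#include <sys/socket.h>

static int enable_socket_busy_poll(int fd)
{
        int usecs = 400;        /* example busy-poll budget in microseconds */

        /* Each blocking receive/poll on 'fd' may then busy poll its NAPI
         * for up to 'usecs' before sleeping -- but only for the duration
         * of that syscall, unlike a dedicated kthread polling continuously.
         */
        return setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs));
}
```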

For analysis we use an AF_XDP based benchmarking tool `xsk_rr`. The
description of the tool and how it tries to simulate the real workload
is following,

- It sends UDP packets between 2 machines.
- The client machine sends packets at a fixed frequency. To maintain the
  frequency of the packet being sent, we use open-loop sampling. That is
  the packets are sent in a separate thread.
- The server replies to the packet inline by reading the pkt from the
  recv ring and replies using the tx ring.
- To simulate the application processing time, we use a configurable
  delay in usecs on the client side after a reply is received from the
  server.

The xsk_rr tool is posted separately as an RFC for tools/testing/selftest.

We use this tool with following napi polling configurations,

- Interrupts only
- SO_BUSYPOLL (inline in the same thread where the client receives the
  packet).
- SO_BUSYPOLL (separate thread and separate core)
- Threaded NAPI busypoll

System is configured using following script in all 4 cases,

```
echo 0 | sudo tee /sys/class/net/eth0/threaded
echo 0 | sudo tee /proc/sys/kernel/timer_migration
echo off | sudo tee  /sys/devices/system/cpu/smt/control

sudo ethtool -L eth0 rx 1 tx 1
sudo ethtool -G eth0 rx 1024

echo 0 | sudo tee /proc/sys/net/core/rps_sock_flow_entries
echo 0 | sudo tee /sys/class/net/eth0/queues/rx-0/rps_cpus

 # pin IRQs on CPU 2
IRQS="$(gawk '/eth0-(TxRx-)?1/ {match($1, /([0-9]+)/, arr); \
				print arr[0]}' < /proc/interrupts)"
for irq in "${IRQS}"; \
	do echo 2 | sudo tee /proc/irq/$irq/smp_affinity_list; done

echo -1 | sudo tee /proc/sys/kernel/sched_rt_runtime_us

for i in /sys/devices/virtual/workqueue/*/cpumask; \
			do echo $i; echo 1,2,3,4,5,6 > $i; done

if [[ -z "$1" ]]; then
  echo 400 | sudo tee /proc/sys/net/core/busy_read
  echo 100 | sudo tee /sys/class/net/eth0/napi_defer_hard_irqs
  echo 15000   | sudo tee /sys/class/net/eth0/gro_flush_timeout
fi

sudo ethtool -C eth0 adaptive-rx off adaptive-tx off rx-usecs 0 tx-usecs 0

if [[ "$1" == "enable_threaded" ]]; then
  echo 0 | sudo tee /proc/sys/net/core/busy_poll
  echo 0 | sudo tee /proc/sys/net/core/busy_read
  echo 100 | sudo tee /sys/class/net/eth0/napi_defer_hard_irqs
  echo 15000 | sudo tee /sys/class/net/eth0/gro_flush_timeout
  NAPI_ID=$(ynl --family netdev --output-json --do queue-get \
    --json '{"ifindex": '${IFINDEX}', "id": '0', "type": "rx"}' | jq '."napi-id"')

  ynl --family netdev --json '{"id": "'${NAPI_ID}'", "threaded": "busy-poll"}'

  NAPI_T=$(ynl --family netdev --output-json --do napi-get \
    --json '{"id": "'$NAPI_ID'"}' | jq '."pid"')

  sudo chrt -f  -p 50 $NAPI_T

  # pin threaded poll thread to CPU 2
  sudo taskset -pc 2 $NAPI_T
fi

if [[ "$1" == "enable_interrupt" ]]; then
  echo 0 | sudo tee /proc/sys/net/core/busy_read
  echo 0 | sudo tee /sys/class/net/eth0/napi_defer_hard_irqs
  echo 15000 | sudo tee /sys/class/net/eth0/gro_flush_timeout
fi
```

To enable various configurations, script can be run as following,

- Interrupt Only
  ```
  <script> enable_interrupt
  ```

- SO_BUSYPOLL (no arguments to script)
  ```
  <script>
  ```

- NAPI threaded busypoll
  ```
  <script> enable_threaded
  ```

Once configured, the workload is run with various configurations using
following commands. Set period (1/frequency) and delay in usecs to
produce results for packet frequency and application processing delay.

 ## Interrupt Only and SO_BUSYPOLL (inline)

- Server
```
sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \
	-D <IP-dest> -S <IP-src> -M <MAC-dst> -m <MAC-src> -p 54321 -h -v
```

- Client
```
sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \
	-S <IP-src> -D <IP-dest> -m <MAC-src> -M <MAC-dst> -p 54321 \
	-P <Period-usecs> -d <Delay-usecs>  -T -l 1 -v
```

 ## SO_BUSYPOLL(done in separate core using recvfrom)

Argument -t spawns a separate thread and continuously calls recvfrom.

- Server
```
sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \
	-D <IP-dest> -S <IP-src> -M <MAC-dst> -m <MAC-src> -p 54321 \
	-h -v -t
```

- Client
```
sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \
	-S <IP-src> -D <IP-dest> -m <MAC-src> -M <MAC-dst> -p 54321 \
	-P <Period-usecs> -d <Delay-usecs>  -T -l 1 -v -t
```

 ## NAPI Threaded Busy Poll

Argument -n skips the recvfrom call as there is no recv kick needed.

- Server
```
sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \
	-D <IP-dest> -S <IP-src> -M <MAC-dst> -m <MAC-src> -p 54321 \
	-h -v -n
```

- Client
```
sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \
	-S <IP-src> -D <IP-dest> -m <MAC-src> -M <MAC-dst> -p 54321 \
	-P <Period-usecs> -d <Delay-usecs>  -T -l 1 -v -n
```

| Experiment | interrupts | SO_BUSYPOLL | SO_BUSYPOLL(separate) | NAPI threaded |
|---|---|---|---|---|
| 12 Kpkt/s + 0us delay | | | | |
|  | p5: 12700 | p5: 12900 | p5: 13300 | p5: 12800 |
|  | p50: 13100 | p50: 13600 | p50: 14100 | p50: 13000 |
|  | p95: 13200 | p95: 13800 | p95: 14400 | p95: 13000 |
|  | p99: 13200 | p99: 13800 | p99: 14400 | p99: 13000 |
| 32 Kpkt/s + 30us delay | | | | |
|  | p5: 19900 | p5: 16600 | p5: 13100 | p5: 12800 |
|  | p50: 21100 | p50: 17000 | p50: 13700 | p50: 13000 |
|  | p95: 21200 | p95: 17100 | p95: 14000 | p95: 13000 |
|  | p99: 21200 | p99: 17100 | p99: 14000 | p99: 13000 |
| 125 Kpkt/s + 6us delay | | | | |
|  | p5: 14600 | p5: 17100 | p5: 13300 | p5: 12900 |
|  | p50: 15400 | p50: 17400 | p50: 13800 | p50: 13100 |
|  | p95: 15600 | p95: 17600 | p95: 14000 | p95: 13100 |
|  | p99: 15600 | p99: 17600 | p99: 14000 | p99: 13100 |
| 12 Kpkt/s + 78us delay | | | | |
|  | p5: 14100 | p5: 16700 | p5: 13200 | p5: 12600 |
|  | p50: 14300 | p50: 17100 | p50: 13900 | p50: 12800 |
|  | p95: 14300 | p95: 17200 | p95: 14200 | p95: 12800 |
|  | p99: 14300 | p99: 17200 | p99: 14200 | p99: 12800 |
| 25 Kpkt/s + 38us delay | | | | |
|  | p5: 19900 | p5: 16600 | p5: 13000 | p5: 12700 |
|  | p50: 21000 | p50: 17100 | p50: 13800 | p50: 12900 |
|  | p95: 21100 | p95: 17100 | p95: 14100 | p95: 12900 |
|  | p99: 21100 | p99: 17100 | p99: 14100 | p99: 12900 |

 ## Observations

- Here, without application processing, all the approaches give the same
  latency within a 1 usec range, and NAPI threaded gives the minimum latency.
- With application processing, the latency increases by 3-4 usecs when
  doing inline polling.
- Using a dedicated core to drive napi polling keeps the latency the same
  even with application processing. This is observed both in userspace
  and with threaded napi (in kernel).
- Using napi threaded polling in the kernel gives lower latency, by
  1-2 usecs, as compared to userspace driven polling on a separate core.
- Even on a dedicated core, SO_BUSYPOLL adds around 1-2 usecs of latency.
  This is because it doesn't continuously busy poll until events are
  ready. Instead, it returns after polling only once, requiring the
  process to re-invoke the syscall for each poll, which requires a new
  enter/leave kernel cycle and the setup/teardown of the busy poll for
  every single poll attempt.
- With application processing, userspace will get the packet from the recv
  ring, spend some time doing application processing, and then do napi
  polling. While application processing is happening, a dedicated core
  doing napi polling can pull packets off the NAPI RX queue and
  populate the AF_XDP recv ring. This means that when the application
  thread is done with application processing it has new packets ready to
  recv and process in the recv ring.
- Napi threaded busy polling in the kernel with a dedicated core gives
  consistent P5-P99 latency.

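For reference, a minimal sketch of how SO_BUSY_POLL is enabled on a socket; the value is illustrative and xsk_rr's own setup may differ.

```
/* Minimal illustration of enabling socket busy polling. */
#include <stdio.h>
#include <sys/socket.h>

static int enable_busy_poll(int fd)
{
	int usecs = 64;	/* illustrative busy-poll budget in microseconds */

	/* With SO_BUSY_POLL set, a blocking receive busy-polls the device
	 * queue for up to 'usecs' before sleeping; each syscall still pays
	 * the enter/leave kernel cost described above. */
	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs)) < 0) {
		perror("setsockopt(SO_BUSY_POLL)");
		return -1;
	}
	return 0;
}
```
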
Note well that threaded napi busy-polling has not been shown to yield
efficiency or throughput benefits. In contrast, dedicating an entire
core to busy-polling one NAPI (NIC queue) is rather inefficient.
However, in certain specific use cases, this mechanism results in lower
packet processing latency. The experimental testing reported here only
covers those use cases and does not present a comprehensive evaluation
of threaded napi busy-polling.

The following histogram measures the time spent in recvfrom when busy
polling inline with SO_BUSYPOLL. It is generated using the bpftrace
command below. In this experiment there are 32K packets per second and
the application processing delay is 30 usecs. The goal is to check
whether enough time is spent pulling packets from the descriptor queue
to affect the overall latency when this is done inline.

```
bpftrace -e '
        kprobe:xsk_recvmsg {
                @start[tid] = nsecs;
        }
        kretprobe:xsk_recvmsg {
                if (@start[tid]) {
                        $sample = (nsecs - @start[tid]);
                        @xsk_recvfrom_hist = hist($sample);
                        delete(@start[tid]);
                }
        }
        END { clear(@start);}'
```

Here, in the case of inline busy polling, around 35 percent of calls take
1-2 usecs and around 50 percent take 0.5-2 usecs.

@xsk_recvfrom_hist:
[128, 256)         24073 |@@@@@@@@@@@@@@@@@@@@@@                              |
[256, 512)         55633 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[512, 1K)          20974 |@@@@@@@@@@@@@@@@@@@                                 |
[1K, 2K)           34234 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                     |
[2K, 4K)            3266 |@@@                                                 |
[4K, 8K)              19 |                                                    |
====================

Link: https://patch.msgid.link/20251028203007.575686-1-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 18:11:44 -08:00
Samiullah Khawaja
add3c1324a selftests: Add napi threaded busy poll test in busy_poller
Add testcase to run busy poll test with threaded napi busy poll enabled.

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Martin Karsten <mkarsten@uwaterloo.ca>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
Link: https://patch.msgid.link/20251028203007.575686-3-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 18:11:40 -08:00
Samiullah Khawaja
c18d4b190a net: Extend NAPI threaded polling to allow kthread based busy polling
Add a new state NAPI_STATE_THREADED_BUSY_POLL to the NAPI state enum to
enable and disable threaded busy polling.

When threaded busy polling is enabled for a NAPI, enable
NAPI_STATE_THREADED also.

When the threaded NAPI is scheduled, set NAPI_STATE_IN_BUSY_POLL to
signal napi_complete_done not to rearm interrupts.

Whenever NAPI_STATE_THREADED_BUSY_POLL is unset, the
NAPI_STATE_IN_BUSY_POLL will be unset, napi_complete_done unsets the
NAPI_STATE_SCHED_THREADED bit also, which in turn will make the kthread
go to sleep.

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Martin Karsten <mkarsten@uwaterloo.ca>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
Link: https://patch.msgid.link/20251028203007.575686-2-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 18:11:40 -08:00
Hiroaki Yamamoto
8adb609f64 wifi: rtw88: Add BUFFALO WI-U3-866DHP to the USB ID list
BUFFALO WI-U3-866DHP (0411:03d0) is based on rtl8812bu. I locally tested
this patch with a retail sample and it worked fine.

Signed-off-by: Hiroaki Yamamoto <hrak1529@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251031111641.33653-1-hrak1529@gmail.com
2025-11-04 10:03:43 +08:00
Chin-Yen Lee
71ee0d5659 wifi: rtw88: 8822c: use fixed rate and bandwidth to reply CSI packets
An AP could trigger beamforming and send an NDPA packet at a 6Mbps rate
on 80MHz bandwidth, but RTL8822C can't reply with a CSI packet using
the same settings. Therefore, force it to use an OFDM rate and
20MHz bandwidth instead.

Signed-off-by: Chin-Yen Lee <timlee@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251027070259.18931-1-pkshih@realtek.com
2025-11-04 09:58:30 +08:00
Alexei Starovoitov
11369e6e33 Merge branch 'bpf-skip-bounds-adjustment-for-conditional-jumps-on-same-scalar-register'
KaFai Wan says:

====================
bpf: Skip bounds adjustment for conditional jumps on same scalar register

This small patchset avoids a verifier bug warning when conditional
jumps are performed on the same register while it holds a scalar with a range.

v4:
  - make code better. (Alexei)

v3:
  https://lore.kernel.org/bpf/20251031154107.403054-1-kafai.wan@linux.dev/
  - Enhance is_scalar_branch_taken() to handle scalar case. (Eduard)
  - Update the selftest to cover all conditional jump opcodes. (Eduard)

v2:
  https://lore.kernel.org/bpf/20251025053017.2308823-1-kafai.wan@linux.dev/
 - Enhance is_branch_taken() and is_scalar_branch_taken() to handle
   branch direction computation for same register. (Eduard and Alexei)
 - Update the selftest.

v1:
  https://lore.kernel.org/bpf/20251022164457.1203756-1-kafai.wan@linux.dev/
---
====================

Link: https://patch.msgid.link/20251103063108.1111764-1-kafai.wan@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-03 17:45:11 -08:00
KaFai Wan
9f32bfec54 selftests/bpf: Add test for conditional jumps on same scalar register
Add test cases to verify the correctness of the BPF verifier's branch analysis
when conditional jumps are performed on the same scalar register. And make sure
that JGT does not trigger verifier BUG.

Signed-off-by: KaFai Wan <kafai.wan@linux.dev>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251103063108.1111764-3-kafai.wan@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-03 17:44:53 -08:00
KaFai Wan
d43ad9da80 bpf: Skip bounds adjustment for conditional jumps on same scalar register
When conditional jumps are performed on the same scalar register
(e.g., r0 <= r0, r0 > r0, r0 < r0), the BPF verifier incorrectly
attempts to adjust the register's min/max bounds. This leads to
invalid range bounds and triggers a BUG warning.

The problematic BPF program:
   0: call bpf_get_prandom_u32
   1: w8 = 0x80000000
   2: r0 &= r8
   3: if r0 > r0 goto <exit>

The instruction 3 triggers kernel warning:
   3: if r0 > r0 goto <exit>
   true_reg1: range bounds violation u64=[0x1, 0x0] s64=[0x1, 0x0] u32=[0x1, 0x0] s32=[0x1, 0x0] var_off=(0x0, 0x0)
   true_reg2: const tnum out of sync with range bounds u64=[0x0, 0xffffffffffffffff] s64=[0x8000000000000000, 0x7fffffffffffffff] var_off=(0x0, 0x0)

Comparing a register with itself should not change its bounds and
for most comparison operations, comparing a register with itself has
a known result (e.g., r0 == r0 is always true, r0 < r0 is always false).

Fix this by:
1. Enhance is_scalar_branch_taken() to properly handle branch direction
   computation for same-register comparisons across all BPF jump operations
2. Add an early return in reg_set_min_max() to avoid bounds adjustment
   for unknown branch directions (e.g., BPF_JSET) on the same register

The fix ensures that unnecessary bounds adjustments are skipped, preventing
the verifier bug while maintaining correct branch direction analysis.

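As a rough userspace illustration of the "known result" observation (not the verifier's code): for a register compared against itself, every jump condition except JSET has a fixed outcome.

```
/* Illustrative only: outcome of "if (x op x)" for BPF-style jump ops.
 * Returns 1 = always taken, 0 = never taken, -1 = depends on x (JSET). */
enum jmp_op { JEQ, JNE, JLT, JLE, JGT, JGE, JSLT, JSLE, JSGT, JSGE, JSET };

static int same_reg_branch_taken(enum jmp_op op)
{
	switch (op) {
	case JEQ: case JLE: case JGE: case JSLE: case JSGE:
		return 1;	/* x == x, x <= x, x >= x always hold */
	case JNE: case JLT: case JGT: case JSLT: case JSGT:
		return 0;	/* x != x, x < x, x > x never hold */
	case JSET:
	default:
		return -1;	/* x & x is nonzero only when x != 0 */
	}
}
```
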
Reported-by: Kaiyan Mei <M202472210@hust.edu.cn>
Reported-by: Yinhao Hu <dddddd@hust.edu.cn>
Closes: https://lore.kernel.org/all/1881f0f5.300df.199f2576a01.Coremail.kaiyanm@hust.edu.cn/
Signed-off-by: KaFai Wan <kafai.wan@linux.dev>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251103063108.1111764-2-kafai.wan@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-03 17:43:28 -08:00
Jakub Kicinski
998b5d9683 Merge branch 'mpls-remove-rtnl-dependency'
Kuniyuki Iwashima says:

====================
mpls: Remove RTNL dependency.

MPLS uses RTNL

  1) to guarantee the lifetime of struct mpls_nh.nh_dev
  2) to protect net->mpls.platform_label

, but neither actually requires RTNL.

If struct mpls_nh holds a refcnt for nh_dev, we do not need RTNL,
and it can be replaced with a dedicated mutex.

The series removes RTNL from net/mpls/.

Overview:

  Patch 1 is misc cleanup.

  Patch 2 - 9 are prep to drop RTNL for RTM_{NEW,DEL,GET}ROUTE
  handlers.

  Patch 10 & 11 converts mpls_dump_routes() and RTM_GETNETCONF to RCU.

  Patch 12 replaces RTNL with a new per-netns mutex.

  Patch 13 drops RTNL from RTM_{NEW,DEL,GET}ROUTE.
====================

Link: https://patch.msgid.link/20251029173344.2934622-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:41:01 -08:00
Kuniyuki Iwashima
7d99a7c6c6 mpls: Drop RTNL for RTM_NEWROUTE, RTM_DELROUTE, and RTM_GETROUTE.
RTM_NEWROUTE looks up dev under RCU (ip_route_output(),
ipv6_stub->ipv6_dst_lookup_flow(), netdev_get_by_index()),
and each neighbour holds the refcnt of its dev.

Also, net->mpls.platform_label is protected by a dedicated
per-netns mutex.

Now, no MPLS code depends on RTNL.

Let's drop RTNL for RTM_NEWROUTE, RTM_DELROUTE, and RTM_GETROUTE.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-14-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:54 -08:00
Kuniyuki Iwashima
e833eb2516 mpls: Protect net->mpls.platform_label with a per-netns mutex.
MPLS (re)uses RTNL to protect net->mpls.platform_label,
but the lock does not need to be RTNL at all.

Let's protect net->mpls.platform_label with a dedicated
per-netns mutex.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-13-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:53 -08:00
Kuniyuki Iwashima
fb2b77b9b1 mpls: Convert RTM_GETNETCONF to RCU.
mpls_netconf_get_devconf() calls __dev_get_by_index(),
and this only depends on RTNL.

Let's convert mpls_netconf_get_devconf() to RCU and use
dev_get_by_index_rcu().

Note that nlmsg_new() is moved ahead to use GFP_KERNEL.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-12-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:52 -08:00
Kuniyuki Iwashima
dde1b38e87 mpls: Convert mpls_dump_routes() to RCU.
mpls_dump_routes() sets fib_dump_filter.rtnl_held to true and
calls __dev_get_by_index() in mpls_valid_fib_dump_req().

This is the only RTNL dependant in mpls_dump_routes().

Also, synchronize_rcu() in resize_platform_label_table()
guarantees that net->mpls.platform_label is alive under RCU.

Let's convert mpls_dump_routes() to RCU and use dev_get_by_index_rcu().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-11-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:51 -08:00
Kuniyuki Iwashima
3a49629335 mpls: Use mpls_route_input() where appropriate.
In many places, we use rtnl_dereference() twice for
net->mpls.platform_label and net->mpls.platform_label[index].

Let's replace the code with mpls_route_input().

We do not use mpls_route_input() in mpls_dump_routes() since
we will rely on RCU there.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-10-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:50 -08:00
Kuniyuki Iwashima
73e4053939 mpls: Add mpls_route_input().
mpls_route_input_rcu() is called from mpls_forward() and
mpls_getroute().

The former is under RCU, and the latter is under RTNL, so
mpls_route_input_rcu() uses rcu_dereference_rtnl().

Let's use rcu_dereference() in mpls_route_input_rcu() and
add an RTNL variant for mpls_getroute().

Later, we will remove rtnl_dereference() there.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-9-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:49 -08:00
Kuniyuki Iwashima
1fb462de93 mpls: Pass net to mpls_dev_get().
We will replace RTNL with a per-netns mutex to protect dev->mpls_ptr.

Then, we will use rcu_dereference_protected() with the lockdep_is_held()
annotation, which requires net to access the per-netns mutex.

However, dev_net(dev) is not safe without RTNL.

Let's pass net to mpls_dev_get().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-8-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:48 -08:00
Kuniyuki Iwashima
ab061f3347 mpls: Add mpls_dev_rcu().
mpls_dev_get() uses rcu_dereference_rtnl() to fetch dev->mpls_ptr.

We will replace RTNL with a dedicated mutex to protect the field.

Then, we will use rcu_dereference_protected() for clarity.

Let's add mpls_dev_rcu() for the RCU reader.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-7-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:48 -08:00
Kuniyuki Iwashima
bc7ebc569e mpls: Use in6_dev_rcu() and dev_net_rcu() in mpls_forward() and mpls_xmit().
mpls_forward() and mpls_xmit() are called under RCU.

Let's use in6_dev_rcu() and dev_net_rcu() there to annotate
as such.

Now we pass net to mpls_stats_inc_outucastpkts() not to read
dev_net_rcu() twice.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-6-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:47 -08:00
Kuniyuki Iwashima
d8f9581e1b ipv6: Add in6_dev_rcu().
rcu_dereference_rtnl() does not clearly tell whether the caller
is under RCU or RTNL.

Let's add in6_dev_rcu() to make it easy to remove __in6_dev_get()
in the future.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-5-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:46 -08:00
Kuniyuki Iwashima
451c538ec0 mpls: Unify return paths in mpls_dev_notify().
We will protect net->mpls.platform_label by a dedicated mutex.

Then, we need to wrap functions called from mpls_dev_notify()
with the mutex.

As a prep, let's unify the return paths.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-4-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:45 -08:00
Kuniyuki Iwashima
f0914b8436 mpls: Hold dev refcnt for mpls_nh.
MPLS uses RTNL

  1) to guarantee the lifetime of struct mpls_nh.nh_dev
  2) to protect net->mpls.platform_label

, but neither actually requires RTNL.

If we do not call dev_put() in find_outdev() and call it
just before freeing struct mpls_route, we can drop RTNL for 1).

Let's hold the refcnt of mpls_nh.nh_dev and track it with
netdevice_tracker.

Two notable changes:

If mpls_nh_build_multi() fails to set up a neighbour, we need
to call netdev_put() for successfully created neighbours in
mpls_rt_free_rcu(), so the number of neighbours (rt->rt_nhn)
is now updated in each iteration.

When a dev is unregistered, mpls_ifdown() clones mpls_route
and replaces it with the clone, so the clone requires extra
netdev_hold().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-3-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:44 -08:00
Kuniyuki Iwashima
2214ca1ff6 mpls: Return early in mpls_label_ok().
When mpls_label_ok() returns false, it does not need to update *index.

Let's remove is_ok and return early.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Link: https://patch.msgid.link/20251029173344.2934622-2-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:40:44 -08:00
Bitterblue Smith
0d971ffdae wifi: rtw89: Enable the new rtw89_8852cu module
Tested mostly in station mode, and a little bit in AP mode.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/859a45ce-2730-4eeb-99d2-37d7ff277bd7@gmail.com
2025-11-04 09:37:18 +08:00
Bitterblue Smith
406849000d wifi: rtw89: Add rtw8852cu.c
This is the entry point for the new rtw89_8852cu module.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/a63a71f0-8003-4390-8bee-5cfd298867a0@gmail.com
2025-11-04 09:36:28 +08:00
Bitterblue Smith
6bc2711085 wifi: rtw89: 8852c: Accept USB devices and load their MAC address
Make rtw8852c_read_efuse() accept USB devices and load the MAC
address from the correct offset.

Also fix the offset of the MAC address because it was wrong.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/78138507-99ea-4a58-a02b-f4d11dbfba3b@gmail.com
2025-11-04 09:36:16 +08:00
Bitterblue Smith
19faad8c72 wifi: rtw89: Add rtw8852c_hfc_param_ini_usb
"hfc" means "hci fc" which is "Host Control Interface Flow Control".
These are some parameters needed for RTL8852CU.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/5b90204c-60ad-4579-b241-b7ac2e1fee91@gmail.com
2025-11-04 09:36:06 +08:00
Bitterblue Smith
a865899084 wifi: rtw89: Add rtw8852c_dle_mem_usb{2,3}
Add rtw8852c_dle_mem_usb2 and rtw8852c_dle_mem_usb3 and their various
quotas and sizes in struct rtw89_mac_size_set.

"dle" could be "Data Link Engine" or "Double Link Engine". These are
some parameters needed for RTL8852CU.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/40a58644-13ce-48a4-85e2-ba4f3cbb975b@gmail.com
2025-11-04 09:35:04 +08:00
Bitterblue Smith
97259766b1 wifi: rtw89: 8852c: Fix rtw8852c_pwr_{on,off}_func() for USB
There are a few differences in the power on/off functions between PCIE
and USB. The changes in the power off function in particular are needed
for the RTL8832CU to be able to power on again after it's powered off.

While the RTL8832CU appears to work without the changes in the power on
function, it's probably best to implement them, in case they are needed
in some situations.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/598dec66-b5cc-435a-bcf6-fa66577f8cfc@gmail.com
2025-11-04 09:34:53 +08:00
Bitterblue Smith
32e0381e86 wifi: rtw89: Fix rtw89_mac_dmac_func_pre_en_ax() for USB/SDIO
Set the DMA mode according to the interface type.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/d2250e72-0aa1-422c-8f7f-9aeb283ca376@gmail.com
2025-11-04 09:34:41 +08:00
Bitterblue Smith
199afd3af1 wifi: rtw89: usb: Prepare rtw89_usb_ops_mac_post_init() for RTL8852CU
The registers used in rtw89_usb_ops_mac_post_init() are located at
different offsets in RTL8852CU, so move them to struct rtw89_usb_info,
which is filled in each chip's driver.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/c12b621a-037c-4870-ac12-9795ddde6b53@gmail.com
2025-11-04 09:34:30 +08:00
Bitterblue Smith
7697701d6c wifi: rtw89: usb: Prepare rtw89_usb_ops_mac_pre_init() for RTL8852CU
The registers used in rtw89_usb_ops_mac_pre_init() are located at
different offsets in RTL8852CU, so move them to struct rtw89_usb_info,
which is filled in each chip's driver.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/36b823c9-2cb5-4191-8492-2a291f9dc1db@gmail.com
2025-11-04 09:34:19 +08:00
Bitterblue Smith
994944aa58 wifi: rtw89: usb: Move bulk out map to new struct rtw89_usb_info
RTL8852AU, RTL8852CU, and RTL8922AU will need a different TX channel
to bulk out endpoint mapping, so create a new struct rtw89_usb_info
and move the mapping there. Initialise it in each chip's driver.

Struct rtw89_usb_info will also hold some registers which are located
at different offsets in RTL8852CU compared to the other wifi 6 chips.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/de11cfae-1dc0-4514-95b2-02b1bbfa92be@gmail.com
2025-11-04 09:32:52 +08:00
Bitterblue Smith
89acd6c493 wifi: rtw89: Add rtw89_core_get_ch_dma_v2()
RTL8852CU, RTL8852AU, and RTL8922AU need a different mapping of TX
queue to DMA channel compared to their PCI versions, so make
get_ch_dma in struct rtw89_chip_ops an array and add
rtw89_core_get_ch_dma_v2().

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/6c7b2f01-2c78-47c8-a4c4-98cd3060d7f3@gmail.com
2025-11-04 09:31:25 +08:00
Ping-Ke Shih
030b8d5878 wifi: rtw89: 8832cu: turn off TX partial mode
The TX partial mode in USB devices causes a timeout while waiting for the
payload, leading to SER 0x999 and disconnection. Turn off this mode per
the design suggestion.

rtw89_8852cu 2-4:1.0: FW status = 0xee001108
rtw89_8852cu 2-4:1.0: FW BADADDR = 0x18605fc8
rtw89_8852cu 2-4:1.0: FW EPC/RA = 0x0
rtw89_8852cu 2-4:1.0: FW MISC = 0x1010000
rtw89_8852cu 2-4:1.0: R_AX_HALT_C2H = 0x999
rtw89_8852cu 2-4:1.0: R_AX_SER_DBG_INFO = 0x71020010
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55e
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55a
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55a
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55e
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55e
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55e
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55a
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55e
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55a
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55e
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f554
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f556
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55a
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55a
rtw89_8852cu 2-4:1.0: [ERR]fw PC = 0x2013f55e
rtw89_8852cu 2-4:1.0: --->
err=0x999
rtw89_8852cu 2-4:1.0: R_AX_SER_DBG_INFO =0x71020010
rtw89_8852cu 2-4:1.0: R_AX_SER_DBG_INFO =0x71020010
rtw89_8852cu 2-4:1.0: DBG Counter 1 (R_AX_DRV_FW_HSK_4)=0x00000000
rtw89_8852cu 2-4:1.0: DBG Counter 2 (R_AX_DRV_FW_HSK_5)=0x00000000
rtw89_8852cu 2-4:1.0: R_AX_DMAC_ERR_ISR=0x00000000
rtw89_8852cu 2-4:1.0: R_AX_DMAC_ERR_IMR=0x00000000
rtw89_8852cu 2-4:1.0: R_AX_CMAC_ERR_ISR [0]=0x00000000
rtw89_8852cu 2-4:1.0: R_AX_CMAC_FUNC_EN [0]=0xf000803f
rtw89_8852cu 2-4:1.0: R_AX_CK_EN [0]=0xffffffff
rtw89_8852cu 2-4:1.0: R_AX_CMAC_ERR_IMR [0]=0x00000000
rtw89_8852cu 2-4:1.0: [CMAC] : CMAC1 not enabled

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Link: https://patch.msgid.link/3904dd9f-2178-41e5-95c2-7a9f6268e935@gmail.com
2025-11-04 09:30:01 +08:00
Maxime Chevallier
209ff7af79 net: stmmac: rename devlink parameter ts_coarse into phc_coarse_adj
The devlink param "ts_coarse" doesn't indicate that we get coarse
timestamps, but rather that the PHC clock adjustments are coarse as the
frequency won't be continuously adjusted. Adjust the devlink parameter
name to reflect that.

The "Coarse" terminology comes from the dwmac register naming; update the
documentation to better explain what the parameter is about.

With this change, the parameter can now be adjusted using:
  devlink dev param set <dev> name phc_coarse_adj value true cmode runtime

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20251030182454.182406-1-maxime.chevallier@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 17:05:36 -08:00
Colin Ian King
22795871ed net: dsa: yt921x: Fix spelling mistake "stucked" -> "stuck"
There is a spelling mistake in a dev_err message. Fix it.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251101183446.32134-1-colin.i.king@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 16:47:13 -08:00
Jianhui Zhao
18aa36238a net: phy: realtek: add interrupt support for RTL8221B
This commit introduces interrupt support for RTL8221B (C45 mode).
Interrupts are mapped on the VEND2 page. VEND2 registers are only
accessible via C45 reads and cannot be accessed by C45 over C22.

Signed-off-by: Jianhui Zhao <zhaojh329@gmail.com>
[Enable only link state change interrupts]
Signed-off-by: Aleksander Jan Bajkowski <olek2@wp.pl>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251102152644.1676482-1-olek2@wp.pl
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 16:45:46 -08:00
Alok Tiwari
acbf1d0a9a hinic3: fix misleading error message in hinic3_open_channel()
The error message printed when hinic3_configure() fails incorrectly
reports "Failed to init txrxq irq", which does not match the actual
operation performed. The hinic3_configure() function sets up various
device resources such as MTU and RSS parameters, not IRQ initialization.

Update the log to "Failed to configure device resources" to make the
message accurate and clearer for debugging.

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Fan Gong <gongfan1@huawei.com>
Link: https://patch.msgid.link/20251031112654.46187-1-alok.a.tiwari@oracle.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-03 16:38:31 -08:00
Alexei Starovoitov
5dae7453ec Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf after 6.18-rc4
Cross-merge BPF and other fixes after downstream PR.
No conflicts.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-11-03 14:59:55 -08:00
Jens Axboe
3615e3f794 io_uring/rsrc: use get/put_user() for integer copy
It's just getting an integer from userspace, installing a file, then
copying the output direct descriptor back. No need to use the full
copy_to/from_user() for that.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 11:02:54 -07:00
Jens Axboe
adb395c457 io_uring/slist: remove unused wq list splice helpers
Nobody is using those helpers anymore, get rid of them.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 10:51:39 -07:00
Ming Lei
3f5b1169d2 selftests: ublk: make ublk_thread thread-local variable
Refactor ublk_thread to be a thread-local variable instead of storing
it in ublk_dev:

- Remove pthread_t thread field from struct ublk_thread and move it to
  struct ublk_thread_info

- Remove struct ublk_thread array from struct ublk_dev, reducing memory
  footprint

- Define struct ublk_thread as local variable in __ublk_io_handler_fn()
  instead of accessing it from dev->threads[]

- Extract main IO handling logic into __ublk_io_handler_fn() which is
  marked as noinline

- Move CPU affinity setup to ublk_io_handler_fn() before calling
  __ublk_io_handler_fn()

- Update ublk_thread_set_sched_affinity() to take struct ublk_thread_info *
  instead of struct ublk_thread *, and use pthread_setaffinity_np()
  instead of sched_setaffinity()

- Reorder struct ublk_thread fields to group related state together

This change makes each thread's ublk_thread structure truly local to
the thread, improving cache locality and reducing memory usage.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 08:34:59 -07:00
Ming Lei
0123bb91f4 selftests: ublk: set CPU affinity before thread initialization
Move ublk_thread_set_sched_affinity() call before ublk_thread_init()
to ensure memory allocations during thread initialization occur on
the correct NUMA node. This leverages Linux's first-touch memory
policy for better NUMA locality.

Also convert ublk_thread_set_sched_affinity() to use
pthread_setaffinity_np() instead of sched_setaffinity(), as the
pthread API is the proper interface for setting thread affinity in
multithreaded programs.

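A minimal sketch of the pattern being described, with illustrative names rather than the selftest's own code: pin the calling thread first, then allocate, so first-touch places the memory on that CPU's NUMA node.

```
#define _GNU_SOURCE		/* for pthread_setaffinity_np() and CPU_SET() */
#include <pthread.h>
#include <sched.h>
#include <stdlib.h>

/* Pin the calling thread to 'cpu' before it allocates its working state,
 * so first-touch places the pages on that CPU's NUMA node. */
static void *io_thread_fn(void *arg)
{
	int cpu = *(int *)arg;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

	void *state = calloc(1, 4096);	/* first-touched on the local node */
	/* ... per-thread I/O handling would use 'state' here ... */
	free(state);
	return NULL;
}
```
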
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 08:34:59 -07:00
Ming Lei
c28ba6b6c5 ublk: use struct_size() for allocation
Convert ublk_queue to use struct_size() for allocation.

Changes in this commit:

1. Update ublk_init_queue() to use struct_size(ubq, ios, depth)
   instead of manual size calculation (sizeof(struct ublk_queue) +
   depth * sizeof(struct ublk_io)).

This provides better type safety and makes the code more maintainable
by using standard kernel macro for flexible array handling.

Meantime annotate ublk_queue.ios by __counted_by().

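For readers unfamiliar with the pattern, a hedged userspace analogue with illustrative struct names: a single allocation holding a header plus a flexible array, which is the size that struct_size() computes (with overflow checking) in the kernel.

```
#include <stdlib.h>

/* Userspace analogue of the kernel pattern: one allocation holding the
 * queue header plus 'depth' trailing I/O slots (flexible array member). */
struct io_slot { int tag; };

struct queue {
	unsigned int depth;
	struct io_slot ios[];	/* flexible array member */
};

static struct queue *queue_alloc(unsigned int depth)
{
	/* struct_size(q, ios, depth) computes this size with overflow
	 * checking in the kernel; plain arithmetic is shown for clarity. */
	struct queue *q = calloc(1, sizeof(*q) + depth * sizeof(struct io_slot));

	if (q)
		q->depth = depth;
	return q;
}
```
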
Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 08:34:59 -07:00
Ming Lei
529d4d6327 ublk: implement NUMA-aware memory allocation
Implement NUMA-friendly memory allocation for ublk driver to improve
performance on multi-socket systems.

This commit includes the following changes:

1. Rename __queues to queues, dropping the __ prefix since the field is
   now accessed directly throughout the codebase rather than only through
   the ublk_get_queue() helper.

2. Remove the queue_size field from struct ublk_device as it is no longer
   needed.

3. Move queue allocation and deallocation into ublk_init_queue() and
   ublk_deinit_queue() respectively, improving encapsulation. This
   simplifies ublk_init_queues() and ublk_deinit_queues() to just
   iterate and call the per-queue functions.

4. Add ublk_get_queue_numa_node() helper function to determine the
   appropriate NUMA node for a queue by finding the first CPU mapped
   to that queue via tag_set.map[HCTX_TYPE_DEFAULT].mq_map[] and
   converting it to a NUMA node using cpu_to_node(). This function is
   called internally by ublk_init_queue() to determine the allocation
   node.

5. Allocate each queue structure on its local NUMA node using
   kvzalloc_node() in ublk_init_queue().

6. Allocate the I/O command buffer on the same NUMA node using
   alloc_pages_node().

This reduces memory access latency on multi-socket NUMA systems by
ensuring each queue's data structures are local to the CPUs that
access them.

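A hedged userspace sketch of the same idea using libnuma (link with -lnuma; names are illustrative): pick the NUMA node of a queue's first mapped CPU and allocate the queue's memory there.

```
#include <stddef.h>
#include <numa.h>	/* libnuma: numa_node_of_cpu(), numa_alloc_onnode() */

/* Allocate 'size' bytes on the NUMA node local to 'first_cpu', mirroring
 * the driver's use of cpu_to_node() with kvzalloc_node()/alloc_pages_node(). */
static void *alloc_on_cpu_node(int first_cpu, size_t size)
{
	int node;

	if (numa_available() < 0)
		return NULL;		/* no NUMA support on this system */

	node = numa_node_of_cpu(first_cpu);
	if (node < 0)
		node = 0;

	return numa_alloc_onnode(size, node);	/* free with numa_free() */
}
```
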
Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 08:34:59 -07:00
Ming Lei
011af85ccd ublk: reorder tag_set initialization before queue allocation
Move ublk_add_tag_set() before ublk_init_queues() in the device
initialization path. This allows us to use the blk-mq CPU-to-queue
mapping established by the tag_set to determine the appropriate
NUMA node for each queue allocation.

The error handling paths are also reordered accordingly.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 08:34:59 -07:00
Caleb Sander Mateos
20fb3d05a3 io_uring/uring_cmd: avoid double indirect call in task work dispatch
io_uring task work dispatch makes an indirect call to struct io_kiocb's
io_task_work.func field to allow running arbitrary task work functions.
In the uring_cmd case, this calls io_uring_cmd_work(), which immediately
makes another indirect call to struct io_uring_cmd's task_work_cb field.
Change the uring_cmd task work callbacks to functions whose signatures
match io_req_tw_func_t. Add a function io_uring_cmd_from_tw() to convert
from the task work's struct io_tw_req argument to struct io_uring_cmd *.
Define a constant IO_URING_CMD_TASK_WORK_ISSUE_FLAGS to avoid
manufacturing issue_flags in the uring_cmd task work callbacks. Now
uring_cmd task work dispatch makes a single indirect call to the
uring_cmd implementation's callback. This also allows removing the
task_work_cb field from struct io_uring_cmd, freeing up 8 bytes for
future storage.
Since fuse_uring_send_in_task() now has access to the io_tw_token_t,
check its cancel field directly instead of relying on the
IO_URING_F_TASK_DEAD issue flag.

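To illustrate the shape of the change with generic code (not io_uring's actual types): instead of a trampoline that makes a second indirect call through a stored callback, the implementation's function matches the dispatcher's signature and is registered directly, so dispatch makes a single indirect call.

```
/* Generic illustration of removing a double indirect call. */
struct req;
typedef void (*tw_func_t)(struct req *r);

struct req {
	tw_func_t tw_func;		/* what the dispatcher calls */
	void (*impl_cb)(struct req *r);	/* old style: second stored callback */
};

/* Before: dispatcher -> trampoline -> impl_cb (two indirect calls). */
static void trampoline(struct req *r) { r->impl_cb(r); }

/* After: the implementation matches tw_func_t and is stored directly in
 * tw_func, so the dispatcher below makes exactly one indirect call. */
static void impl_direct(struct req *r) { (void)r; /* handle the request */ }

static void dispatch(struct req *r) { r->tw_func(r); }
```
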
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 08:31:26 -07:00
Caleb Sander Mateos
c33e779aba io_uring: add wrapper type for io_req_tw_func_t arg
In preparation for uring_cmd implementations to implement functions
with the io_req_tw_func_t signature, introduce a wrapper struct
io_tw_req to hide the struct io_kiocb * argument. The intention is for
only the io_uring core to access the inner struct io_kiocb *. uring_cmd
implementations should instead call a helper from io_uring/cmd.h to
convert struct io_tw_req to struct io_uring_cmd *.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 08:31:26 -07:00
Caleb Sander Mateos
4531d165ee io_uring: only call io_should_terminate_tw() once for ctx
io_fallback_req_func() calls io_should_terminate_tw() on each req's ctx.
But since the reqs all come from the ctx's fallback_llist, req->ctx will
be ctx for all of the reqs. Therefore, compute ts.cancel as
io_should_terminate_tw(ctx) just once, outside the loop.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 08:31:26 -07:00
Chaitanya Kulkarni
bc49af56ee blktrace: add support for REQ_OP_WRITE_ZEROES tracing
Currently, REQ_OP_WRITE_ZEROES operations are not handled in the
blktrace infrastructure, resulting in incorrect or missing operation
labels in ftrace blktrace output. This manifests as write-zeroes
operations appearing with incorrect labels like "N" instead of a
proper "WZ" designation.

This patch adds complete support for REQ_OP_WRITE_ZEROES across the
blktrace infrastructure:

- Add BLK_TC_WRITE_ZEROES trace category in blktrace_api.h and update
  BLK_TC_END_V2 marker accordingly
- Map REQ_OP_WRITE_ZEROES to BLK_TC_WRITE_ZEROES in __blk_add_trace()
  to ensure proper trace event categorization
- Update fill_rwbs() to generate "WZ" label for write-zeroes operations
  in ftrace output, making them easily identifiable
- Add "write-zeroes" string mapping in act_to_str array for the debugfs
  filter interface
- Update blk_fill_rwbs() to handle REQ_OP_WRITE_ZEROES for block layer
  event tracing

With this fix, write-zeroes operations are now correctly traced and
displayed.

===========================================================
BEFORE THIS PATCH
===========================================================
blkdiscard -z -o 0 -l 40960 /dev/nvme0n1
   blkdiscard-3809 [030] .....  1212.253701: block_bio_queue: 259,0 NS 0 + 80 [blkdiscard]
   blkdiscard-3809 [030] .....  1212.253703: block_getrq: 259,0 NS 0 + 80 [blkdiscard]
   blkdiscard-3809 [030] .....  1212.253704: block_io_start: 259,0 NS 40960 () 0 + 80 be,0,4 [blkdiscard]
   blkdiscard-3809 [030] .....  1212.253704: block_plug: [blkdiscard]
   blkdiscard-3809 [030] .....  1212.253706: block_unplug: [blkdiscard] 1
   blkdiscard-3809 [030] .....  1212.253706: block_rq_insert: 259,0 NS 40960 () 0 + 80 be,0,4 [blkdiscard]
kworker/30:1H-566  [030] .....  1212.253726: block_rq_issue: 259,0 NS 40960 () 0 + 80 be,0,4 [kworker/30:1H]
       <idle>-0    [030] d.h1.  1212.253957: block_rq_complete: 259,0 NS () 0 + 80 be,0,4 [0]
       <idle>-0    [030] dNh1.  1212.253960: block_io_done: 259,0 NS 0 () 0 + 0 none,0,0 [swapper/30]

Trace Event Breakdown:
 Event             | Device | Op  | Sector | Sectors | Byte Size | Calculation

 block_bio_queue   | 259,0  | NS  | 0      | 80      | -         | 80 × 512 = 40,960
 block_getrq       | 259,0  | NS  | 0      | 80      | -         | 80 × 512 = 40,960
 block_io_start    | 259,0  | NS  | 0      | 80      | 40960     | Direct from trace
 block_rq_insert   | 259,0  | NS  | 0      | 80      | 40960     | Direct from trace
 block_rq_issue    | 259,0  | NS  | 0      | 80      | 40960     | Direct from trace
 block_rq_complete | 259,0  | NS  | 0      | 80      | -         | 80 × 512 = 40,960
 block_io_done     | 259,0  | NS  | 0      | 0       | 0         | Completion (no data)

  Total Bytes Transferred: Sectors: 80 Bytes: 80 × 512 = 40,960 bytes

===========================================================
AFTER THIS PATCH
===========================================================
blkdiscard -z -o 0 -l 40960 /dev/nvme0n1

   blkdiscard-2477 [020] .....   960.989131: block_bio_queue: 259,0 WZS 0 + 80 [blkdiscard]
   blkdiscard-2477 [020] .....   960.989134: block_getrq: 259,0 WZS 0 + 80 [blkdiscard]
   blkdiscard-2477 [020] .....   960.989135: block_io_start: 259,0 WZS 40960 () 0 + 80 be,0,4 [blkdiscard]
   blkdiscard-2477 [020] .....   960.989138: block_plug: [blkdiscard]
   blkdiscard-2477 [020] .....   960.989140: block_unplug: [blkdiscard] 1
   blkdiscard-2477 [020] .....   960.989141: block_rq_insert: 259,0 WZS 40960 () 0 + 80 be,0,4 [blkdiscard]
kworker/20:1H-736  [020] .....   960.989166: block_rq_issue: 259,0 WZS 40960 () 0 + 80 be,0,4 [kworker/20:1H]
       <idle>-0    [020] d.h1.   960.989476: block_rq_complete: 259,0 WZS () 0 + 80 be,0,4 [0]
       <idle>-0    [020] dNh1.   960.989482: block_io_done: 259,0 WZS 0 () 0 + 0 none,0,0 [swapper/20]

Trace Event Breakdown:
 Event             | Device | Op  | Sector | Sectors | Byte Size | Calculation

 block_bio_queue   | 259,0  | WZS | 0      | 80      | -         | 80 × 512 = 40,960
 block_getrq       | 259,0  | WZS | 0      | 80      | -         | 80 × 512 = 40,960
 block_io_start    | 259,0  | WZS | 0      | 80      | 40960     | Direct from trace
 block_rq_insert   | 259,0  | WZS | 0      | 80      | 40960     | Direct from trace
 block_rq_issue    | 259,0  | WZS | 0      | 80      | 40960     | Direct from trace
 block_rq_complete | 259,0  | WZS | 0      | 80      | -         | 80 × 512 = 40,960
 block_io_done     | 259,0  | WZS | 0      | 0       | 0         | Completion (no data)

  Total Bytes Transferred: Sectors: 80 Bytes: 80 × 512 = 40,960 bytes

Tested with ftrace blktrace on NVMe devices using blkdiscard with
the -z (write-zeroes) flag.

Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 08:30:56 -07:00
Shi Hao
77220f6d18 drbd: replace kmap() with kmap_local_page() in receiver path
Use kmap_local_page() instead of kmap() to avoid
CPU contention.

kmap() uses a global set of mapping slots that can cause contention
between multiple CPUs, while kmap_local_page() uses per-CPU slots
eliminating this contention. It also ensures non-sleeping operation
and provides better cache locality.

Convert kmap() to kmap_local_page() as it aligns with ongoing
kernel efforts to modernize kmap() usage for better multi-core
scalability.

Signed-off-by: Shi Hao <i.shihao.999@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-11-03 08:15:54 -07:00
Johannes Berg
dcbc94c1f0 Merge tag 'iwlwifi-next-2025-10-28' of https://git.kernel.org/pub/scm/linux/kernel/git/iwlwifi/iwlwifi-next
Miri Korenblit says:
====================
iwlwifi features. Notably:

- Unsupported APIs cleanup
- New sniffer API
- small bugfixes and features
====================

Link: https://patch.msgid.link/DM3PPF63A6024A9B77EE7385B4C865E3E54A3FDA@DM3PPF63A6024A9.namprd11.prod.outlook.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-11-03 11:39:50 +01:00
Konstantin Komarov
1ff28f36eb fs/ntfs3: disable readahead for compressed files
Reading large compressed files is extremely slow when readahead is enabled.
For example, reading a 4 GB XPRESS-4K compressed file (compression ratio
≈ 4:1) takes about 230 minutes with readahead enabled, but only around 3
minutes when readahead is disabled.

The issue was first observed in January 2025 and is reproducible with large
compressed NTFS files. Disabling readahead for compressed files avoids this
performance regression, although this may not be the ideal long-term fix.

Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-11-01 16:30:01 +01:00
Bagas Sanjaya
01cc760632 Documentation: ARCnet: Update obsolete contact info
The ARCnet docs state that inquiries on the subsystem should be emailed to
Avery Pennarun <apenwarr@worldvisions.ca>, who has been in CREDITS
since the beginning of kernel git history and whose email address is
unreachable (bounces). The subsystem is now maintained by Michael
Grzeschik since c38f6ac74c ("MAINTAINERS: add arcnet and take
maintainership").

In addition, there used to be a dedicated ARCnet mailing list but its
archive at epistolary.org has been shut down. ARCnet discussion nowadays
takes place on the netdev list. The arcnet.com domain mentioned has become
an AIoT (Artificial Intelligence of Things) related Typeform page, and
ARCnet info now resides on arcnet.cc (ARCnet Resource Center) instead.

Update contact information.

Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251028014451.10521-2-bagasdotme@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 18:08:08 -07:00
Jakub Kicinski
718878c7f7 Merge branch 'dpll-add-support-for-phase-adjustment-granularity'
Ivan Vecera says:

====================
dpll: Add support for phase adjustment granularity

Phase-adjust values are currently limited only by a min-max range. Some
hardware requires, for certain pin types, that values be multiples of
a specific granularity, as in the zl3073x driver.

Patch 1: Adds 'phase-adjust-gran' pin attribute and an appropriate
         handling
Patch 2: Adds a support for this attribute into zl3073x driver
====================

Link: https://patch.msgid.link/20251029153207.178448-1-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 17:59:21 -07:00
Ivan Vecera
055a01b29f dpll: zl3073x: Specify phase adjustment granularity for pins
Output pins phase adjustment values in the device are expressed
in half synth clock cycles. Use this number of cycles as output
pins' phase adjust granularity and simplify both get/set callbacks.

Reviewed-by: Michal Schmidt <mschmidt@redhat.com>
Reviewed-by: Petr Oros <poros@redhat.com>
Tested-by: Prathosh Satish <Prathosh.Satish@microchip.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Link: https://patch.msgid.link/20251029153207.178448-3-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 17:59:17 -07:00
Ivan Vecera
30176bf7c8 dpll: add phase-adjust-gran pin attribute
Phase-adjust values are currently limited by a min-max range. Some
hardware requires, for certain pin types, that values be multiples of
a specific granularity, as in the zl3073x driver.

Add a `phase-adjust-gran` pin attribute and an appropriate field in
dpll_pin_properties. If set by the driver, use its value to validate
user-provided phase-adjust values.

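A small sketch of the kind of check described, as a hypothetical helper rather than the dpll core's code: the user value must fall within the pin's min/max range and, when a granularity is set, be a multiple of it.

```
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical validation of a user-supplied phase-adjust value against
 * the pin's advertised range and granularity. */
static bool phase_adjust_valid(int64_t val, int64_t min, int64_t max,
			       uint32_t gran)
{
	if (val < min || val > max)
		return false;
	if (gran > 1 && val % (int64_t)gran != 0)
		return false;
	return true;
}
```
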
Reviewed-by: Michal Schmidt <mschmidt@redhat.com>
Reviewed-by: Petr Oros <poros@redhat.com>
Tested-by: Prathosh Satish <Prathosh.Satish@microchip.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Link: https://patch.msgid.link/20251029153207.178448-2-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 17:59:17 -07:00
Jakub Kicinski
29f7ae9eba Merge branch 'net-pse-pd-add-tps23881b-support'
Thomas Wismer says:

====================
net: pse-pd: Add TPS23881B support

This patch series aims at adding support for the TI TPS23881B PoE
PSE controller.
====================

Link: https://patch.msgid.link/20251029212312.108749-1-thomas@wismer.xyz
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 17:56:34 -07:00
Thomas Wismer
32032eb166 dt-bindings: pse-pd: ti,tps23881: Add TPS23881B
Add the TPS23881B I2C power sourcing equipment controller to the list of
supported devices.

Falling back to the TPS23881 predecessor device is not suitable as firmware
loading needs to be handled differently by the driver. The TPS23881 and
TPS23881B devices require different firmware. Trying to load the TPS23881
firmware on a TPS23881B device fails and must therefore be omitted.

Signed-off-by: Thomas Wismer <thomas.wismer@scs.ch>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://patch.msgid.link/20251029212312.108749-3-thomas@wismer.xyz
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 17:56:32 -07:00
Thomas Wismer
4d07797faa net: pse-pd: tps23881: Add support for TPS23881B
The TPS23881B uses different firmware than the TPS23881. Trying to load the
TPS23881 firmware on a TPS23881B device fails and must be omitted.

The TPS23881B ships with a more recent ROM firmware. Moreover, no updated
firmware has been released yet and so the firmware loading step must be
skipped. As of today, the TPS23881B is intended to use its ROM firmware.

Signed-off-by: Thomas Wismer <thomas.wismer@scs.ch>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Acked-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20251029212312.108749-2-thomas@wismer.xyz
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 17:56:32 -07:00
Bagas Sanjaya
a7aca10c00 Documentation: netconsole: Separate literal code blocks for full and short netcat command name versions
Both the full and short (abbreviated) command-name versions of the netcat
example are combined into a single literal code block because the 'or::'
paragraph is indented one more space than the preceding paragraph
(before the short-version example).

Unindent it to separate the versions.

Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251030075013.40418-1-bagasdotme@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:54:22 -07:00
Jakub Kicinski
8c1211376e Merge branch 'net-phy-microchip_t1s-add-support-for-lan867x-rev-d0-phy'
Parthiban Veerasooran says:

====================
net: phy: microchip_t1s: Add support for LAN867x Rev.D0 PHY

This patch series adds support for the latest Microchip LAN8670/1/2 Rev.D0
10BASE-T1S PHYs to the microchip_t1s driver.

The new Rev.D0 silicon introduces updated initialization requirements and
link status handling behavior compared to earlier revisions (Rev.C2 and
below). These updates are necessary for full compliance with the OPEN
Alliance 10BASE-T1S specification and are documented in Microchip
Application Note AN1699 Revision G (DS60001699G – October 2025).

Summary of changes:
- Implements Rev.D0-specific configuration sequence as described in AN1699
  Rev.G.
- Introduces link status control configuration for LAN867x Rev.D0.
====================

Link: https://patch.msgid.link/20251030102258.180061-1-parthiban.veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:52:09 -07:00
Parthiban Veerasooran
07f5765f26 net: phy: microchip_t1s: configure link status control for LAN867x Rev.D0
Configure the link status in the Link Status Control register for
LAN8670/1/2 Rev.D0 PHYs, depending on whether PLCA or CSMA/CD mode
is enabled. When PLCA is enabled, the link status reflects the PLCA
status. When PLCA is disabled (CSMA/CD mode), the PHY does not support
autonegotiation, so the link status is forced active by setting
the LINK_STATUS_SEMAPHORE bit.

The link status control is configured:
- During PHY initialization, for default CSMA/CD mode.
- Whenever PLCA configuration is updated.

This ensures correct link reporting and consistent behavior for
LAN867x Rev.D0 devices.

Signed-off-by: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251030102258.180061-3-parthiban.veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:52:06 -07:00
Parthiban Veerasooran
e7e756779a net: phy: microchip_t1s: add support for Microchip LAN867X Rev.D0 PHY
Add support for the LAN8670/1/2 Rev.D0 10BASE-T1S PHYs from Microchip.
The new Rev.D0 silicon requires a specific set of initialization
settings to be configured for optimal performance and compliance with
OPEN Alliance specifications, as described in Microchip Application Note
AN1699 (Revision G, DS60001699G – October 2025).
https://www.microchip.com/en-us/application-notes/an1699

Signed-off-by: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251030102258.180061-2-parthiban.veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:52:05 -07:00
Russell King (Oracle)
9b443e58a8 net: stmmac: qcom-ethqos: remove MAC_CTRL_REG modification
When operating in "SGMII" mode (Cisco SGMII or 2500BASE-X), qcom-ethqos
modifies the MAC control register in its ethqos_configure_sgmii()
function, which is only called from one path:

stmmac_mac_link_up()
+- reads MAC_CTRL_REG
+- masks out priv->hw->link.speed_mask
+- sets bits according to speed (2500, 1000, 100, 10) from priv->hw.link.speed*
+- ethqos_fix_mac_speed()
|  +- qcom_ethqos_set_sgmii_loopback(false)
|  +- ethqos_update_link_clk(speed)
|  `- ethqos_configure(speed)
|     `- ethqos_configure_sgmii(speed)
|        +- reads MAC_CTRL_REG,
|        +- configures PS/FES bits according to speed
|        `- writes MAC_CTRL_REG as the last operation
+- sets duplex bit(s)
+- stmmac_mac_flow_ctrl()
+- writes MAC_CTRL_REG if changed from original read
...

As can be seen, the modification of the control register by
stmmac_mac_link_up() overwrites the changes that ethqos_fix_mac_speed()
makes to the register. This makes ethqos_configure_sgmii()'s
modification questionable at best.

Analysing the values written, GMAC4 sets the speed bits as:
speed_mask = GMAC_CONFIG_FES | GMAC_CONFIG_PS
speed2500 = GMAC_CONFIG_FES                     B14=1 B15=0
speed1000 = 0                                   B14=0 B15=0
speed100 = GMAC_CONFIG_FES | GMAC_CONFIG_PS     B14=1 B15=1
speed10 = GMAC_CONFIG_PS                        B14=0 B15=1

Whereas ethqos_configure_sgmii():
2500: clears ETHQOS_MAC_CTRL_PORT_SEL           B14=X B15=0
1000: clears ETHQOS_MAC_CTRL_PORT_SEL           B14=X B15=0
100: sets ETHQOS_MAC_CTRL_PORT_SEL |            B14=1 B15=1
          ETHQOS_MAC_CTRL_SPEED_MODE
10: sets ETHQOS_MAC_CTRL_PORT_SEL               B14=0 B15=1
    clears ETHQOS_MAC_CTRL_SPEED_MODE

Thus, they appear to be doing something very similar, with the exception
of the FES bit (bit 14) for 1G and 2.5G speeds.

Given that stmmac_mac_link_up() will write the MAC_CTRL_REG after
ethqos_configure_sgmii(), remove the unnecessary update in the
glue driver's ethqos_configure_sgmii() method, simplifying the code.

Konrad states:

Without any additional knowledge, the register description says:

2500: B14=1 B15=0
1000: B14=0 B15=0
 100: B14=1 B15=1
  10: B14=0 B15=1

Tested-by: Mohd Ayaan Anwar <mohd.anwar@oss.qualcomm.com>
Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vEPlg-0000000CFHY-282A@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:50:33 -07:00
Jakub Kicinski
ab5b76806e Merge branch 'convert-mlx5e-and-ipoib-to-ndo_hwtstamp_get-set'
Tariq Toukan says:

====================
Convert mlx5e and IPoIB to ndo_hwtstamp_get/set

This series by Carolina migrates mlx5e and IPoIB to the
ndo_hwtstamp_get/set interface and removes legacy hardware timestamp
ioctl handling.  While doing so, it also cleans up naming and removes
redundant code.

No functional change in timestamp behavior.

Cleanup patches:
- net/mlx5e: Remove redundant tstamp pointer from channel structures
- net/mlx5e: Remove unnecessary tstamp local variable in mlx5i_complete_rx_cqe
- net/mlx5e: Rename hwstamp functions to hwtstamp
- net/mlx5e: Rename timestamp fields to hwtstamp_config

Add suppport in ipoib:
- IB/IPoIB: Add support for hwtstamp get/set ndos

Convert mlx5:
- net/mlx5e: Convert to new hwtstamp_get/set interface
====================

Link: https://patch.msgid.link/1761819910-1011051-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:41:13 -07:00
Carolina Jubran
1c7fe48a90 net/mlx5e: Convert to new hwtstamp_get/set interface
Migrate from the legacy ioctl hardware timestamping interface to the
ndo_hwtstamp_get/set operations.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761819910-1011051-7-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:37:32 -07:00
Carolina Jubran
250da3c8fe IB/IPoIB: Add support for hwtstamp get/set ndos
Add support for the ndo_hwtstamp_get and ndo_hwtstamp_set operations in
IPoIB. This allows lower devices to handle hardware timestamp
configuration through the new ndos instead of the legacy ioctls.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761819910-1011051-6-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:36:43 -07:00
Carolina Jubran
91baaf96f5 net/mlx5e: Rename timestamp fields to hwtstamp_config
Rename hardware timestamp-related fields from 'tstamp' to
'hwtstamp_config' throughout the MLX5 driver. The new name is more
descriptive as it clearly indicates these fields contain hardware
timestamp configuration.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761819910-1011051-5-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:36:43 -07:00
Carolina Jubran
fee182371a net/mlx5e: Rename hwstamp functions to hwtstamp
Rename mlx5e_hwstamp_set/get() functions to mlx5e_hwtstamp_set/get()
to better reflect that these functions handle hardware timestamping,
not just hardware stamping.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761819910-1011051-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:36:43 -07:00
Carolina Jubran
bf79165974 net/mlx5e: Remove unnecessary tstamp local variable in mlx5i_complete_rx_cqe
Remove the tstamp local variable in mlx5i_complete_rx_cqe() and directly
pass the tstamp field from priv to mlx5e_rx_hw_stamp(). The local variable
was only used once and provided no additional value.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761819910-1011051-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:36:43 -07:00
Carolina Jubran
7ea4376b39 net/mlx5e: Remove redundant tstamp pointer from channel structures
Remove the tstamp pointer field from mlx5e_channel, mlx5e_ptp, and
mlx5e_trap structures, since it was only used to reference the tstamp
field in the priv structure. Instead, directly use the tstamp field
from priv when initializing RQ structures.

Also remove the unused hwtstamp_config field from mlx5_clock structure
as part of the cleanup.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761819910-1011051-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 16:36:43 -07:00
Haiyang Zhang
54133f9b4b net: mana: Support HW link state events
Handle the NIC hardware link state events received from the HW
channel, then set the proper link state accordingly.

And, add a feature bit, GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE,
to inform the NIC hardware that this handler exists.

Our MANA NIC only sends out the link state down/up messages
when we need to let the VM rerun the DHCP client and change its IP
address. So, add netif_carrier_on() in probe() to let the NIC
show the right initial state in /sys/class/net/ethX/operstate.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1761770601-16920-1-git-send-email-haiyangz@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 15:56:53 -07:00
Alexis Lothoré (eBPF Foundation)
e6e10c51fb selftests/bpf: Add checks in tc_tunnel when entering net namespaces
test_tc_tunnel is missing checks on any open_netns. Add those checks
anytime we try to enter a net namespace, and skip the related operations
if we fail. While at it, reduce the number of open_netns/close_netns for
cases involving operations in two distinct namespaces: the test
currently does the following:

  nstoken = open_netns("foo")
  do_operation();
  close(nstoken);
  nstoken = open_netns("bar")
  do_another_operation();
  close(nstoken);

As already stated in reviews for the initial test, we don't need to go
back to the root net namespace to enter a second namespace, so just do:

  ntoken_client = open_netns("foo")
  do_operation();
  nstoken_server = open_netns("bar")
  do_another_operation();
  close(nstoken_server);
  close(nstoken_client);

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251031-tc_tunnel_improv-v1-2-0ffe44d27eda@bootlin.com
2025-10-31 15:00:30 -07:00
Alexis Lothoré (eBPF Foundation)
c076fd5bb4 selftests/bpf: Skip tc_tunnel subtest if its setup fails
A subtest setup can fail in a wide variety of ways, so make sure not to
run the subtest if an issue occurs during its setup. The return value
already represents whether the setup succeeded or failed; it is just a
matter of wiring it up.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251031-tc_tunnel_improv-v1-1-0ffe44d27eda@bootlin.com
2025-10-31 15:00:30 -07:00
Sukrut Heroorkar
de90521604 gfs2: document ip in __gfs2_holder_init kernel-doc comment
Building with W=1 reports:
Warning: fs/gfs2/glock.c:1248 function parameter 'ip' not described
in '__gfs2_holder_init'

The ip parameter was added when __gfs2_holder_init started saving the
gfs2_glock_nq_init caller's return address to gh_ip. This makes it
easier to backtrack which holder took the lock. Document @ip to silence
this warning.
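
A minimal sketch of this kind of kernel-doc fix, using a made-up helper (the
real __gfs2_holder_init() takes more parameters):

  struct demo_holder {
          unsigned long gh_ip;
  };

  /**
   * demo_holder_init - initialize a lock holder
   * @gh: holder structure to initialize
   * @ip: return address of the caller, saved in gh_ip for later backtracking
   */
  static void demo_holder_init(struct demo_holder *gh, unsigned long ip)
  {
          gh->gh_ip = ip;         /* @ip is documented above, so W=1 stays quiet */
  }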

Fixes: b016d9a84a ("gfs2: Save ip from gfs2_glock_nq_init")
Signed-off-by: Sukrut Heroorkar <hsukrut3@gmail.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-10-31 21:04:17 +00:00
Bagas Sanjaya
620fc27ef6 Documentation: gfs2: Consolidate GFS2 docs into its own subdirectory
Documentation for GFS2 is scattered in three docs that are in
Documentation/filesystems/ directory. As these docs are standing out as
a group, move them into separate gfs2/ subdirectory.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-10-31 21:00:43 +00:00
Utkarsh Singh
02c03021e2 gfs2/sysfs: Replace sprintf/snprintf with sysfs_emit
Documentation/filesystems/sysfs.rst mentions that show() should only
use sysfs_emit() or sysfs_emit_at() when formatting values returned
to user space. This patch updates the GFS2 sysfs interface accordingly.

It replaces uses of sprintf() and snprintf() in all *_show() functions
with sysfs_emit() to align with current kernel sysfs API best practices.
It also updates the TUNE_ATTR_2 macro to use sysfs_emit() instead of
snprintf().
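
A minimal sketch of the pattern with a made-up attribute (not the actual GFS2
code):

  #include <linux/kobject.h>
  #include <linux/sysfs.h>

  static int demo_value;

  static ssize_t demo_show(struct kobject *kobj, struct kobj_attribute *attr,
                           char *buf)
  {
          /* sysfs_emit() enforces the single-page sysfs limit that
           * sprintf() silently ignores */
          return sysfs_emit(buf, "%d\n", demo_value);
  }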

Signed-off-by: Utkarsh Singh <utkarsh.singh.em@gmail.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
2025-10-31 21:00:42 +00:00
Alexei Starovoitov
ab01bfad61 Merge branch 'selftests-bpf-integrate-test_xsk-c-to-test_progs-framework'
Bastien Curutchet says:

====================
selftests/bpf: Integrate test_xsk.c to test_progs framework

The test_xsk.sh script covers many AF_XDP use cases. The tests it runs
are defined in xskxceiver.c. Since this script is used to test real
hardware, the goal here is to leave it as it is, and only integrate the
tests that run on veth peers into the test_progs framework.

PATCH 1 extracts test_xsk[.c/.h] from xskxceiver[.c/.h] to make the
tests available to test_progs.
PATCH 2 to 7 fix small issues in the current test
PATCH 8 to 13 handle all errors to release resources instead of calling
exit() when any error occurs.
PATCH 14 isolates the tests that won't fit in the CI
PATCH 15 integrates the CI tests to the test_progs framework

Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
---
Changes in v7:
- Restore 'test_ns' prefix to allow parallel execution.
- PATCH 11: fix potential uninitialized variable spotted by AI.
- PATCH 12: fix potential resource leak spotted by AI
- Link to v6: https://lore.kernel.org/r/20251029-xsk-v6-0-5a63a64dff98@bootlin.com

Changes in v6:
- Setup veth peer once for each mode instead of once for each substest
- Rename the 'flaky' table to 'skip-ci' and move the automatically
  skipped and the longest tests into it
- Link to v5: https://lore.kernel.org/r/20251016-xsk-v5-0-662c95eb8005@bootlin.com

Changes in v5:
- Rebase on latest bpf-next_base
- Move XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF to the flaky table
- Add Maciej's reviewed-by
- Link to v4: https://lore.kernel.org/r/20250924-xsk-v4-0-20e57537b876@bootlin.com

Changes in v4:
- Fix test_xsk.sh's summary report.
- Merge PATCH 11 & 12 together, otherwise PATCH 11 fails to build.
- Split old PATCH 3 into two patches. The first one fixes
  testapp_stats_rx_dropped(), the second one fixes
  testapp_xdp_shared_umem(). The unnecessary frees (in
  testapp_stats_rx_full() and testapp_stats_fill_empty()) are removed
- Link to v3: https://lore.kernel.org/r/20250904-xsk-v3-0-ce382e331485@bootlin.com

Changes in v3:
- Rebase on latest bpf-next_base to integrate commit c9110e6f72 ("selftests/bpf:
Fix count write in testapp_xdp_metadata_copy()").
- Move XDP_METADATA_COPY_* tests from flaky-tests to nominal tests
- Link to v2: https://lore.kernel.org/r/20250902-xsk-v2-0-17c6345d5215@bootlin.com

Changes in v2:
- Rebase on the latest bpf-next_base and integrate the newly added tests
  to the work (adjust_tail* and tx_queue_consumer tests)
- Re-order patches to split xskxceiver sooner.
- Fix the bug reported by Maciej.
- Fix verbose mode in test_xsk.sh by keeping kselftest (remove PATCH 1,
  7 and 8)
- Link to v1: https://lore.kernel.org/r/20250313-xsk-v1-0-7374729a93b9@bootlin.com
====================

Link: https://lore.kernel.org/r/20251031-xsk-v7-0-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:53 -07:00
Bastien Curutchet (eBPF Foundation)
d1aec26fce selftests/bpf: test_xsk: Integrate test_xsk.c to test_progs framework
test_xsk.c isn't part of the test_progs framework.

Integrate the tests defined by test_xsk.c into the test_progs framework
through a new file : prog_tests/xsk.c. ZeroCopy mode isn't tested in it
as veth peers don't support it.

Move test_xsk{.c/.h} to prog_tests/.

Add the find_bit library to the test_progs sources in the Makefile as it
is used by test_xsk.c.

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-15-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:39 -07:00
Bastien Curutchet (eBPF Foundation)
75fc630867 selftests/bpf: test_xsk: Isolate non-CI tests
The following tests won't fit in the CI:
- XDP_ADJUST_TAIL_* and SEND_RECEIVE_9K_PACKETS because of their
  flakiness
- UNALIGNED_* because they depend on huge page allocations
- *_RING_SIZE because they depend on HW rings
- TEARDOWN because it's too long

Remove these tests from the nominal tests table so they won't be
run by the CI in an upcoming patch.
Create a skip_ci_tests table to hold them.
Use this skip_ci table in xskxceiver.c to keep all the tests available
from the test_xsk.sh script.

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-14-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:39 -07:00
Bastien Curutchet (eBPF Foundation)
7a96615f2e selftests/bpf: test_xsk: Don't exit immediately on allocation failures
If any allocation in the pkt_stream_*() helpers fails, exit_with_error() is
called. This terminates the program immediately. It prevents the following
tests from running and isn't compliant with the CI.

Return NULL in case of allocation failure.
Return TEST_FAILURE when something goes wrong in the packet generation.
Clean up the resources if a failure happens between two steps of a test.

Move exit_with_error()'s definition into xskxceiver.c as it isn't used
anywhere else now.
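
A condensed sketch of the resulting pattern (simplified stand-ins, not the
actual helpers, which take more arguments and track more state):

  #include <stdlib.h>

  #define TEST_PASS     0
  #define TEST_FAILURE -1

  struct pkt_stream {
          unsigned int nb_pkts;
          void *pkts;
  };

  static struct pkt_stream *pkt_stream_alloc(unsigned int nb_pkts)
  {
          struct pkt_stream *ps = calloc(1, sizeof(*ps));

          if (!ps)
                  return NULL;            /* was: exit_with_error(ENOMEM) */
          ps->pkts = calloc(nb_pkts, 64);
          if (!ps->pkts) {
                  free(ps);
                  return NULL;
          }
          ps->nb_pkts = nb_pkts;
          return ps;
  }

  static int run_one_test(void)
  {
          struct pkt_stream *ps = pkt_stream_alloc(128);

          if (!ps)
                  return TEST_FAILURE;    /* report it; later tests still run */
          /* ... generate and validate traffic ... */
          free(ps->pkts);
          free(ps);
          return TEST_PASS;
  }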

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-13-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:39 -07:00
Bastien Curutchet (eBPF Foundation)
844b13a9ff selftests/bpf: test_xsk: Don't exit immediately if validate_traffic fails
__testapp_validate_traffic() calls exit_with_error() on failures. This
exits the program immediately. It prevents the following tests from
running and isn't compliant with the CI.

Return TEST_FAILURE instead of calling exit_with_error().
Release the resources of the 1st thread if a failure happens between its
creation and the creation of the second thread.

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-12-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:39 -07:00
Bastien Curutchet (eBPF Foundation)
5b2a757a16 selftests/bpf: test_xsk: Don't exit immediately when workers fail
TX and RX workers can fail in many places. These failures trigger a call
to exit_with_error() which exits the program immediately. It prevents the
following tests from running and isn't compliant with the CI.

Add return values to functions that can fail.
Handle failures more smoothly through report_failure().

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-11-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:39 -07:00
Bastien Curutchet (eBPF Foundation)
3f09728f90 selftests/bpf: test_xsk: Don't exit immediately when gettimeofday fails
exit_with_error() is called when gettimeofday() fails. This exits the
program immediately. It prevents the following tests from being run and
isn't compliant with the CI.

Return TEST_FAILURE instead of calling exit_with_error().

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-10-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:38 -07:00
Bastien Curutchet (eBPF Foundation)
f12f1b5d14 selftests/bpf: test_xsk: Don't exit immediately when xsk_attach fails
xsk_reattach_xdp calls exit_with_error() on failures. This exits the
program immediately. It prevents the following tests from being run and
isn't compliant with the CI.

Add a return value to the functions handling XDP attachments to handle
errors more smoothly.

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-9-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:38 -07:00
Bastien Curutchet (eBPF Foundation)
e645bcfb16 selftests/bpf: test_xsk: Add return value to init_iface()
init_iface() doesn't have any return value while it can fail. In case of
failure it calls exit_with_error(), which exits the application
immediately. This prevents the following tests from being run and isn't
compliant with the CI.

Add a return value to init_iface() so errors can be handled more
smoothly.

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-8-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:38 -07:00
Bastien Curutchet (eBPF Foundation)
f477b0fd75 selftests/bpf: test_xsk: Release resources when swap fails
testapp_validate_traffic() doesn't release the sockets and the umem
created by the threads if the test isn't currently in its last step.
Thus, if the swap_xsk_resources() fails before the last step, the
created resources aren't cleaned up.

Clean the sockets and the umem in case of swap_xsk_resources() failure.

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-7-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:38 -07:00
Bastien Curutchet (eBPF Foundation)
e3dfa0faf1 selftests/bpf: test_xsk: Wrap test clean-up in functions
The clean-up done at the end of a test in __testapp_validate_traffic()
isn't wrapped in a function. It isn't convenient if we want to use it
somewhere else in the code.

Wrap the clean-up in two new functions: the first deletes the sockets,
the second releases the umem.

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-6-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:38 -07:00
Bastien Curutchet (eBPF Foundation)
bea4f03897 selftests/bpf: test_xsk: fix memory leak in testapp_xdp_shared_umem()
testapp_xdp_shared_umem() generates a pkt_stream for each xsk in xsk_arr,
where normally xsk_arr[0] gets the pkt_streams and xsk_arr[1] has them
NULLed. At the end of the test, pkt_stream_restore_default() only releases
xsk_arr[0]'s, which leads to memory leaks.

Release the missing pkt_stream at the end of testapp_xdp_shared_umem().

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-5-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:38 -07:00
Bastien Curutchet (eBPF Foundation)
d66e49ffa0 selftests/bpf: test_xsk: fix memory leak in testapp_stats_rx_dropped()
testapp_stats_rx_dropped() generates a pkt_stream twice. The last one
generated is released by pkt_stream_restore_default() at the end of the
test, but the pointer to the first pkt_stream is lost.

Release the 'middle' pkt_stream when it's getting replaced to prevent
memory leaks.

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-4-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:38 -07:00
Bastien Curutchet (eBPF Foundation)
cadc0c1fd7 selftests/bpf: test_xsk: Fix __testapp_validate_traffic()'s return value
__testapp_validate_traffic() is supposed to return an integer value that
tells whether the test passed (0), failed (-1) or was skipped (2). It
actually returns a boolean in the end. This does no harm when the test is
successful, but it can lead to misinterpretation in case of failure, as 1
will be returned instead of -1.

Return TEST_FAILURE (-1) in case of failure, TEST_PASS (0) otherwise.

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-3-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:38 -07:00
Bastien Curutchet (eBPF Foundation)
2233ef8bba selftests/bpf: test_xsk: Initialize bitmap before use
The bitmap is used before being initialized.

Initialize it to zero before using it.
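
The fix in a nutshell (sketch; DECLARE_BITMAP and friends come from the
find_bit library the selftest links against, and MAX_SOCKETS is illustrative):

  #include <linux/bitmap.h>

  #define MAX_SOCKETS 2

  static bool all_sockets_done(void)
  {
          DECLARE_BITMAP(bitmap, MAX_SOCKETS);

          bitmap_zero(bitmap, MAX_SOCKETS);       /* the missing initialization */
          /* ... bits get set as each socket completes ... */
          return bitmap_full(bitmap, MAX_SOCKETS);
  }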

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-2-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:38 -07:00
Bastien Curutchet (eBPF Foundation)
3ab77f35a7 selftests/bpf: test_xsk: Split xskxceiver
AF_XDP features are tested by the test_xsk.sh script but not by the
test_progs framework. The tests used by the script are defined in
xskxceiver.c, which can't be integrated into the test_progs framework as is.

Extract these test definitions from xskxceiver{.c/.h} to put them in new
test_xsk{.c/.h} files.
Keep the main() function and its unshared dependencies in xskxceiver to
avoid impacting the test_xsk.sh script, which is often used to test real
hardware.
Move ksft_test_result_*() calls to xskxceiver.c to keep the kselftest
report valid.

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet@bootlin.com>
Link: https://lore.kernel.org/r/20251031-xsk-v7-1-39fe486593a3@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-31 09:24:38 -07:00
Jakub Kicinski
1a2352ad82 Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.18-rc4).

No conflicts, adjacent changes:

drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
  ded9813d17 ("net: stmmac: Consider Tx VLAN offload tag length for maxSDU")
  26ab9830be ("net: stmmac: replace has_xxxx with core_type")

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-31 06:46:03 -07:00
Maud Spierings
5cf236b89f can: mcp251x: mcp251x_can_probe(): use dev_err_probe()
The currently used combination of dev_err() plus return leaves a loud error
in dmesg even when the error is a deferred probe that gets resolved later,
for example a supply that has not been probed yet.

Use dev_err_probe() to improve the handling/display of errors.
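
A minimal sketch of the pattern (hypothetical probe excerpt; the supply name
and error message are made up):

  #include <linux/device.h>
  #include <linux/regulator/consumer.h>
  #include <linux/spi/spi.h>

  static int demo_probe(struct spi_device *spi)
  {
          struct regulator *vdd = devm_regulator_get(&spi->dev, "vdd");

          /* dev_err_probe() stays quiet on -EPROBE_DEFER, records the
           * deferral reason, and returns the error code */
          if (IS_ERR(vdd))
                  return dev_err_probe(&spi->dev, PTR_ERR(vdd),
                                       "failed to get vdd supply\n");
          return 0;
  }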

Signed-off-by: Maud Spierings <maudspierings@gocontroll.com>
Link: https://patch.msgid.link/20251030-mcp_err-v1-1-eecf737823b7@gocontroll.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-31 13:17:46 +01:00
Marc Kleine-Budde
cf89ae5bd7 Merge patch series "convert can drivers to use ndo_hwtstamp callbacks"
Vadim Fedorenko <vadim.fedorenko@linux.dev> says:

The patchset converts generic ioctl implementation into a pair of
ndo_hwtstamp_get/ndo_hwtstamp_set generic callbacks and replaces
callbacks in drivers.

Link: https://patch.msgid.link/20251029231620.1135640-1-vadim.fedorenko@linux.dev
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-31 13:12:20 +01:00
Vadim Fedorenko
243449f992 can: peak_usb: convert to use ndo_hwtstamp callbacks
Convert driver to use ndo_hwtstamp_set()/ndo_hwtstamp_get() callbacks.
ndo_eth_ioctl handler does nothing after conversion - remove it.

Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Vincent Mailhol <mailhol@kernel.org>
Link: https://patch.msgid.link/20251029231620.1135640-4-vadim.fedorenko@linux.dev
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-31 13:12:18 +01:00
Vadim Fedorenko
336e223258 can: peak_canfd: convert to use ndo_hwtstamp callbacks
Convert driver to use ndo_hwtstamp_set()/ndo_hwtstamp_get() callbacks.
ndo_eth_ioctl handler does nothing after conversion - remove it.

Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Vincent Mailhol <mailhol@kernel.org>
Link: https://patch.msgid.link/20251029231620.1135640-3-vadim.fedorenko@linux.dev
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-31 13:12:18 +01:00
Vadim Fedorenko
4f6b0435c6 can: convert generic HW timestamp ioctl to ndo_hwtstamp callbacks
CAN has a generic implementation of ndo_eth_ioctl which implements only HW
timestamping commands. Implement generic ndo_hwtstamp callbacks and use
them in drivers instead of the generic ioctl interface.

Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Vincent Mailhol <mailhol@kernel.org>
Link: https://patch.msgid.link/20251029231620.1135640-2-vadim.fedorenko@linux.dev
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-31 13:12:18 +01:00
Harshita V Rajput
0d0eb18642 cxgb4: flower: add support for fragmentation
This patch adds support for matching fragmented packets in tc flower
filters.

Previously, commit 93a8540aac ("cxgb4: flower: validate control flags")
added a check using flow_rule_match_has_control_flags() to reject
any rules with control flags, as the driver did not support
fragmentation at that time.

Now, with this patch, support for FLOW_DIS_IS_FRAGMENT is added:
- The driver checks for control flags using
  flow_rule_is_supp_control_flags(), as recommended in
  commit d11e631194 ("flow_offload: add control flag checking helpers").
- If the fragmentation flag is present, the driver sets `fs->val.frag` and
  `fs->mask.frag` accordingly in the filter specification.

Since fragmentation is now supported, the earlier check that rejected all
control flags (flow_rule_match_has_control_flags()) has been removed.
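
A sketch of the control-flags handling described above (the filter spec
structure and the surrounding function are simplified stand-ins for the
cxgb4 code):

  #include <net/flow_offload.h>

  struct demo_filter_spec {
          struct { u8 frag; } val, mask;  /* mirrors fs->val.frag / fs->mask.frag */
  };

  static int demo_check_frag(struct flow_rule *rule,
                             struct demo_filter_spec *fs,
                             struct netlink_ext_ack *extack)
  {
          struct flow_match_control match;

          if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL))
                  return 0;

          flow_rule_match_control(rule, &match);

          /* only the fragment flag is supported; anything else is rejected */
          if (!flow_rule_is_supp_control_flags(FLOW_DIS_IS_FRAGMENT,
                                               match.mask->flags, extack))
                  return -EOPNOTSUPP;

          if (match.mask->flags & FLOW_DIS_IS_FRAGMENT) {
                  fs->val.frag = !!(match.key->flags & FLOW_DIS_IS_FRAGMENT);
                  fs->mask.frag = 1;
          }
          return 0;
  }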

Signed-off-by: Harshita V Rajput <harshitha.vr@chelsio.com>
Signed-off-by: Potnuri Bharat Teja <bharat@chelsio.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251028075255.1391596-1-harshitha.vr@chelsio.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-30 19:07:44 -07:00
Jakub Kicinski
12a7c6a993 Merge tag 'nf-next-25-10-30' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next
Florian Westphal says:

====================
netfilter: updates for net-next

1) Convert nf_tables 'nft_set_iter' usage to use C99 struct
   initialization, from Fernando Fernandez Mancera.
2) Disallow nf_conntrack_max=0.  This was an (undocumented)
   historic inheritance from ip_conntrack (ipv4 only nf_conntrack
   predecessor).  Doing so will simplify future changes to make
   this pernet-tuneable.
3) Fix a typo in conntrack.h comment, from Weibiao Tu.

* tag 'nf-next-25-10-30' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
  netfilter: fix typo in nf_conntrack_l4proto.h comment
  netfilter: conntrack: disable 0 value for conntrack_max setting
  netfilter: nf_tables: use C99 struct initializer for nft_set_iter
====================

Link: https://patch.msgid.link/20251030121954.29175-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-30 17:57:07 -07:00
Jakub Kicinski
1659b441b6 Merge tag 'wireless-next-2025-10-30' of https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next
Johannes Berg says:

====================
Not that many changes this time:

 - mac80211:
   - improved VHT radiotap reporting
   - S1G improvements
   - multi-radio monitor improvements
   - HT action frame handling on 6 GHz
   - mesh rate tracking improvements
   - CSA handling improvements
 - cfg80211: multi-radio debugfs
 - rt2x00: improvements for embedded platforms

* tag 'wireless-next-2025-10-30' of https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next:
  wifi: mac80211: Allow HT Action frame processing on 6 GHz when HE is supported
  wifi: rt2x00: add nvmem eeprom support
  wifi: mac80211: add RX flag to report radiotap VHT information
  net: wireless: Remove redundant pm_runtime_mark_last_busy() calls
  wifi: cfg80211: Add parameters to radio-specific debugfs directories
  wifi: cfg80211: Add debugfs support for multi-radio wiphy
  wifi: mac80211: fix missing RX bitrate update for mesh forwarding path
  wifi: cfg80211: default S1G chandef width to 1MHz
  wifi: mac80211: get probe response chan via ieee80211_get_channel_khz
  wifi: mac80211: reset CRC valid after CSA
  wifi: mac80211_hwsim: advertise puncturing feature support
  wifi: cfg80211/mac80211: validate radio frequency range for monitor mode
  wifi: rt2x00: check retval for of_get_mac_address
====================

Link: https://patch.msgid.link/20251030105355.13216-3-johannes@sipsolutions.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-30 17:38:37 -07:00
Jakub Kicinski
ecca75ae5a selftests: drv-net: replace the nsim ring test with a drv-net one
We are trying to move away from netdevsim-only tests and towards
tests which can be run both against netdevsim and real drivers.

Replace the simple bash script we have for checking ethtool -g/-G
on netdevsim with a Python test tweaking those params as well
as channel count.

The new test is not exactly equivalent to the netdevsim one,
but real drivers don't often support random ring sizes,
let alone modifying max values via debugfs.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251029164930.2923448-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-30 17:32:18 -07:00
Jakub Kicinski
4920abacb1 Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue
Tony Nguyen says:

====================
Intel Wired LAN Driver Updates 2025-10-29 (ice, i40e, idpf, ixgbe, igbvf)

For ice:
Michal converts driver to utilize Page Pool and libeth APIs. Conversion
is based on similar changes done for iavf in order to simplify buffer
management, improve maintainability, and increase code reuse across
Intel Ethernet drivers.

Additional details:
https://lore.kernel.org/20250925092253.1306476-1-michal.kubiak@intel.com

Alexander adds support for header split, configurable via ethtool.

Grzegorz allows for use of 100Mbps on E825C SGMII devices.

For i40e:
Jay Vosburgh avoids sending link state changes to VF if it is already in
the requested state.

For idpf:
Sreedevi removes duplicated defines.

For ixgbe:
Alok Tiwari fixes some typos.

For igbvf:
Alok Tiwari fixes output of VLAN warning message.

* '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue:
  igbvf: fix misplaced newline in VLAN add warning message
  ixgbe: fix typos in ixgbe driver comments
  idpf: remove duplicate defines in IDPF_CAP_RSS
  i40e: avoid redundant VF link state updates
  ice: Allow 100M speed for E825C SGMII device
  ice: implement configurable header split for regular Rx
  ice: switch to Page Pool
  ice: drop page splitting and recycling
  ice: remove legacy Rx and construct SKB
====================

Link: https://patch.msgid.link/20251029231218.1277233-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-30 17:24:57 -07:00
Jens Axboe
8cd5a59e4d io_uring/fdinfo: validate opcode before checking if it's an 128b one
The mixed SQE support assumes that userspace always passes valid data,
which is not the case. Validate the opcode properly before indexing
the io_issue_defs[] array, and pass it through the nospec indexing
as well, since it's a user value indexing a kernel array.
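
The resulting pattern, sketched (io_issue_defs[] and IORING_OP_LAST are the
names mentioned above; the surrounding lines are illustrative, not the patch):

  #include <linux/nospec.h>

  u8 opcode = READ_ONCE(sqe->opcode);

  if (opcode >= IORING_OP_LAST)
          return;                                 /* reject bogus user input */
  opcode = array_index_nospec(opcode, IORING_OP_LAST);
  def = &io_issue_defs[opcode];                   /* safe even under speculation */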

Fixes: 1cba30bf9f ("io_uring: add support for IORING_SETUP_SQE_MIXED")
Reported-by: syzbot+b883b008a0b1067d5833@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-30 17:09:00 -06:00
Zhongqiu Han
059ca8fd69 wifi: ath10k: use = {} to initialize bmi_target_info instead of memset
Initialize the bmi_target_info structure using = {} at declaration time
instead of calling memset() in each bus-specific code path. This
simplifies the code and avoids an explicit memset.

Signed-off-by: Zhongqiu Han <zhongqiu.han@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251030130023.1836808-3-zhongqiu.han@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:08 -07:00
Zhongqiu Han
877f9c22fd wifi: ath10k: use = {} to initialize pm_qos_request instead of memset
Initialize the pm_qos_request structure using = {} instead of memset() in
ath10k_download_fw(). This ensures the structure is properly zeroed before
passing it to cpu_latency_qos_add_request(), and improves efficiency by
avoiding an explicit runtime memset.

Signed-off-by: Zhongqiu Han <zhongqiu.han@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251030130023.1836808-2-zhongqiu.han@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:08 -07:00
Rameshkumar Sundaram
e70515039d wifi: ath12k: unassign arvif on scan vdev create failure
During scan and remain-on-channel requests, a scan link vif (arvif) is
assigned and a temporary vdev is created. If vdev creation fails, the
assigned arvif is left attached until the virtual interface is removed,
leaving a stale link in ahvif.

Fix this by freeing the stale arvif and resetting the corresponding link in
ahvif by calling ath12k_mac_unassign_link_vif() when vdev creation fails.

While at it, propagate the actual error code from ath12k_mac_vdev_create()
instead of returning -EINVAL in ath12k_mac_initiate_hw_scan().

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1

Fixes: 477cabfdb7 ("wifi: ath12k: modify link arvif creation and removal for MLO")
Signed-off-by: Rameshkumar Sundaram <rameshkumar.sundaram@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251026182254.1399650-3-rameshkumar.sundaram@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:08 -07:00
Rameshkumar Sundaram
448bf7b514 wifi: ath12k: enforce vdev limit in ath12k_mac_vdev_create()
Currently, the vdev limit check is performed only in
ath12k_mac_assign_vif_to_vdev(). If the host has already created the
maximum number of vdevs for the radio (ar) and a scan request
arrives for the same radio, ath12k_mac_initiate_hw_scan() attempts
to create a vdev without checking the limit, causing firmware asserts.

Centralize the vdev limit guard by moving the check into
ath12k_mac_vdev_create() so that all callers obey the limit.
While doing this, update the condition from
`num_created_vdevs > (TARGET_NUM_VDEVS(ab) - 1)` to
`num_created_vdevs >= TARGET_NUM_VDEVS(ab)` for clarity and to
eliminate unnecessary arithmetic.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1

Fixes: 0d6e6736ed ("wifi: ath12k: scan statemachine changes for single wiphy")
Fixes: 4938ba733e ("wifi: ath12k: modify remain on channel for single wiphy")
Signed-off-by: Rameshkumar Sundaram <rameshkumar.sundaram@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251026182254.1399650-2-rameshkumar.sundaram@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:08 -07:00
Muna Sinada
09486128ca wifi: ath12k: Set EHT fixed rates for associated STAs
A fixed rate is set for STAs that are associated. This is done
during association or via ath12k_sta_rc_update_wk().

Add EHT fixed rate setting for STAs by calling
ath12k_mac_set_peer_eht_fixed_rate() whenever a fixed rate is set for a
STA. This new function sets the EHT fixed rate for a peer by sending a
WMI command with the updated MCS/NSS rate using the
WMI_PEER_PARAM_FIXED_RATE command id.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1

Co-developed-by: Aaradhana Sahu <quic_aarasahu@quicinc.com>
Signed-off-by: Aaradhana Sahu <quic_aarasahu@quicinc.com>
Signed-off-by: Muna Sinada <muna.sinada@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251024001928.257356-7-muna.sinada@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:08 -07:00
Muna Sinada
5ee9cb2c23 wifi: ath12k: add EHT rates to ath12k_mac_op_set_bitrate_mask()
Extend ath12k_mac_op_set_bitrate_mask() to handle EHT rates.
Create and pass an EHT mask containing MCS and NSS along with EHT GI and
LTF when calling ath12k_mac_set_rate_params().

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1

Co-developed-by: Aaradhana Sahu <quic_aarasahu@quicinc.com>
Signed-off-by: Aaradhana Sahu <quic_aarasahu@quicinc.com>
Signed-off-by: Muna Sinada <muna.sinada@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251024001928.257356-6-muna.sinada@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:08 -07:00
Muna Sinada
ab31a9b73c wifi: ath12k: Add EHT fixed GI/LTF
Add EHT functionality to set fixed GI/LTF parameters.
Add a new WMI vdev parameter id for EHT LTF.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1

Signed-off-by: Muna Sinada <muna.sinada@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251024001928.257356-5-muna.sinada@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:07 -07:00
Muna Sinada
6c95151e2e wifi: ath12k: Add EHT MCS/NSS rates to Peer Assoc
Add EHT MCS/NSS rate functionality to peer association. As part of
ath12k_peer_assoc_h_eht(), add the calculation of EHT MCS/NSS using the
intersection of the link_sta and phy capabilities.

The ath12k_mac_max_eht_mcs_nss() function is used when comparing the
max NSS of the link STA and the phy capability since, in the split-phy
case, the phy supports a max NSS of 2 for the 5 GHz band.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1

Co-developed-by: Vishal Kumar <quic_visk@quicinc.com>
Signed-off-by: Vishal Kumar <quic_visk@quicinc.com>
Co-developed-by: Aloka Dixit <aloka.dixit@oss.qualcomm.com>
Signed-off-by: Aloka Dixit <aloka.dixit@oss.qualcomm.com>
Signed-off-by: Muna Sinada <muna.sinada@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251024001928.257356-4-muna.sinada@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:07 -07:00
Muna Sinada
ec1d9b79be wifi: ath12k: add EHT rate handling to existing set rate functions
Add EHT rate handling to the existing rate functions that validate,
prepare and set rates.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1

Co-developed-by: Aloka Dixit <aloka.dixit@oss.qualcomm.com>
Signed-off-by: Aloka Dixit <aloka.dixit@oss.qualcomm.com>
Signed-off-by: Muna Sinada <muna.sinada@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251024001928.257356-3-muna.sinada@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:07 -07:00
Muna Sinada
770bff7942 wifi: ath12k: generalize GI and LTF fixed rate functions
Currently, functions in mac.c for setting GI and LTF rates are
specifically for HE rates.

Remove any mention of "HE" in such functions in order to allow for
other modes to utilize the functions. The intention is to prepare for
the addition of EHT GI and LTF fixed rate settings.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1

Signed-off-by: Muna Sinada <muna.sinada@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251024001928.257356-2-muna.sinada@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:07 -07:00
Baochen Qiang
088a099690 wifi: ath12k: fix error handling in creating hardware group
In ath12k_core_init(), when ath12k_core_hw_group_create() fails,
ath12k_core_hw_group_destroy() is called, where for each device the
path below gets executed:

	ath12k_core_soc_destroy()
		ath12k_qmi_deinit_service()
			qmi_handle_release()

This results in a kernel crash in case one of the devices fails at
qmi_handle_init() when creating the hardware group:

ath12k_pci 0000:10:00.0: failed to initialize qmi handle
ath12k_pci 0000:10:00.0: failed to initialize qmi :-517
ath12k_pci 0000:10:00.0: failed to create soc core: -517
ath12k_pci 0000:10:00.0: unable to create hw group
BUG: unable to handle page fault for address: ffffffffffffffb7
RIP: 0010:qmi_handle_release
Call Trace:
 <TASK>
 ath12k_qmi_deinit_service
 ath12k_core_hw_group_destroy
 ath12k_core_init
 ath12k_pci_probe

The detailed reason is that when qmi_handle_init() fails for a device,
ab->qmi.handle is not correctly initialized. Then
ath12k_core_hw_group_create() returns failure. Since error handling
is done for all devices, qmi_handle_release() is eventually called for the
failing device as well, and the kernel finally crashes due to the
uninitialized ab->qmi.handle.

Fix this by moving the error handling into ath12k_core_hw_group_create(),
so that the failing device can be skipped.

Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284.1-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3

Fixes: 6f245ea0ec ("wifi: ath12k: introduce device group abstraction")
Link: https://lore.kernel.org/ath12k/fabc97122016d1a66a53ddedd965d134@posteo.net
Reported-by: a-development <a-development@posteo.de>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220518
Tested-by: a-development <a-development@posteo.de>
Signed-off-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251030-fix-hw-group-create-err-handling-v1-1-0659e4d15fb9@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:07 -07:00
Baochen Qiang
00575bb44b wifi: ath12k: fix reusing m3 memory
During firmware recovery or suspend/resume, m3 memory could be reused if
the size of the new m3 binary is equal to or less than that of the
existing memory. There will be issues in the latter case, since
m3_mem->size will be updated with a smaller value and this value is
eventually used in the free path, where the original total size should be
used instead.

To fix it, add a new member to the m3_mem_region structure to track the
original memory size and use it in the free path.

Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00302-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.115823.3

Fixes: 05090ae82f ("wifi: ath12k: check M3 buffer size as well whey trying to reuse it")
Signed-off-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251029-ath12k-fix-m3-reuse-v1-1-69225bacfc5d@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:07 -07:00
Abdun Nihaal
be5febd51c wifi: ath12k: fix potential memory leak in ath12k_wow_arp_ns_offload()
When the call to ath12k_wmi_arp_ns_offload() fails, the temporary memory
allocation for offload is not freed before returning. Fix that by
freeing offload in the error path.

Fixes: 1666108c74 ("wifi: ath12k: support ARP and NS offload")
Signed-off-by: Abdun Nihaal <nihaal@cse.iitm.ac.in>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251028170457.134608-1-nihaal@cse.iitm.ac.in
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-30 14:55:07 -07:00
Paolo Abeni
7ea7694495 Merge branch 'net-smc-make-wr-buffer-count-configurable'
Halil Pasic says:

====================
net/smc: make wr buffer count configurable

The current value of SMC_WR_BUF_CNT is 16, which leads to heavy
contention on the wr_tx_wait wait queue of the SMC-R linkgroup and its
spinlock when many connections are competing for the work request
buffers. Currently up to 256 connections per linkgroup are supported.

To make things worse, when a buffer finally becomes available and
smc_wr_tx_put_slot() signals the linkgroup's wr_tx_wait wq, all the
waiters get woken up because WQ_FLAG_EXCLUSIVE is not used; most of the
time only a single one can proceed, and the rest contend on the spinlock
of the wq just to go back to sleep.

Addressing this by simply bumping SMC_WR_BUF_CNT to 256 was deemed
risky, because the large-ish physically contiguous allocation could fail
and lead to TCP fallbacks. For reference see this discussion thread on
"[PATCH net-next] net/smc: increase SMC_WR_BUF_CNT" (in archive
https://lists.openwall.net/netdev/2024/11/05/186), which concludes with
the agreement to try to come up with something smarter, which is what
this series aims for.

Additionally, if for some reason it is known that heavy contention is not
to be expected, going with something like 256 work request buffers is
wasteful. To address these concerns, make the number of work requests
configurable, and introduce back-off logic which handles -ENOMEM from
smc_wr_alloc_link_mem() gracefully.

v5: https://lore.kernel.org/netdev/20250929000001.1752206-1-pasic@linux.ibm.com/
v4: https://lore.kernel.org/netdev/20250927232144.3478161-1-pasic@linux.ibm.com/
v3: https://lore.kernel.org/netdev/20250921214440.325325-1-pasic@linux.ibm.com/
v2: https://lore.kernel.org/netdev/20250908220150.3329433-1-pasic@linux.ibm.com/
v1: https://lore.kernel.org/all/20250904211254.1057445-1-pasic@linux.ibm.com/
====================

Link: https://patch.msgid.link/20251027224856.2970019-1-pasic@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 13:31:46 +01:00
Halil Pasic
8f736087e5 net/smc: handle -ENOMEM from smc_wr_alloc_link_mem gracefully
Currently a -ENOMEM from smc_wr_alloc_link_mem() is handled by
giving up and going the way of a TCP fallback. This was reasonable
while the sizes of the allocations there were compile-time constants
and reasonably small. But now those are actually configurable.

So instead of giving up, keep retrying with half of the requested size
unless we dip below the old static sizes -- then give up! In terms of
numbers that means we give up when it is certain that we would at best
end up allocating fewer than 16 send WR buffers or fewer than 48 recv WR
buffers. This is to avoid regressions due to having fewer buffers
compared to the static values of the past.

Please note that SMC-R is supposed to be an optimisation over TCP, and
falling back to TCP is superior to establishing an SMC connection that
is going to perform worse. If the memory allocation fails (and we
propagate -ENOMEM), we fall back to TCP.

Preserve (modulo truncation) the ratio of send/recv WR buffer counts.
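
A sketch of the back-off shape described above (illustrative helper; the
actual patch adjusts the counts stored in the link group, and 16/48 are the
old static send/recv minimums quoted above):

  static int demo_alloc_with_backoff(struct smc_link *lnk,
                                     u16 *send_wr, u16 *recv_wr)
  {
          int rc;

          while (1) {
                  rc = smc_wr_alloc_link_mem(lnk);
                  if (rc != -ENOMEM)
                          return rc;              /* success or a non-memory error */
                  if (*send_wr / 2 < 16 || *recv_wr / 2 < 48)
                          return -ENOMEM;         /* would dip below the old sizes */
                  *send_wr /= 2;                  /* keep the send/recv ratio, */
                  *recv_wr /= 2;                  /* modulo truncation */
          }
  }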

Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
Reviewed-by: Mahanta Jambigi <mjambigi@linux.ibm.com>
Reviewed-by: Sidraya Jayagond <sidraya@linux.ibm.com>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Tested-by: Mahanta Jambigi <mjambigi@linux.ibm.com>
Link: https://patch.msgid.link/20251027224856.2970019-3-pasic@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 13:31:43 +01:00
Halil Pasic
aef3cdb47b net/smc: make wr buffer count configurable
Think of SMC_WR_BUF_CNT_SEND := SMC_WR_BUF_CNT used in the send context
and SMC_WR_BUF_CNT_RECV := 3 * SMC_WR_BUF_CNT used in the recv context.
Those get replaced with lgr->max_send_wr and lgr->max_recv_wr
respectively.

Please note that although qp_attr.cap.max_send_wr ==
qp_attr.cap.max_recv_wr is still maintained with the default sysctl
values, it can no longer be assumed to be generally true. I see no
downside to that, but my confidence level is rather modest.

Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
Reviewed-by: Sidraya Jayagond <sidraya@linux.ibm.com>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Tested-by: Mahanta Jambigi <mjambigi@linux.ibm.com>
Link: https://patch.msgid.link/20251027224856.2970019-2-pasic@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 13:31:43 +01:00
caivive (Weibiao Tu)
57347d58a4 netfilter: fix typo in nf_conntrack_l4proto.h comment
In the comment for nf_conntrack_l4proto.h, the word "nfnetink" was
incorrectly spelled. It has been corrected to "nfnetlink".

Fixes a typo to enhance readability and ensure consistency.

Signed-off-by: caivive (Weibiao Tu) <cavivie@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
2025-10-30 12:52:45 +01:00
Florian Westphal
2b749f2576 netfilter: conntrack: disable 0 value for conntrack_max setting
This is an undocumented historical artifact inherited from ip_conntrack.
If the value is 0, then no limit is applied at all; the conntrack table
can grow to a huge size, limited only by the size of the conntrack hashes
and the kernel-internal upper limit on the hash chain lengths.

This feature makes no sense; users can just set
conntrack_max=2147483647 (INT_MAX).

Disallow a 0 value.  This will make it slightly easier to allow
per-netns constraints for this value in a future patch.
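
The usual way such a zero value is rejected is by bounding the sysctl at 1
via .extra1; a sketch with an illustrative table entry (not the actual
netfilter code):

  #include <linux/sysctl.h>

  static int demo_conntrack_max = 262144;

  static struct ctl_table demo_table[] = {
          {
                  .procname     = "nf_conntrack_max",
                  .data         = &demo_conntrack_max,
                  .maxlen       = sizeof(int),
                  .mode         = 0644,
                  .proc_handler = proc_dointvec_minmax,
                  .extra1       = SYSCTL_ONE,     /* reject writes of 0 */
                  .extra2       = SYSCTL_INT_MAX,
          },
  };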

Signed-off-by: Florian Westphal <fw@strlen.de>
2025-10-30 12:52:45 +01:00
Fernando Fernandez Mancera
320d80eeb2 netfilter: nf_tables: use C99 struct initializer for nft_set_iter
Use C99 struct initializer for nft_set_iter, simplifying the code and
preventing future errors due to uninitialized fields if new fields are
added to the struct.
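
A sketch of the idea with a made-up struct (not the real nft_set_iter layout):

  struct demo_iter {
          u32 genmask;
          unsigned int skip;
          int err;
          int (*fn)(void *priv);
  };

  /* before: memset(&iter, 0, sizeof(iter)); then assign fields one by one */
  struct demo_iter iter = {
          .genmask = genmask,
          .fn      = demo_cb,     /* every field not named here is zeroed,
                                     including any member added later */
  };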

Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
2025-10-30 12:52:45 +01:00
Paolo Abeni
ea7d0d60eb Merge branch 'add-cn20k-nix-and-npa-contexts'
Subbaraya Sundeep says:

====================
Add CN20K NIX and NPA contexts

The hardware contexts of the NIX and NPA blocks in CN20K silicon are
different from those of the previous silicons CN10K and CN9XK. This
patchset adds the new CN20K contexts to the AF and PF drivers.
A new mailbox for enqueuing contexts to hardware is added.

Patch 1 simplifies context writing and reading by using the max context
size supported by hardware instead of using each context size.
Patches 2 and 3 add NIX block contexts in the AF driver and extend
debugfs to display those new contexts.
Patches 4 and 5 add NPA block contexts in the AF driver and extend
debugfs to display those new contexts.
Patch 6 omits NDC configuration since CN20K NPA does not use NDC
for caching its contexts.
Patches 7 and 8 use the new NIX and NPA contexts in the PF/VF driver.
Patches 9, 10 and 11 support the additional bandwidth profiles present in
CN20K for RX rate limiting and display the new profiles in debugfs.

v3: https://lore.kernel.org/all/1752772063-6160-1-git-send-email-sbhatta@marvell.com/
====================

Link: https://patch.msgid.link/1761388367-16579-1-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:12 +01:00
Subbaraya Sundeep
33d8a1f457 octeontx2-pf: Use new bandwidth profiles in receive queue
A receive queue points to a bandwidth profile for rate limiting.
Since cn20k has additional bandwidth profiles, use them
too while mapping a receive queue to a bandwidth profile.

Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1761388367-16579-12-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:09 +01:00
Subbaraya Sundeep
47a1208776 octeontx2-af: Display new bandwidth profiles too in debugfs
Consider the new profiles of cn20k too while displaying
bandwidth profile contexts in debugfs.

Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1761388367-16579-11-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:09 +01:00
Subbaraya Sundeep
f7774633cf octeontx2-af: Accommodate more bandwidth profiles for cn20k
CN20K has 16k leaf profiles, 2k middle profiles and
256 top profiles. This patch modifies the existing receive
queue and bandwidth profile context structures to accommodate
the additional profiles of cn20k.

Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1761388367-16579-10-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:09 +01:00
Subbaraya Sundeep
81f1253357 octeontx2-pf: Initialize new NIX SQ context for cn20k
cn20k has a different NIX context for the send queue, hence use
the new cn20k mailbox to init the SQ context.

Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1761388367-16579-9-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:09 +01:00
Linu Cherian
d322fbd172 octeontx2-pf: Initialize cn20k specific aura and pool contexts
With the new CN20K NPA pool and aura contexts supported in the AF
driver, this patch modifies the PF driver to use the new NPA contexts.
Implement new hw_ops for initializing aura and pool contexts
for all the silicons.

Signed-off-by: Linu Cherian <lcherian@marvell.com>
Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1761388367-16579-8-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:09 +01:00
Linu Cherian
a861e5809f octeontx2-af: Skip NDC operations for cn20k
For cn20k, the NPA block doesn't use the general-purpose
NDC (Near Coprocessor Bus Data cache Unit) for caching,
hence skip the NDC-related operations.
Also refactor the NDC configuration code into a helper function.

Signed-off-by: Linu Cherian <lcherian@marvell.com>
Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1761388367-16579-7-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:09 +01:00
Linu Cherian
e4a8e78aca octeontx2-af: Extend debugfs support for cn20k NPA
Extend debugfs to display CN20K NPA aura and pool contexts.

Signed-off-by: Linu Cherian <lcherian@marvell.com>
Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1761388367-16579-6-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:08 +01:00
Linu Cherian
8a8b130127 octeontx2-af: Add cn20k NPA block contexts
New CN20K silicon has NPA hardware context structures different from
previous silicons. Add NPA aura and pool context definitions for cn20k.
Extend NPA context handling support to cn20k.

Signed-off-by: Linu Cherian <lcherian@marvell.com>
Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1761388367-16579-5-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:08 +01:00
Subbaraya Sundeep
45229e9a9a octeontx2-af: Extend debugfs support for cn20k NIX
Extend debugfs to display CN20K NIX send, receive and
completion queue contexts.

Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1761388367-16579-4-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:08 +01:00
Subbaraya Sundeep
b5dcdde074 octeontx2-af: Add cn20k NIX block contexts
New CN20K silicon has NIX hardware context structures different from
previous silicons. Add NIX send and completion queue context
definitions for cn20k. Extend NIX context handling support to cn20k.

Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1761388367-16579-3-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:08 +01:00
Subbaraya Sundeep
85708c5d5f octeontx2-af: Simplify context writing and reading to hardware
Simplify NIX context reading and writing by using hardware
maximum context size instead of using individual sizes of
each context type.

Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1761388367-16579-2-git-send-email-sbhatta@marvell.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-30 10:44:08 +01:00
Steffen Klassert
6b3b6e59c4 pfkey: Deprecate pfkey
The pfkey user configuration interface was replaced by the netlink
user configuration interface more than a decade ago. In the meantime,
all maintained IKE implementations have moved to the netlink interface.
So let the NET_KEY config default to no in Kconfig. The pfkey code
will be removed in a second step.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Acked-by: Antony Antony <antony.antony@secunet.com>
Acked-by: Tobias Brunner <tobias@strongswan.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Tuomo Soini <tis@foobar.fi>
Acked-by: Paul Wouters <paul@nohats.ca>
2025-10-30 09:03:12 +01:00
Thomas Wu
508dfc1f2c wifi: mac80211: Allow HT Action frame processing on 6 GHz when HE is supported
Management frames on 6 GHz do not include HT Capabilities, causing HT
Action frames to be dropped in ieee80211_rx_h_action(). The current logic
checks only ht_cap.ht_supported, which fails for 6 GHz radios that support
only HE and EHT.

Update the condition to also allow HT Action frame processing when
he_cap.has_he is true. This enables support for HE dynamic SM power save
as defined in IEEE Std 802.11ax-2021, section 26.14.4.

Signed-off-by: Thomas Wu <quic_wthomas@quicinc.com>
Signed-off-by: Aaradhana Sahu <aaradhana.sahu@oss.qualcomm.com>
Link: https://patch.msgid.link/20251028043442.523647-1-aaradhana.sahu@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-30 08:40:47 +01:00
Rosen Penev
ec81b33b23 wifi: rt2x00: add nvmem eeprom support
Some embedded platforms have EEPROMs located in flash. Add nvmem support
to handle this. Support is added for PCI and SOC backends.

Signed-off-by: Rosen Penev <rosenp@gmail.com>
Link: https://patch.msgid.link/20251027180639.3797-1-rosenp@gmail.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-30 08:39:18 +01:00
Benjamin Berg
db82ddeaf4 wifi: mac80211: add RX flag to report radiotap VHT information
mac80211 already reports some basic information in the radiotap header
with the known fields declared by the driver. However, drivers may want
to report more accurate information and in that case the full VHT
radiotap structure needs to be provided.

Add a new RX_FLAG_RADIOTAP_VHT which is set when the VHT information
should be pulled from the skb. Update the code to fill in the VHT fields
to only do so when requested by the driver or if the information has not
yet been set. This way the driver can fully control the information if
it chooses so.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251027142118.0bad1c307a21.I2cf285c20a822698039603f2af00ed9c548f2ee0@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-30 08:38:51 +01:00
Sakari Ailus
0a119c68d0 net: wireless: Remove redundant pm_runtime_mark_last_busy() calls
pm_runtime_put_autosuspend(), pm_runtime_put_sync_autosuspend(),
pm_runtime_autosuspend() and pm_request_autosuspend() now include a call
to pm_runtime_mark_last_busy(). Remove the now-redundant explicit call to
pm_runtime_mark_last_busy().
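
For illustration, the removed pattern boils down to the following (dev
here is a generic struct device pointer, not taken from any particular
driver):

  /* before: the driver had to mark the device busy explicitly */
  pm_runtime_mark_last_busy(dev);
  pm_runtime_put_autosuspend(dev);

  /* after: pm_runtime_put_autosuspend() marks last busy internally */
  pm_runtime_put_autosuspend(dev);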

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Link: https://patch.msgid.link/20251027115022.390997-3-sakari.ailus@linux.intel.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-30 08:38:37 +01:00
Jakub Kicinski
1bae0fd900 Merge branch 'net-phy-add-iterator-mdiobus_for_each_phy'
Heiner Kallweit says:

====================
net: phy: add iterator mdiobus_for_each_phy

Add and use an iterator for all PHYs on a MII bus, and phy_find_next()
as a prerequisite.
====================

Link: https://patch.msgid.link/07fc63e8-53fd-46aa-853e-96187bba9d44@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 19:00:37 -07:00
Heiner Kallweit
d4780abb8c net: phy: use new iterator mdiobus_for_each_phy in mdiobus_prevent_c45_scan
Use new iterator mdiobus_for_each_phy() to simplify the code.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/6d792b1e-d23d-4b7e-a94f-89c6617b620f@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 19:00:34 -07:00
Heiner Kallweit
4575875065 net: davinci_mdio: use new iterator mdiobus_for_each_phy
Use new iterator mdiobus_for_each_phy() to simplify the code.

Reviewed-by: Siddharth Vadapalli <s-vadapalli@ti.com>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/326d1337-2c22-42e3-a152-046ac5c43095@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 19:00:34 -07:00
Heiner Kallweit
0514010d55 net: fec: use new iterator mdiobus_for_each_phy
Use new iterator mdiobus_for_each_phy() to simplify the code.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Wei Fang <wei.fang@nxp.com>
Link: https://patch.msgid.link/65eb9490-5666-4b4a-8d26-3fca738b1315@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 19:00:34 -07:00
Heiner Kallweit
26888de97b net: phy: add iterator mdiobus_for_each_phy
Add an iterator for all PHYs on a MII bus, and phy_find_next()
as a prerequisite.
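
A hedged usage sketch, assuming the macro takes the bus and a struct
phy_device cursor in that order and visits every PHY found on the bus:

  struct phy_device *phydev;

  mdiobus_for_each_phy(bus, phydev) {
          /* act on each PHY on the MII bus, e.g. log its address */
          dev_info(&bus->dev, "found PHY at address %d\n",
                   phydev->mdio.addr);
  }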

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Wei Fang <wei.fang@nxp.com>
Link: https://patch.msgid.link/cd112f15-401a-43d9-8525-9ff0965a68cd@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 19:00:34 -07:00
Heiner Kallweit
cf35f4347d net: stmmac: mdio: fix incorrect phy address check
max_addr is the max number of addresses, not the highest possible address,
therefore the check phydev->mdio.addr > max_addr isn't correct.
To fix this, change the semantics of max_addr so that it represents
the highest possible address. IMO this is also a little more intuitive
with respect to the name max_addr.

Fixes: 4a107a0e83 ("net: stmmac: mdio: use phy_find_first to simplify stmmac_mdio_register")
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Reported-by: Simon Horman <horms@kernel.org>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/e869999b-2d4b-4dc1-9890-c2d3d1e8d0f8@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:58:01 -07:00
Sakari Ailus
10c7b9be47 net: wwan: Remove redundant pm_runtime_mark_last_busy() calls
pm_runtime_put_autosuspend(), pm_runtime_put_sync_autosuspend(),
pm_runtime_autosuspend() and pm_request_autosuspend() now include a call
to pm_runtime_mark_last_busy(). Remove the now-redundant explicit call to
pm_runtime_mark_last_busy().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Link: https://patch.msgid.link/20251027115022.390997-4-sakari.ailus@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:55:56 -07:00
Sakari Ailus
a5d937dd0e net: ipa: Remove redundant pm_runtime_mark_last_busy() calls
pm_runtime_put_autosuspend(), pm_runtime_put_sync_autosuspend(),
pm_runtime_autosuspend() and pm_request_autosuspend() now include a call
to pm_runtime_mark_last_busy(). Remove the now-redundant explicit call to
pm_runtime_mark_last_busy().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Link: https://patch.msgid.link/20251027115022.390997-2-sakari.ailus@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:55:56 -07:00
Sakari Ailus
9f2674e1c3 net: ethernet: Remove redundant pm_runtime_mark_last_busy() calls
pm_runtime_put_autosuspend(), pm_runtime_put_sync_autosuspend(),
pm_runtime_autosuspend() and pm_request_autosuspend() now include a call
to pm_runtime_mark_last_busy(). Remove the now-redundant explicit call to
pm_runtime_mark_last_busy().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://patch.msgid.link/20251027115022.390997-1-sakari.ailus@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:55:50 -07:00
Jakub Kicinski
b5171b8996 Merge branch 'net-enetc-add-i-mx94-enetc-support'
Wei Fang says:

====================
net: enetc: Add i.MX94 ENETC support

i.MX94 NETC has two kinds of ENETCs, one is the same as i.MX95, which
can be used as a standalone network port. The other one is an internal
ENETC, it connects to the CPU port of NETC switch through the pseudo
MAC. Also, i.MX94 has multiple PTP Timers, which is different from
i.MX95. Any PTP Timer can be bound to a specified standalone ENETC by
the IERB ETBCR registers. Currently, this patch set only adds ENETC support
and Timer support for i.MX94. The switch will be added by a separate
patch set.

In addition, note that the i.MX94 SoC launched after i.MX95 and its NETC
has a higher version, so the driver support is added after i.MX95.
====================

Link: https://patch.msgid.link/20251029013900.407583-1-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:44:21 -07:00
Wei Fang
2d673b0e2f net: enetc: add standalone ENETC support for i.MX94
The revision of i.MX94 ENETC is changed to v4.3, so add this revision to
enetc_info to support i.MX94 ENETC. Also add PTP support for i.MX94.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://patch.msgid.link/20251029013900.407583-7-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:44:19 -07:00
Wei Fang
5175c1e4ad net: enetc: add basic support for the ENETC with pseudo MAC for i.MX94
The ENETC with pseudo MAC is an internal port which connects to the CPU
port of the switch. The switch CPU/host ENETC is fully integrated with
the switch and does not require a back-to-back MAC; instead, a lightweight
"pseudo MAC" provides the delineation between switch and ENETC.
This translates to lower power (less logic and memory) and lower delay
(as there is no serialization delay across this link).

Different from the standalone ENETC, which is used as the external port,
the internal ENETC has a different PCIe device ID and does not have
Ethernet MAC port registers; instead, it has a small number of pseudo
MAC port registers, so some features are not supported by the pseudo MAC,
such as loopback, half duplex, one-step timestamping and so on.

Therefore, the configuration of this internal ENETC is also somewhat
different from that of the standalone ENETC. So add basic support
for the ENETC with pseudo MAC. More support will be added in the future.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Claudiu Manoil <claudiu.manoil@nxp.com>
Link: https://patch.msgid.link/20251029013900.407583-6-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:44:19 -07:00
Clark Wang
1cd3f21c18 net: enetc: add ptp timer binding support for i.MX94
The i.MX94 has three PTP timers, and all standalone ENETCs can select
one of them to bind to as their PHC. The 'ptp-timer' property is used
to represent the PTP device of the Ethernet controller. So users can
add 'ptp-timer' to the ENETC node to specify the PTP timer. The driver
parses this property to bind the two hardware devices.

If the "ptp-timer" property is not present, the first timer of the PCIe
bus where the ENETC is located is used as the default bound PTP timer.

Signed-off-by: Clark Wang <xiaoning.wang@nxp.com>
Signed-off-by: Wei Fang <wei.fang@nxp.com>
Link: https://patch.msgid.link/20251029013900.407583-5-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:44:19 -07:00
Wei Fang
ba5d7d45ce net: enetc: add preliminary i.MX94 NETC blocks control support
NETC blocks control is used for warm reset and pre-boot initialization.
Different versions of NETC blocks control are not exactly the same. We
need to add corresponding netc_devinfo data for each version. i.MX94
series are launched after i.MX95, so its NETC version (v4.3) is higher
than i.MX95 NETC (v4.1). Currently, the patch adds the following
configurations for ENETCs.

1. Set the link's MII protocol.
2. ENETC 0 (MAC 3) and the switch port 2 (MAC 2) share the same parallel
interface, but due to an SoC constraint, they cannot be used simultaneously.
Since the switch is not supported yet, the interface is assigned to
ENETC 0 by default.

The switch configuration will be added separately in a subsequent patch.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://patch.msgid.link/20251029013900.407583-4-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:44:19 -07:00
Wei Fang
c4430f2ac0 dt-bindings: net: enetc: add compatible string for ENETC with pseduo MAC
The ENETC with pseudo MAC is used to connect to the CPU port of the NETC
switch. This ENETC has a different PCI device ID, so add a standard PCI
device compatible string to it.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://patch.msgid.link/20251029013900.407583-3-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:44:19 -07:00
Wei Fang
3a85ec37bc dt-bindings: net: netc-blk-ctrl: add compatible string for i.MX94 platforms
Add the compatible string "nxp,imx94-netc-blk-ctrl" for i.MX94 platforms.

Signed-off-by: Wei Fang <wei.fang@nxp.com>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://patch.msgid.link/20251029013900.407583-2-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:44:18 -07:00
Jakub Kicinski
76c231b3c2 Merge branch 'icmp-add-rfc-5837-support'
Ido Schimmel says:

====================
icmp: Add RFC 5837 support

tl;dr
=====

This patchset extends certain ICMP error messages (e.g., "Time
Exceeded") with incoming interface information in accordance with RFC
5837 [1]. This is required for more meaningful traceroute results in
unnumbered networks. Like other ICMP settings, the feature is controlled
via a per-{netns, address family} sysctl. The interface and the
implementation are designed to support more ICMP extensions.

Motivation
==========

Over the years, the kernel was extended with the ability to derive the
source IP of ICMP error messages from the interface that received the
datagram which elicited the ICMP error [2][3][4]. This is especially
important for "Time Exceeded" messages as it allows traceroute users to
trace the actual packet path along the network.

The above scheme does not work in unnumbered networks. In these
networks, only the loopback / VRF interface is assigned a global IP
address while router interfaces are assigned IPv6 link-local addresses.
As such, ICMP error messages are generated with a source IP derived from
the loopback / VRF interface, making it impossible to trace the actual
packet path when parallel links exist between routers.

The problem can be solved by implementing the solution proposed by RFC
4884 [5] and RFC 5837. The former defines an ICMP extension structure
that can be appended to selected ICMP messages and carry extension
objects. The latter defines an extension object called the "Interface
Information Object" (IIO) that can carry interface information (e.g.,
name, index, MTU) about interfaces with certain roles such as the
interface that received the datagram which elicited the ICMP error.

The payload of the datagram that elicited the error (potentially padded
/ trimmed) along with the ICMP extension structure will be queued to the
error queue of the originating socket, thereby allowing traceroute
applications to parse and display the information encoded in the ICMP
extension structure. Example:

 # traceroute6 -e 2001:db8:1::3
 traceroute to 2001:db8:1::3 (2001:db8:1::3), 30 hops max, 80 byte packets
  1  2001:db8:1::2 (2001:db8:1::2) <INC:11,"eth1",mtu=1500>  0.214 ms  0.171 ms  0.162 ms
  2  2001:db8:1::3 (2001:db8:1::3) <INC:12,"eth2",mtu=1500>  0.154 ms  0.135 ms  0.127 ms

 # traceroute -e 192.0.2.3
 traceroute to 192.0.2.3 (192.0.2.3), 30 hops max, 60 byte packets
  1  192.0.2.2 (192.0.2.2) <INC:11,"eth1",mtu=1500>  0.191 ms  0.148 ms  0.144 ms
  2  192.0.2.3 (192.0.2.3) <INC:12,"eth2",mtu=1500>  0.137 ms  0.122 ms  0.114 ms

Implementation
==============

As previously stated, the feature is controlled via a per-{netns,
address family} sysctl. Specifically, a bit mask where each bit controls the
addition of a different ICMP extension to ICMP error messages.
Currently, only a single value is supported, to append the incoming
interface information.

Key points:

1. Global knob vs finer control. I am not aware of users who require
finer control, but it is possible that some users will want to avoid
appending ICMP extensions when the packet is sent out of a specific
interface (e.g., the management interface) or to a specific subnet. This
can be accomplished via a tc-bpf program that trims the ICMP extension
structure. An example program can be found here [6].

2. Split implementation between IPv4 / IPv6. While the implementation is
currently similar, there are some differences between both address
families. In addition, some extensions (e.g., RFC 8883 [7]) are
IPv6-specific. Given the above and given that the implementation is not
very complex, it makes sense to keep both implementations separate.

3. Compatibility with legacy applications. RFC 4884 from 2007 extended
certain ICMP messages with a length field that encodes the length of the
"original datagram" field, so that applications will be able to tell
where the "original datagram" ends and where the ICMP extension
structure starts.

Before the introduction of the IP{,6}_RECVERR_RFC4884 socket options
[8][9] in 2020 it was impossible for applications to know where the ICMP
extension structure starts and to this day some applications assume that
it starts at offset 128, which is the minimum length of the "original
datagram" field as specified by RFC 4884.

Therefore, in order to be compatible with both legacy and modern
applications, the datagram that elicited the ICMP error is trimmed /
padded to 128 bytes before appending the ICMP extension structure.

This behavior is specifically called out by RFC 4884: "Those wishing to
be backward compatible with non-compliant TRACEROUTE implementations
will include exactly 128 octets" [10].
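
For modern applications, a hedged sketch of the receive side (socket
option names are from the UAPI referenced above; includes and error
handling omitted):

  int on = 1;

  /* deliver ICMP errors, including the RFC 4884 length information,
   * on the socket's error queue
   */
  setsockopt(fd, SOL_IP, IP_RECVERR, &on, sizeof(on));
  setsockopt(fd, SOL_IP, IP_RECVERR_RFC4884, &on, sizeof(on));

  /* the error (original datagram plus extension structure) is then read
   * back with recvmsg(fd, &msg, MSG_ERRQUEUE) and the RFC 4884 length
   * is reported in the sock_extended_err ancillary data
   */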

Note that in 128 bytes we should be able to include enough headers for
the originating node to match the ICMP error message with the relevant
socket. For example, the following headers will be present in the
"original datagram" field when a VXLAN encapsulated IPv6 packet elicits
an ICMP error in an IPv6 underlay: IPv6 (40) | UDP (8) | VXLAN (8) | Eth
(14) | IPv6 (40) | UDP (8). Overall, 118 bytes.

If the 128 bytes limit proves to be insufficient for some use case, we
can consider dedicating a new bit in the previously mentioned sysctl to
allow for more bytes to be included in the "original datagram" field.

4. Extensibility. This patchset adds partial support for a single ICMP
extension. However, the interface and the implementation should be able
to support more extensions, if needed. Examples:

* More interface information objects as part of RFC 5837. We should be
  able to derive the outgoing interface information and nexthop IP from
  the dst entry attached to the packet that elicited the error.

* Node identification object (e.g., hostname / loopback IP) [11].

* Extended Information object which encodes aggregate header limits as
  part of RFC 8883.

A previous proposal from Ishaan Gandhi and Ron Bonica is available here
[12].

Testing
=======

The existing traceroute selftest is extended to test that ICMP
extensions are reported correctly when enabled. Both address families
are tested and with different packet sizes in order to make sure that
trimming / padding works correctly. Tested that packets are parsed
correctly by the IP{,6}_RECVERR_RFC4884 socket options using Willem's
selftest [13].

Changelog
=========

Changes since v1 [14]:

* Patches #1-#2: Added a comment about field ordering and review tags.

* Patch #3: Converted "sysctl" to "echo" when testing the return value.
  Added a check to skip the test if traceroute version is older
  than 2.1.5.

[1] https://datatracker.ietf.org/doc/html/rfc5837
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1c2fb7f93cb20621772bf304f3dba0849942e5db
[3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fac6fce9bdb59837bb89930c3a92f5e0d1482f0b
[4] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4a8c416602d97a4e2073ed563d4d4c7627de19cf
[5] https://datatracker.ietf.org/doc/html/rfc4884
[6] https://gist.github.com/idosch/5013448cdb5e9e060e6bfdc8b433577c
[7] https://datatracker.ietf.org/doc/html/rfc8883
[8] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=eba75c587e811d3249c8bd50d22bb2266ccd3c0f
[9] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=01370434df85eb76ecb1527a4466013c4aca2436
[10] https://datatracker.ietf.org/doc/html/rfc4884#section-5.3
[11] https://datatracker.ietf.org/doc/html/draft-ietf-intarea-extended-icmp-nodeid-04
[12] https://lore.kernel.org/netdev/20210317221959.4410-1-ishaangandhi@gmail.com/
[13] https://lore.kernel.org/netdev/aPpMItF35gwpgzZx@shredder/
[14] https://lore.kernel.org/netdev/20251022065349.434123-1-idosch@nvidia.com/
====================

Link: https://patch.msgid.link/20251027082232.232571-1-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:28:33 -07:00
Ido Schimmel
02da595751 selftests: traceroute: Add ICMP extensions tests
Test that ICMP extensions are reported correctly when enabled and not
reported when disabled. Test both IPv4 and IPv6 and using different
packet sizes, to make sure trimming / padding works correctly.

Disable ICMP rate limiting (defaults to 1 per-second per-target) so that
the kernel will always generate ICMP errors when needed.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20251027082232.232571-4-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:28:30 -07:00
Ido Schimmel
d12d04d221 ipv6: icmp: Add RFC 5837 support
Add the ability to append the incoming IP interface information to
ICMPv6 error messages in accordance with RFC 5837 and RFC 4884. This is
required for more meaningful traceroute results in unnumbered networks.

The feature is disabled by default and controlled via a new sysctl
("net.ipv6.icmp.errors_extension_mask") which accepts a bitmask of ICMP
extensions to append to ICMP error messages. Currently, only a single
value is supported, but the interface and the implementation should be
able to support more extensions, if needed.

Clone the skb and copy the relevant data portions before modifying the
skb as the caller of icmp6_send() still owns the skb after the function
returns. This should be fine since by default ICMP error messages are
rate limited to 1000 per second and no more than 1 per second per
specific host.

Trim or pad the packet to 128 bytes before appending the ICMP extension
structure in order to be compatible with legacy applications that assume
that the ICMP extension structure always starts at this offset (the
minimum length specified by RFC 4884).

Since commit 20e1954fe2 ("ipv6: RFC 4884 partial support for SIT/GRE
tunnels") it is possible for icmp6_send() to be called with an skb that
already contains ICMP extensions. This can happen when we receive an
ICMPv4 message with extensions from a tunnel and translate it to an
ICMPv6 message towards an IPv6 host in the overlay network. I could not
find an RFC that supports this behavior, but it makes sense to not
overwrite the original extensions that were appended to the packet.
Therefore, avoid appending extensions if the length field in the
provided ICMPv6 header is already filled.

Export netdev_copy_name() using EXPORT_IPV6_MOD_GPL() to make it
available to IPv6 when it is built as a module.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251027082232.232571-3-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:28:30 -07:00
Ido Schimmel
f0e7036fc9 ipv4: icmp: Add RFC 5837 support
Add the ability to append the incoming IP interface information to
ICMPv4 error messages in accordance with RFC 5837 and RFC 4884. This is
required for more meaningful traceroute results in unnumbered networks.

The feature is disabled by default and controlled via a new sysctl
("net.ipv4.icmp_errors_extension_mask") which accepts a bitmask of ICMP
extensions to append to ICMP error messages. Currently, only a single
value is supported, but the interface and the implementation should be
able to support more extensions, if needed.

Clone the skb and copy the relevant data portions before modifying the
skb as the caller of __icmp_send() still owns the skb after the function
returns. This should be fine since by default ICMP error messages are
rate limited to 1000 per second and no more than 1 per second per
specific host.

Trim or pad the packet to 128 bytes before appending the ICMP extension
structure in order to be compatible with legacy applications that assume
that the ICMP extension structure always starts at this offset (the
minimum length specified by RFC 4884).

Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251027082232.232571-2-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:28:29 -07:00
Puranjay Mohan
5701d5aefa bpf: Use kmalloc_nolock() in bpf streams
BPF stream kfuncs need to be non-sleeping as they can be called from
programs running in any context, this requires a way to allocate memory
from any context. Currently, this is done by a custom per-CPU NMI-safe
bump allocation mechanism, backed by alloc_pages_nolock() and
free_pages_nolock() primitives.

As kmalloc_nolock() and kfree_nolock() primitives are available now, the
custom allocator can be removed in favor of these.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251023161448.4263-1-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-29 18:19:46 -07:00
Alexei Starovoitov
d28c0e4921 Merge branch 'misc-rqspinlock-updates'
Kumar Kartikeya Dwivedi says:

====================
Misc rqspinlock updates

A couple of changes for rqspinlock: the first disables propagation of AA
and ABBA deadlocks to waiters succeeding the deadlocking waiter. A more
verbose rationale is available in the commit log. The second commit
expands the stress test to introduce an ABBCCA mode that will reliably
exercise the timeout fallback.
====================

Link: https://lore.kernel.org/r/20251029181828.231529-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-29 18:17:56 -07:00
Kumar Kartikeya Dwivedi
a8a0abf097 selftests/bpf: Add ABBCCA case for rqspinlock stress test
Introduce a new mode for the rqspinlock stress test that exercises a
deadlock that won't be detected by the AA and ABBA checks, such that we
always reliably trigger the timeout fallback. We need 4 CPUs for this
particular case, as CPU 0 is untouched, and three participant CPUs for
triggering the ABBCCA case.

Refactor the lock acquisition paths in the module to better reflect the
three modes and choose the right lock depending on the context.

Also drop ABBA case from running by default as part of test progs, since
the stress test can consume a significant amount of time.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reviewed-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251029181828.231529-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-29 18:17:56 -07:00
Kumar Kartikeya Dwivedi
7bd6e5ce5b rqspinlock: Disable queue destruction for deadlocks
Disable propagation and unwinding of the waiter queue in case the head
waiter detects a deadlock condition, but keep it enabled in case of the
timeout fallback.

Currently, when the head waiter experiences an AA deadlock, it will
signal all its successors in the queue to exit with an error. This is
not ideal for cases where the same lock is held in contexts which can
cause errors in an unrestricted fashion (e.g., BPF programs, or kernel
paths invoked through BPF programs), and core kernel logic which is
written in a correct fashion and does not expect deadlocks.

The same reasoning can be extended to ABBA situations. Depending on the
actual runtime schedule, one or both of the head waiters involved in an
ABBA situation can detect and exit directly without terminating their
waiter queue. If the ABBA situation manifests again, the waiters will
keep exiting until progress can be made, or a timeout is triggered in
case of more complicated locking dependencies.

We still preserve the queue destruction in case of timeouts, as either
the locking dependencies are too complex to be captured by AA and ABBA
heuristics, or the owner is perpetually stuck. As such, it would be
unwise to continue to apply the timeout for each new head waiter without
terminating the queue, since we may end up waiting for more than 250 ms
in aggregate with all participants in the locking transaction.

The patch itself is fairly simple; we can simply signal our successor to
become the next head waiter, and leave the queue without attempting to
acquire the lock.

With this change, the behavior for waiters in case of deadlocks
experienced by a predecessor changes. It is guaranteed that call sites
will no longer receive errors if the predecessors encounter deadlocks
and the successors do not participate in one. This should lower the
failure rate for waiters that are not doing improper locking operations,
just because they were unlucky to queue behind a misbehaving waiter.
However, timeouts are still a possibility, hence they must be accounted
for, so users cannot rely upon errors not occurring at all.

Suggested-by: Amery Hung <ameryhung@gmail.com>
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251029181828.231529-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-29 18:17:56 -07:00
Mykyta Yatsenko
5913e936f6 selftests/bpf: Fix intermittent failures in file_reader test
file_reader/on_open_expect_fault intermittently fails when test_progs
runs tests in parallel, because it expects a page fault on first read.
Another file_reader test running concurrently may have already pulled
the same pages into the page cache, eliminating the fault and causing a
spurious failure.

Make file_reader/on_open_expect_fault read from a file region that does
not overlap with other file_reader tests, so the initial access still
faults even under parallel execution.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20251029195907.858217-1-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-29 18:15:30 -07:00
Kuniyuki Iwashima
b8a7826e4b net: sched: Don't use WARN_ON_ONCE() for -ENOMEM in tcf_classify().
As demonstrated by syzbot, WARN_ON_ONCE() in tcf_classify() can
be easily triggered by fault injection. [0]

We should not use WARN_ON_ONCE() for the simple -ENOMEM case.

Also, we provide SKB_DROP_REASON_NOMEM for the same error.

Let's remove WARN_ON_ONCE() there.
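
A hedged sketch of the direction of the change (helper names as used
elsewhere in cls_api.c; not necessarily the exact merged diff):

  ext = tc_skb_ext_alloc(skb);
  if (!ext) {
          /* an allocation failure is not a bug, just drop with a reason */
          tcf_set_drop_reason(skb, SKB_DROP_REASON_NOMEM);
          return TC_ACT_SHOT;
  }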

[0]:
FAULT_INJECTION: forcing a failure.
name failslab, interval 1, probability 0, space 0, times 0
CPU: 0 UID: 0 PID: 31392 Comm: syz.8.7081 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/02/2025
Call Trace:
 <TASK>
 dump_stack_lvl+0x189/0x250
 should_fail_ex+0x414/0x560
 should_failslab+0xa8/0x100
 kmem_cache_alloc_noprof+0x74/0x6e0
 skb_ext_add+0x148/0x8f0
 tcf_classify+0xeba/0x1140
 multiq_enqueue+0xfd/0x4c0 net/sched/sch_multiq.c:66
...
WARNING: CPU: 0 PID: 31392 at net/sched/cls_api.c:1869 tcf_classify+0xfd7/0x1140
Modules linked in:
CPU: 0 UID: 0 PID: 31392 Comm: syz.8.7081 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/02/2025
RIP: 0010:tcf_classify+0xfd7/0x1140
Code: e8 03 42 0f b6 04 30 84 c0 0f 85 41 01 00 00 66 41 89 1f eb 05 e8 89 26 75 f8 bb ff ff ff ff e9 04 f9 ff ff e8 7a 26 75 f8 90 <0f> 0b 90 49 83 c5 44 4c 89 eb 49 c1 ed 03 43 0f b6 44 35 00 84 c0
RSP: 0018:ffffc9000b7671f0 EFLAGS: 00010293
RAX: ffffffff894addf6 RBX: 0000000000000002 RCX: ffff888025029e40
RDX: 0000000000000000 RSI: ffffffff8bbf05c0 RDI: ffffffff8bbf0580
RBP: 0000000000000000 R08: 00000000ffffffff R09: 1ffffffff1c0bfd6
R10: dffffc0000000000 R11: fffffbfff1c0bfd7 R12: ffff88805a90de5c
R13: ffff88805a90ddc0 R14: dffffc0000000000 R15: ffffc9000b7672c0
FS:  00007f20739f66c0(0000) GS:ffff88812613e000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000110c2d2a80 CR3: 0000000024e36000 CR4: 00000000003526f0
Call Trace:
 <TASK>
 multiq_classify net/sched/sch_multiq.c:39 [inline]
 multiq_enqueue+0xfd/0x4c0 net/sched/sch_multiq.c:66
 dev_qdisc_enqueue+0x4e/0x260 net/core/dev.c:4118
 __dev_xmit_skb net/core/dev.c:4214 [inline]
 __dev_queue_xmit+0xe83/0x3b50 net/core/dev.c:4729
 packet_snd net/packet/af_packet.c:3076 [inline]
 packet_sendmsg+0x3e33/0x5080 net/packet/af_packet.c:3108
 sock_sendmsg_nosec net/socket.c:727 [inline]
 __sock_sendmsg+0x21c/0x270 net/socket.c:742
 ____sys_sendmsg+0x505/0x830 net/socket.c:2630
 ___sys_sendmsg+0x21f/0x2a0 net/socket.c:2684
 __sys_sendmsg net/socket.c:2716 [inline]
 __do_sys_sendmsg net/socket.c:2721 [inline]
 __se_sys_sendmsg net/socket.c:2719 [inline]
 __x64_sys_sendmsg+0x19b/0x260 net/socket.c:2719
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f207578efc9
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f20739f6038 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007f20759e5fa0 RCX: 00007f207578efc9
RDX: 0000000000000004 RSI: 00002000000000c0 RDI: 0000000000000008
RBP: 00007f20739f6090 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001
R13: 00007f20759e6038 R14: 00007f20759e5fa0 R15: 00007f2075b0fa28
 </TASK>

Reported-by: syzbot+87e1289a044fcd0c5f62@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/69003e33.050a0220.32483.00e8.GAE@google.com/
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251028035859.2067690-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 18:00:37 -07:00
Ankit Khushwaha
afb8f6567a selftest: net: fix socklen_t type mismatch in sctp_collision test
Socket APIs like recvfrom(), accept(), and getsockname() expect a socklen_t *
argument, but the tests were using int variables. This causes -Wpointer-sign
warnings on platforms where socklen_t is unsigned.

Change the variable type from int to socklen_t to resolve the warning and
ensure type safety across platforms.

warning fixed:

sctp_collision.c:62:70: warning: passing 'int *' to parameter of
type 'socklen_t *' (aka 'unsigned int *') converts between pointers to
integer types with different sign [-Wpointer-sign]
   62 |                 ret = recvfrom(sd, buf, sizeof(buf),
									0, (struct sockaddr *)&daddr, &len);
      |                                                           ^~~~
/usr/include/sys/socket.h:165:27: note: passing argument to
parameter '__addr_len' here
  165 |                          socklen_t *__restrict __addr_len);
      |                                                ^
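
A minimal sketch of the fix (variable names follow the warning above; the
surrounding declarations are assumed, not copied from the test):

  struct sockaddr_in daddr = {};
  socklen_t len = sizeof(daddr);      /* was: int len */
  char buf[1024];
  ssize_t ret;

  ret = recvfrom(sd, buf, sizeof(buf), 0,
                 (struct sockaddr *)&daddr, &len);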

Reviewed-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251028172947.53153-1-ankitkhushwaha.linux@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 17:39:26 -07:00
Jakub Kicinski
efd3e30e65 Merge branch 'net-stmmac-hwif-c-cleanups'
Russell King says:

====================
net: stmmac: hwif.c cleanups

This series cleans up hwif.c:

- move the reading of the version information out of stmmac_hwif_init()
  into its own function, stmmac_get_version(), storing the result in a
  new struct.

- simplify stmmac_get_version().

- read the version register once, passing it to stmmac_get_id() and
  stmmac_get_dev_id().

- move stmmac_get_id() and stmmac_get_dev_id() into
  stmmac_get_version()

- define version register fields and use FIELD_GET() to decode

- start tackling the big loop in stmmac_hwif_init() - provide a
  function, stmmac_hwif_find(), which looks up the hwif entry, thus
  making a much smaller loop, which improves readability of this code.

- change the use of '^' to '!=' when comparing the dev_id, which is
  what is really meant here.

- reorganise the test after calling stmmac_hwif_init() so that we
  handle the error case in the indented code, and the success case
  with no indent, which is the classical arrangement.
====================

Link: https://patch.msgid.link/aQFZVSGJuv8-_DIo@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 17:18:25 -07:00
Russell King (Oracle)
6436f408eb net: stmmac: reorganise stmmac_hwif_init()
Reorganise stmmac_hwif_init() to handle the error case of
stmmac_hwif_find() in the indented block, which follows normal
programming pattern.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Mohd Ayaan Anwar <mohd.anwar@oss.qualcomm.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vDtfG-0000000CCCX-2YwQ@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 17:18:23 -07:00
Russell King (Oracle)
f9326b139b net: stmmac: use != rather than ^ for comparing dev_id
Use the more usual not-equals rather than exclusive-or operator when
comparing the dev_id in stmmac_hwif_find().

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Mohd Ayaan Anwar <mohd.anwar@oss.qualcomm.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vDtfB-0000000CCCR-25rr@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 17:18:22 -07:00
Russell King (Oracle)
7b510ea8e5 net: stmmac: provide function to lookup hwif
Provide a function to lookup the hwif entry given the core type,
Synopsys version, and device ID (used for XGMAC cores).

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Mohd Ayaan Anwar <mohd.anwar@oss.qualcomm.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vDtf6-0000000CCCL-1cQA@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 17:18:22 -07:00
Russell King (Oracle)
b2fe9e29b5 net: stmmac: use FIELD_GET() for version register
Provide field definitions in common.h, and use these with FIELD_GET()
to extract the fields from the version register.
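
A hedged sketch of the pattern; the mask names and the register offset
variable below are illustrative, not the actual definitions added to
common.h:

  #define SYNOPSYS_ID_USER_VER    GENMASK(15, 8)
  #define SYNOPSYS_ID_CORE_VER    GENMASK(7, 0)

  u32 reg = readl(priv->ioaddr + version_offset);
  u32 user_ver = FIELD_GET(SYNOPSYS_ID_USER_VER, reg);
  u32 core_ver = FIELD_GET(SYNOPSYS_ID_CORE_VER, reg);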

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Mohd Ayaan Anwar <mohd.anwar@oss.qualcomm.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vDtf1-0000000CCCF-0uUV@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 17:18:22 -07:00
Russell King (Oracle)
7b2e41fff7 net: stmmac: move stmmac_get_*id() into stmmac_get_version()
Move the contents of both stmmac_get_id() and stmmac_get_dev_id() into
stmmac_get_version() as it no longer makes sense for these to be
separate functions.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Mohd Ayaan Anwar <mohd.anwar@oss.qualcomm.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vDtew-0000000CCC9-0KeM@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 17:18:22 -07:00
Russell King (Oracle)
c36b97e4ca net: stmmac: consolidate version reading and validation
There is no need to read the version register twice, once in
stmmac_get_id() and then again in stmmac_get_dev_id(). Consolidate
this into stmmac_get_version() and pass this value to each of them.

As both functions unnecessarily issue the same warning for a zero
register value, also move this into stmmac_get_version().

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Mohd Ayaan Anwar <mohd.anwar@oss.qualcomm.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vDteq-0000000CCC3-3zbJ@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 17:18:22 -07:00
Russell King (Oracle)
f49838f77c net: stmmac: simplify stmmac_get_version()
We can simplify stmmac_get_version() by pre-initialising the version
members to zero, detecting the MAC100 core and returning, otherwise
determining the version register offset separately from calling
stmmac_get_id() and stmmac_get_dev_id(). Do this.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Mohd Ayaan Anwar <mohd.anwar@oss.qualcomm.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vDtel-0000000CCBx-3Lpf@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 17:18:22 -07:00
Russell King (Oracle)
fc18b6e98c net: stmmac: move version handling into own function
Move the version handling out of stmmac_hwif_init() and into its own
function, returning the version information through a structure.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Mohd Ayaan Anwar <mohd.anwar@oss.qualcomm.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vDteg-0000000CCBr-2m7q@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 17:18:21 -07:00
Wang Liang
f58abec23d net: ipv4: Remove extern udp_v4_early_demux()/tcp_v4_early_demux() in .c files
Function udp_v4_early_demux() was already declared in 'include/net/udp.h',
no need to keep the extern in 'ip_input.c', which may produce the
following checkpatch warning:

  WARNING: externs should be avoided in .c files
  #45: FILE: net/ipv4/ip_input.c:322:
  +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb);

Replace it by including 'net/udp.h'. Do the same for tcp_v4_early_demux().

Signed-off-by: Wang Liang <wangliang74@huawei.com>
Link: https://patch.msgid.link/20251025092637.1020960-1-wangliang74@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-29 17:05:30 -07:00
Alok Tiwari
9157b8a88c igbvf: fix misplaced newline in VLAN add warning message
Corrected the dev_warn format string:
- "Vlan id %d\n is not added" -> "Vlan id %d is not added\n"

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-29 13:55:22 -07:00
Alok Tiwari
6ef670d833 ixgbe: fix typos in ixgbe driver comments
Corrected function reference:
 - "proc_autoc_read_82599" -> "prot_autoc_read_82599"
Fixed spelling of:
 - "big-enian" -> "big-endian"
 - "Virtualiztion" -> "Virtualization"

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-29 13:55:22 -07:00
Sreedevi Joshi
5d9b400e6f idpf: remove duplicate defines in IDPF_CAP_RSS
Remove duplicate defines from the OR operation when defining IDPF_CAP_RSS.

Duplicate definitions were introduced when IDPF_CAP_RSS was originally
defined and were left behind and went unnoticed during a previous commit
that renamed them. Review of the original out-of-tree code confirms these
duplicates were the result of a typing error.

Remove the duplicates to clean up the code and avoid potential confusion.
Also verify no other duplicate occurrences of these defines exist
elsewhere in the codebase.

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Sreedevi Joshi <sreedevi.joshi@intel.com>
Tested-by: Samuel Salin <Samuel.salin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-29 13:55:22 -07:00
Jay Vosburgh
a7ae783da0 i40e: avoid redundant VF link state updates
Multiple sources can request VF link state changes with identical
parameters. For example, OpenStack Neutron may request to set the VF link
state to IFLA_VF_LINK_STATE_AUTO during every initialization or user can
issue: `ip link set <ifname> vf 0 state auto` multiple times. Currently,
the i40e driver processes each of these requests, even if the requested
state is the same as the current one. This leads to unnecessary VF resets
and can cause performance degradation or instability in the VF driver,
particularly in environments using the Data Plane Development Kit (DPDK).

With this patch i40e will skip VF link state change requests when the
desired link state matches the current configuration. This prevents
unnecessary VF resets and reduces PF-VF communication overhead.
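
A hedged sketch of the early-exit check in the .ndo_set_vf_link_state
handler, assuming the VF state is tracked in the existing
link_forced/link_up fields:

  switch (link) {
  case IFLA_VF_LINK_STATE_AUTO:
          if (!vf->link_forced)
                  return 0;       /* already auto, skip the VF reset */
          break;
  case IFLA_VF_LINK_STATE_ENABLE:
          if (vf->link_forced && vf->link_up)
                  return 0;
          break;
  case IFLA_VF_LINK_STATE_DISABLE:
          if (vf->link_forced && !vf->link_up)
                  return 0;
          break;
  }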

To reproduce the problem, run the following command multiple times
on the same interface: 'ip link set <ifname> vf 0 state auto'
Every time the command is executed, the PF driver will trigger a VF reset.

Co-developed-by: Robert Malz <robert.malz@canonical.com>
Signed-off-by: Robert Malz <robert.malz@canonical.com>
Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Rafal Romanowski <rafal.romanowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-29 13:55:22 -07:00
Grzegorz Nitka
ba2807b869 ice: Allow 100M speed for E825C SGMII device
Add E825C 10GbE SGMII device to the list of devices supporting 100Mbit
link mode. Without that change, 100Mbit link mode is ignored in ethtool
interface. This change was missed while adding support for the E825C
device family.

Testing hints (please note that with the previous version, the 100baseT/Full
entry was missing):
[root@localhost]# ethtool eth3
Settings for eth3:
        Supported ports: [ TP ]
        Supported link modes:   100baseT/Full
                                1000baseT/Full
                                10000baseT/Full
        Supported pause frame use: Symmetric
        Supports auto-negotiation: Yes
        Supported FEC modes: None
        Advertised link modes:  100baseT/Full
                                1000baseT/Full
                                10000baseT/Full
	...

Signed-off-by: Grzegorz Nitka <grzegorz.nitka@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Reviewed-by: Simon Horman <horms@kernel.org>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-29 13:55:22 -07:00
Alexander Lobakin
8adfcfd6a2 ice: implement configurable header split for regular Rx
Add a second page_pool for header buffers to each Rx queue and the ability
to toggle header split on/off using Ethtool (defaulting to off to
match the current behaviour).
Unlike idpf, the HW covered by ice doesn't require any workarounds and
correctly splits all types of packets as configured: after the L4 headers
for TCP/UDP/SCTP, after the L3 headers for other IPv4/IPv6 frames, and after
the Ethernet header otherwise (in case of tunneling, same as above,
but after the innermost headers).
This doesn't affect the XSk path, as there is no benefit to having
it there.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Tested-by: Alexander Nowlin <alexander.nowlin@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-29 13:55:21 -07:00
Michal Kubiak
93f53db9f9 ice: switch to Page Pool
This patch completes the transition of the ice driver to use the Page Pool
and libeth APIs, following the same direction as commit 5fa4caff59
("iavf: switch to Page Pool"). With the legacy page splitting and recycling
logic already removed, the driver is now in a clean state to adopt the
modern memory model.

The Page Pool integration simplifies buffer management by offloading
DMA mapping and recycling to the core infrastructure. This eliminates
the need for driver-specific handling of headroom, buffer sizing, and
page order. The libeth helper is used for CPU-side processing, while
DMA-for-device is handled by the Page Pool core.

Additionally, this patch extends the conversion to cover XDP support.
The driver now uses libeth_xdp helpers for Rx buffer processing,
and optimizes XDP_TX by skipping per-frame DMA mapping. Instead, all
buffers are mapped as bi-directional up front, leveraging Page Pool's
lifecycle management. This significantly reduces overhead in virtualized
environments.

Performance observations:
- In typical scenarios (netperf, XDP_PASS, XDP_DROP), performance remains
  on par with the previous implementation.
- In XDP_TX mode:
  * With IOMMU enabled, performance improves dramatically - over 5x
    increase - due to reduced DMA mapping overhead and better memory reuse.
  * With IOMMU disabled, performance remains comparable to the previous
    implementation, with no significant changes observed.
- In XDP_DROP mode:
  * For small MTUs, (where multiple buffers can be allocated on a single
    memory page), a performance drop of approximately 20% is observed.
    According to 'perf top' analysis, the bottleneck is caused by atomic
    reference counter increments in the Page Pool.
  * For normal MTUs, (where only one buffer can be allocated within a
    single memory page), performance remains comparable to baseline
    levels.

This change is also a step toward a more modular and unified XDP
implementation across Intel Ethernet drivers, aligning with ongoing
efforts to consolidate and streamline feature support.

Suggested-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Suggested-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Michal Kubiak <michal.kubiak@intel.com>
Tested-by: Alexander Nowlin <alexander.nowlin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-29 13:55:16 -07:00
Michal Kubiak
3a4f419f75 ice: drop page splitting and recycling
As part of the transition toward Page Pool integration, remove the
legacy page splitting and recycling logic from the ice driver. This
mirrors the approach taken in commit 920d86f3c5 ("iavf: drop page
splitting and recycling").

The previous model attempted to reuse partially consumed pages by
splitting them and tracking their usage across descriptors. While
this was once a memory optimization, it introduced significant
complexity and overhead in the Rx path, including:
- Manual refcount management and page reuse heuristics;
- Per-descriptor buffer shuffling, which could involve moving dozens
  of `ice_rx_buf` structures per NAPI cycle;
- Increased branching and cache pressure in the hotpath.

This change simplifies the Rx logic by always allocating fresh pages
and letting the networking stack handle their lifecycle. Although this
may temporarily reduce performance (up to ~98% in some XDP cases), it
greatly improves maintainability and paves the way for Page Pool,
which will restore and exceed previous performance levels.

The `ice_rx_buf` array is retained for now to minimize diffstat and
ease future replacement with a shared buffer abstraction.

Co-developed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Michal Kubiak <michal.kubiak@intel.com>
Tested-by: Alexander Nowlin <alexander.nowlin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-29 13:53:12 -07:00
Michal Kubiak
9e314a3c52 ice: remove legacy Rx and construct SKB
The commit 53844673d5 ("iavf: kill 'legacy-rx' for good") removed
the legacy Rx path in the iavf driver. This change applies the same
rationale to the ice driver.

The legacy Rx path relied on manual skb allocation and header copying,
which has become increasingly inefficient and difficult to maintain.
With the stabilization of build_skb() and the growing adoption of
features like XDP, page_pool, and multi-buffer support, the legacy
approach is no longer viable.

Key drawbacks of the legacy path included:
- Higher memory pressure due to direct page allocations and splitting;
- Redundant memcpy() operations for packet headers;
- CPU overhead from eth_get_headlen() and Flow Dissector usage;
- Compatibility issues with XDP, which imposes strict headroom and
  tailroom requirements.

The ice driver, like iavf, does not benefit from the minimal headroom
savings that legacy Rx once offered, as it already splits pages into
fixed halves. Removing this path simplifies the Rx logic, eliminates
unnecessary branches in the hotpath, and prepares the driver for
upcoming enhancements.

In addition to removing the legacy Rx path, this change also eliminates
the custom construct_skb() functions from both the standard and
zero-copy (ZC) Rx paths. These are replaced with the build_skb()
and standardized xdp_build_skb_from_zc() helpers, aligning the driver
with the modern XDP infrastructure and reducing code duplication.

This cleanup also reduces code complexity and improves maintainability
as we move toward a more unified and modern Rx model across drivers.

Co-developed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Michal Kubiak <michal.kubiak@intel.com>
Tested-by: Alexander Nowlin <alexander.nowlin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-29 13:52:55 -07:00
Martin KaFai Lau
e2e668bd81 Merge branch 'selftests-bpf-convert-test_tc_tunnel-sh-to-test_progs'
Alexis Lothoré says:

====================
Hello,
this is v3 of the test_tc_tunnel conversion into the test_progs framework.
This new revision:
- fixes a few issues spotted by the bot reviewer
- removes any test ensuring connection failure (and so depending on a
  timeout) to keep the execution time reasonable

test_tc_tunnel.sh tests a variety of tunnels based on BPF: packets are
encapsulated by a BPF program on the client egress. We then check that
those packets can be decapsulated on server ingress side, either thanks
to kernel-based or BPF-based decapsulation. Those tests are run thanks
to two veths in two dedicated namespaces.

- patches 1 and 2 are preparatory patches
- patch 3 introduce tc_tunnel test into test_progs
- patch 4 gets rid of the test_tc_tunnel.sh script

The new test has been executed both in some x86 local qemu machine, as
well as in CI:

  # ./test_progs -a tc_tunnel
  #454/1   tc_tunnel/ipip_none:OK
  #454/2   tc_tunnel/ipip6_none:OK
  #454/3   tc_tunnel/ip6tnl_none:OK
  #454/4   tc_tunnel/sit_none:OK
  #454/5   tc_tunnel/vxlan_eth:OK
  #454/6   tc_tunnel/ip6vxlan_eth:OK
  #454/7   tc_tunnel/gre_none:OK
  #454/8   tc_tunnel/gre_eth:OK
  #454/9   tc_tunnel/gre_mpls:OK
  #454/10  tc_tunnel/ip6gre_none:OK
  #454/11  tc_tunnel/ip6gre_eth:OK
  #454/12  tc_tunnel/ip6gre_mpls:OK
  #454/13  tc_tunnel/udp_none:OK
  #454/14  tc_tunnel/udp_eth:OK
  #454/15  tc_tunnel/udp_mpls:OK
  #454/16  tc_tunnel/ip6udp_none:OK
  #454/17  tc_tunnel/ip6udp_eth:OK
  #454/18  tc_tunnel/ip6udp_mpls:OK
  #454     tc_tunnel:OK
  Summary: 1/18 PASSED, 0 SKIPPED, 0 FAILED
====================

Link: https://patch.msgid.link/20251027-tc_tunnel-v3-0-505c12019f9d@bootlin.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2025-10-29 12:24:57 -07:00
Alexis Lothoré (eBPF Foundation)
5d3591607d selftests/bpf: Remove test_tc_tunnel.sh
Now that test_tc_tunnel.sh scope has been ported to the test_progs
framework, remove it.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251027-tc_tunnel-v3-4-505c12019f9d@bootlin.com
2025-10-29 12:17:24 -07:00
Alexis Lothoré (eBPF Foundation)
8517b1abe5 selftests/bpf: Integrate test_tc_tunnel.sh tests into test_progs
The test_tc_tunnel.sh script checks that a large variety of tunneling
mechanisms handled by the kernel can be handled as well by eBPF
programs. While this test shares similarities with test_tunnel.c (which
is already integrated in test_progs), those are testing slightly
different things:
- test_tunnel.c creates a tunnel interface, and then get and set tunnel
  keys in packet metadata, from BPF programs.
- test_tc_tunnels.sh manually parses/crafts packets content

Bring the tests covered by test_tc_tunnel.sh into the test_progs
framework, by creating a dedicated test_tc_tunnel.sh. This new test
defines a "generic" runner which, for each test configuration:
- will configure the relevant veth pair, each of those isolated in a
  dedicated namespace
- will check that traffic will fail if there is only an encapsulating
  program attached to one veth egress
- will check that traffic succeeds if we enable some decapsulation module
  on the kernel side
- will check that traffic still succeeds if we replace the kernel
  decapsulation with some eBPF ingress decapsulation.

Example of the new test execution:

  # ./test_progs -a tc_tunnel
  #447/1   tc_tunnel/ipip_none:OK
  #447/2   tc_tunnel/ipip6_none:OK
  #447/3   tc_tunnel/ip6tnl_none:OK
  #447/4   tc_tunnel/sit_none:OK
  #447/5   tc_tunnel/vxlan_eth:OK
  #447/6   tc_tunnel/ip6vxlan_eth:OK
  #447/7   tc_tunnel/gre_none:OK
  #447/8   tc_tunnel/gre_eth:OK
  #447/9   tc_tunnel/gre_mpls:OK
  #447/10  tc_tunnel/ip6gre_none:OK
  #447/11  tc_tunnel/ip6gre_eth:OK
  #447/12  tc_tunnel/ip6gre_mpls:OK
  #447/13  tc_tunnel/udp_none:OK
  #447/14  tc_tunnel/udp_eth:OK
  #447/15  tc_tunnel/udp_mpls:OK
  #447/16  tc_tunnel/ip6udp_none:OK
  #447/17  tc_tunnel/ip6udp_eth:OK
  #447/18  tc_tunnel/ip6udp_mpls:OK
  #447     tc_tunnel:OK
  Summary: 1/18 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251027-tc_tunnel-v3-3-505c12019f9d@bootlin.com
2025-10-29 12:17:22 -07:00
Alexis Lothoré (eBPF Foundation)
86433db932 selftests/bpf: Make test_tc_tunnel.bpf.c compatible with big endian platforms
When trying to run BPF-based encapsulation in an s390x environment, some
parts of test_tc_tunnel.bpf.o do not encapsulate the traffic correctly,
leading to test failures. Adding some logs shows, for example, that
packets about to be sent on an interface with the ip6vxlan_eth program
attached do not have the expected value 5 in the IP header ihl field,
and so are ignored by the program.

This phenomenon appears when cross-compiling the selftests rather than
compiling them on a virtualized host: the selftests build system may
then wrongly pick some host headers. If <asm/byteorder.h> ends up being
picked from the host (and the host endianness differs from the target
one), it exposes the wrong endianness defines (e.g.
__LITTLE_ENDIAN_BITFIELD instead of __BIG_ENDIAN_BITFIELD), which, for
example, messes up the iphdr structure layout used in the eBPF program.

To prevent this, directly use the vmlinux.h header generated by the
selftests build system rather than including specific kernel headers
directly. As a consequence, add some missing definitions that are not
exposed by vmlinux.h, and adapt the bitfield manipulations so that the
program builds and works on both types of platforms.
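
For illustration, the core of the endian-safe approach can be sketched in
plain C (not the actual selftest code): the first byte of an IPv4 header
carries version and IHL at fixed bit positions on the wire, so masking the
raw byte sidesteps the bitfield-layout differences.

  #include <stdint.h>

  /* Version is the high nibble and IHL the low nibble of the first header
   * byte in network byte order, independent of host endianness. */
  static inline int ipv4_header_is_plain(uint8_t first_byte)
  {
          return (first_byte >> 4) == 4 && (first_byte & 0x0f) == 5;
  }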

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251027-tc_tunnel-v3-2-505c12019f9d@bootlin.com
2025-10-29 11:07:26 -07:00
Alexis Lothoré (eBPF Foundation)
1d5137c8d1 selftests/bpf: Add tc helpers
The test_tunnel.c file defines small functions to easily attach eBPF
programs to tc hooks, on egress, ingress, or both.

Create a shared helper in network_helpers.c so that other tests can
benefit from it.
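
For illustration, such a helper could be built on libbpf's TC APIs roughly
as follows; the name and exact signature of the real helper in
network_helpers.c may differ.

  #include <errno.h>
  #include <bpf/libbpf.h>

  /* Hypothetical sketch: ensure the clsact hook exists on the interface,
   * then attach the program at the requested point. */
  static int tc_prog_attach(int ifindex, enum bpf_tc_attach_point point,
                            int prog_fd)
  {
          DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex,
                              .attach_point = point);
          DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts, .prog_fd = prog_fd);
          int err;

          err = bpf_tc_hook_create(&hook);
          if (err && err != -EEXIST)
                  return err;
          return bpf_tc_attach(&hook, &opts);
  }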

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251027-tc_tunnel-v3-1-505c12019f9d@bootlin.com
2025-10-29 11:07:24 -07:00
Tianling Shen
a8abe8e210 net: phy: motorcomm: Add support for PHY LEDs on YT8531
The LED registers on the YT8531 are exactly the same as on the YT8521,
so simply reuse the yt8521_led_hw_* functions.
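
A sketch of what the reuse looks like in the phy_driver entry (the
yt8521_led_hw_* names follow the commit text; this is not the literal
driver table):

  {
          PHY_ID_MATCH_EXACT(PHY_ID_YT8531),
          .name                   = "YT8531 Gigabit Ethernet",
          /* LED registers match the YT8521, so reuse its LED ops */
          .led_hw_is_supported    = yt8521_led_hw_is_supported,
          .led_hw_control_set     = yt8521_led_hw_control_set,
          .led_hw_control_get     = yt8521_led_hw_control_get,
  },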

Tested on OrangePi R1 Plus LTS and Zero3.

Signed-off-by: Tianling Shen <cnsztl@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Jijie Shao <shaojijie@huawei.com>
Link: https://patch.msgid.link/20251026133652.1288732-1-cnsztl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-28 18:56:16 -07:00
Issam Hamdi
61958b33ef net: phy: realtek: Add RTL8224 cable testing support
The RTL8224 can detect open pairs and shorts (within the same pair or
to some other pair), and the distance to the fault can be estimated.
This is done for each of the 4 pairs separately.

It is not meant to be run while there is an active link partner because
this interferes with the active test pulses.

Output with open 50 m cable:

  Pair A code Open Circuit, source: TDR
  Pair A, fault length: 51.79m, source: TDR
  Pair B code Open Circuit, source: TDR
  Pair B, fault length: 51.28m, source: TDR
  Pair C code Open Circuit, source: TDR
  Pair C, fault length: 50.46m, source: TDR
  Pair D code Open Circuit, source: TDR
  Pair D, fault length: 51.12m, source: TDR

Terminated cable:

  Pair A code OK, source: TDR
  Pair B code OK, source: TDR
  Pair C code OK, source: TDR
  Pair D code OK, source: TDR

Shorted cable (both short types are at roughly the same distance):

  Pair A code Short to another pair, source: TDR
  Pair A, fault length: 2.35m, source: TDR
  Pair B code Short to another pair, source: TDR
  Pair B, fault length: 2.15m, source: TDR
  Pair C code OK, source: TDR
  Pair D code Short within Pair, source: TDR
  Pair D, fault length: 1.94m, source: TDR

Signed-off-by: Issam Hamdi <ih@simonwunderlich.de>
Co-developed-by: Sven Eckelmann <se@simonwunderlich.de>
Signed-off-by: Sven Eckelmann <se@simonwunderlich.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251024-rtl8224-cable-test-v1-1-e3cda89ac98f@simonwunderlich.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-28 18:35:08 -07:00
Jakub Kicinski
e9ce7f493e Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue
Tony Nguyen says:

====================
ice: postpone service task disabling

Przemek Kitszel says:

Move service task shutdown to the very end of the driver teardown procedure.
This is needed (or at least beneficial) for all unwinding functions that
talk to FW/HW via the Admin Queue (so, most of the top-level functions, like
ice_deinit_hw()).

Most of the patches move code around (I believe it makes it much easier
to review/prove when kept separate) in preparation for deferring
stopping the service task to the very end of ice_remove() (and other
unwinding flows). The last patch then fixes a duplicate call to
ice_deinit_hw() (a real bug, but unlikely to be hit, hence -next, given
the size of the changes).

The first patch is not closely related; it is included only because it
was developed together with the rest.

* '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue:
  ice: remove duplicate call to ice_deinit_hw() on error paths
  ice: move ice_deinit_dev() to the end of deinit paths
  ice: extract ice_init_dev() from ice_init()
  ice: move ice_init_pf() out of ice_init_dev()
  ice: move udp_tunnel_nic and misc IRQ setup into ice_init_pf()
  ice: ice_init_pf: destroy mutexes and xarrays on memory alloc failure
  ice: move ice_init_interrupt_scheme() prior ice_init_pf()
  ice: move service task start out of ice_init_pf()
  ice: enforce RTNL assumption of queue NAPI manipulation
====================

Link: https://patch.msgid.link/20251024204746.3092277-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-28 18:12:07 -07:00
Rakuram Eswaran
5c00da851c net: tcp_lp: fix kernel-doc warnings and update outdated reference links
Fix kernel-doc warnings in tcp_lp.c by adding missing parameter
descriptions for tcp_lp_cong_avoid() and tcp_lp_pkts_acked() when
building with W=1.
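
The added descriptions follow the usual kernel-doc form, roughly like this
(wording illustrative, assuming the standard cong_avoid signature):

  /**
   * tcp_lp_cong_avoid - congestion avoidance hook for TCP-LP
   * @sk: socket being processed
   * @ack: sequence number acknowledged by the incoming ACK
   * @acked: number of packets newly acked by this ACK
   */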

Also replace invalid URLs in the file header comment with the currently
valid links to the TCP-LP paper and implementation page.

No functional changes.

Signed-off-by: Rakuram Eswaran <rakuram.e96@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251025-net_ipv4_tcp_lp_c-v1-1-058cc221499e@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-28 17:52:44 -07:00
Christophe JAILLET
294bfe0343 sctp: Constify struct sctp_sched_ops
'struct sctp_sched_ops' is not modified in this code.

Constifying this structure moves some data to a read-only section, which
increases overall security, especially when the structure holds
function pointers.
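
A generic illustration of the pattern (hypothetical names, not the sctp
fields): declaring the ops table const lets the compiler place it in
.rodata, so the function pointers cannot be overwritten at run time.

  struct sched_ops {
          int  (*init)(void *ctx);
          void (*enqueue)(void *ctx, void *msg);
  };

  static int  fc_init(void *ctx)               { (void)ctx; return 0; }
  static void fc_enqueue(void *ctx, void *msg) { (void)ctx; (void)msg; }

  /* const moves the table from .data to .rodata */
  static const struct sched_ops fc_sched_ops = {
          .init    = fc_init,
          .enqueue = fc_enqueue,
  };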

On x86_64, with allmodconfig, as an example:
Before:
======
   text	   data	    bss	    dec	    hex	filename
   8019	    568	      0	   8587	   218b	net/sctp/stream_sched_fc.o

After:
=====
   text	   data	    bss	    dec	    hex	filename
   8275	    312	      0	   8587	   218b	net/sctp/stream_sched_fc.o

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://patch.msgid.link/dce03527eb7b7cc8a3c26d5cdac12bafe3350135.1761377890.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-28 17:50:55 -07:00
Bobby Eshleman
8443c31608 net: netmem: remove NET_IOV_MAX from net_iov_type enum
Remove the NET_IOV_MAX workaround from the net_iov_type enum. This entry
was previously added to force the enum size to unsigned long to satisfy
the NET_IOV_ASSERT_OFFSET static assertions.

After commit f3d85c9ee5 ("netmem: introduce struct netmem_desc
mirroring struct page") this workaround became unnecessary, because
net_iov_type is placed after netmem_desc. With that placement, the size
of net_iov_type has no effect on the position or layout of the fields
that mirror struct page.

The layout before this patch:

struct net_iov {
	union {
		struct netmem_desc desc;                 /*     0    48 */
		struct {
			long unsigned int _flags;        /*     0     8 */
			long unsigned int pp_magic;      /*     8     8 */
			struct page_pool * pp;           /*    16     8 */
			long unsigned int _pp_mapping_pad; /*    24     8 */
			long unsigned int dma_addr;      /*    32     8 */
			atomic_long_t pp_ref_count;      /*    40     8 */
		};                                       /*     0    48 */
	};                                               /*     0    48 */
	struct net_iov_area *      owner;                /*    48     8 */
	enum net_iov_type          type;                 /*    56     8 */

	/* size: 64, cachelines: 1, members: 3 */
};

The layout after this patch:

struct net_iov {
	union {
		struct netmem_desc desc;                 /*     0    48 */
		struct {
			long unsigned int _flags;        /*     0     8 */
			long unsigned int pp_magic;      /*     8     8 */
			struct page_pool * pp;           /*    16     8 */
			long unsigned int _pp_mapping_pad; /*    24     8 */
			long unsigned int dma_addr;      /*    32     8 */
			atomic_long_t pp_ref_count;      /*    40     8 */
		};                                       /*     0    48 */
	};                                               /*     0    48 */
	struct net_iov_area *      owner;                /*    48     8 */
	enum net_iov_type          type;                 /*    56     4 */

	/* size: 64, cachelines: 1, members: 3 */
	/* padding: 4 */
};

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Link: https://patch.msgid.link/20251024-b4-devmem-remove-niov-max-v1-1-ba72c68bc869@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-28 17:41:46 -07:00
Eric Dumazet
c72568c21b net: rps: softnet_data reorg to make enqueue_to_backlog() fast
enqueue_to_backlog() is showing up in kernel profiles on hosts
with many cores, when RFS/RPS is used.

The following softnet_data fields need to be updated:

- input_queue_tail
- input_pkt_queue (next, prev, qlen, lock)
- backlog.state (if input_pkt_queue was empty)

Unfortunately, they currently span two cache lines:

	/* --- cacheline 3 boundary (192 bytes) --- */
	call_single_data_t         csd __attribute__((__aligned__(64))); /*  0xc0  0x20 */
	struct softnet_data *      rps_ipi_next;         /*  0xe0   0x8 */
	unsigned int               cpu;                  /*  0xe8   0x4 */
	unsigned int               input_queue_tail;     /*  0xec   0x4 */
	struct sk_buff_head        input_pkt_queue;      /*  0xf0  0x18 */

	/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */

	struct napi_struct         backlog __attribute__((__aligned__(8))); /* 0x108 0x1f0 */

Add one ____cacheline_aligned_in_smp to make sure they now are using
a single cache line.

Also, because napi_struct has fields that are written, make @state its first field.

We want to make sure that cpus adding packets to sd->input_pkt_queue
are not slowing down cpus processing their backlog because of
false sharing.

After this patch new layout is:

	/* --- cacheline 5 boundary (320 bytes) --- */
	long int                   pad[3] __attribute__((__aligned__(64))); /* 0x140  0x18 */
	unsigned int               input_queue_tail;     /* 0x158   0x4 */

	/* XXX 4 bytes hole, try to pack */

	struct sk_buff_head        input_pkt_queue;      /* 0x160  0x18 */
	struct napi_struct         backlog __attribute__((__aligned__(8))); /* 0x178 0x1f0 */
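
The annotation pattern, very roughly (illustrative field names, not the
exact softnet_data definition):

  #include <linux/cache.h>
  #include <linux/skbuff.h>

  struct backlog_example {
          /* fields mostly touched by the local CPU ... */
          unsigned long           local_state;

          /* ____cacheline_aligned_in_smp starts a fresh cache line, so the
           * enqueue-side fields written by remote CPUs share one line and
           * do not false-share with the fields above. */
          unsigned int            input_queue_tail ____cacheline_aligned_in_smp;
          struct sk_buff_head     input_pkt_queue;
  };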

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251024091240.3292546-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-28 17:41:17 -07:00
Eric Dumazet
a086e9860c net: optimize enqueue_to_backlog() for the fast path
Add likely() and unlikely() clauses for the common cases (sketched below):

- the device is running
- the queue is not full
- the queue is below half capacity
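
A small user-space sketch of the annotation idea (hypothetical names, not
the kernel code):

  #include <stdbool.h>

  #define likely(x)   __builtin_expect(!!(x), 1)
  #define unlikely(x) __builtin_expect(!!(x), 0)

  /* Annotate the expected fast path so the compiler keeps the hot code
   * contiguous and pushes the rare branches out of line. */
  static bool can_enqueue(bool dev_running, unsigned int qlen,
                          unsigned int max_backlog)
  {
          if (unlikely(!dev_running))
                  return false;           /* rare: device is down */
          if (unlikely(qlen > max_backlog))
                  return false;           /* rare: queue is full */
          return true;                    /* common case */
  }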

Add max_backlog parameter to skb_flow_limit() to avoid
a second READ_ONCE(net_hotdata.max_backlog).

skb_flow_limit() does not need the backlog_lock protection,
and can be called before we acquire the lock, for even better
resistance to attacks.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251024090517.3289181-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-28 17:39:03 -07:00
Jakub Kicinski
34164142b5 tools: ynl: rework the string representation of NlError
In the early days of YNL development, dumping the NlMsg on errors
was quite useful, as the library itself could have been buggy.
These days the NlMsg increasingly just takes up screen space
and means nothing to a typical user. Try to format the errors
more in line with how YNL C formats its error strings.

Before:
  $ ynl --family ethtool  --do channels-set  --json '{}'
  Netlink error: Invalid argument
  nl_len = 44 (28) nl_flags = 0x300 nl_type = 2
	error: -22
	extack: {'miss-type': 'header'}

  $ ynl --family ethtool  --do channels-set  --json '{..., "tx-count": 999}'
  Netlink error: Invalid argument
  nl_len = 88 (72) nl_flags = 0x300 nl_type = 2
	error: -22
	extack: {'msg': 'requested channel count exceeds maximum', 'bad-attr': '.tx-count'}

After:
  $ ynl --family ethtool  --do channels-set  --json '{}'
  Netlink error: Invalid argument {'miss-type': 'header'}

  $ ynl --family ethtool  --do channels-set  --json '{..., "tx-count": 999}'
  Netlink error: requested channel count exceeds maximum: Invalid argument {'bad-attr': '.tx-count'}

Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20251027192958.2058340-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-28 16:35:06 -07:00
Jakub Kicinski
09e2603513 tools: ynl: fix indent issues in the main Python lib
The NlError() class and operation_do_attributes() are indented by 2
spaces rather than the 4 spaces used by the rest of the file.

Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20251027192958.2058340-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-28 16:35:06 -07:00
Jianyun Gao
54c134f379 libbpf: Fix the incorrect reference to the memlock_rlim variable in the comment.
The variable "memlock_rlim_max" referenced in the comment does not
exist; the author probably meant "memlock_rlim", so correct it.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251027032008.738944-1-jianyungao89@gmail.com
2025-10-28 10:28:53 -07:00
Jianyun Gao
4f361895ae libbpf: Optimize the redundant code in the bpf_object__init_user_btf_maps() function.
The elf_sec_data() function checks its 'scn' input parameter and
directly returns NULL when it is NULL. Its return value therefore
already covers the case where 'scn' is NULL, so the caller only needs
to check whether the return value of elf_sec_data() is NULL.

Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20251024080802.642189-1-jianyungao89@gmail.com
2025-10-28 10:26:00 -07:00
Arnaud Lecomte
23f852daa4 bpf: Fix stackmap overflow check in __bpf_get_stackid()
Syzkaller reported a KASAN slab-out-of-bounds write in __bpf_get_stackid()
when copying stack trace data. The issue occurs when the perf trace
contains more stack entries than the stack map bucket can hold,
leading to an out-of-bounds write in the bucket's data array.
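
The essence of the fix, sketched (illustrative names, not the exact kernel
code): clamp the number of copied entries to what the bucket can hold
before the copy.

  /* never copy more entries than fit in the bucket's data array */
  trace_nr = min(trace_nr, max_depth - skip);
  memcpy(new_bucket->data, ips, trace_nr * sizeof(u64));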

Fixes: ee2a098851 ("bpf: Adjust BPF stack helper functions to accommodate skip > 0")
Reported-by: syzbot+c9b724fbb41cf2538b7b@syzkaller.appspotmail.com
Signed-off-by: Arnaud Lecomte <contact@arnaud-lcm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20251025192941.1500-1-contact@arnaud-lcm.com

Closes: https://syzkaller.appspot.com/bug?extid=c9b724fbb41cf2538b7b
2025-10-28 09:20:27 -07:00
Arnaud Lecomte
e17d62fedd bpf: Refactor stack map trace depth calculation into helper function
Extract the duplicated maximum allowed depth computation for stack
traces stored in BPF stacks from bpf_get_stackid() and __bpf_get_stack()
into a dedicated stack_map_calculate_max_depth() helper function.

This unifies the logic for:
- The max depth computation
- Enforcing the sysctl_perf_event_max_stack limit

No functional changes for existing code paths.
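
The helper plausibly looks something like this (a sketch based on the
description above, not necessarily the exact code):

  static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags)
  {
          u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
          u32 max_depth = size / elem_size + skip;

          return min_t(u32, max_depth, sysctl_perf_event_max_stack);
  }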

Signed-off-by: Arnaud Lecomte <contact@arnaud-lcm.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20251025192858.31424-1-contact@arnaud-lcm.com
2025-10-28 09:20:27 -07:00
Zhang Chujun
88427328e3 bpftool: Fix missing closing parenthesis for BTF_KIND_UNKN
In the btf_dumper_do_type function, the debug print statement for
BTF_KIND_UNKN was missing a closing parenthesis in the output format.
This patch adds the missing ')' to ensure proper formatting of the
dump output.

Signed-off-by: Zhang Chujun <zhangchujun@cmss.chinamobile.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251028063345.1911-1-zhangchujun@cmss.chinamobile.com
2025-10-28 09:00:55 -07:00
Paolo Abeni
cebba694d2 Merge branch 'net-stmmac-add-support-for-coarse-timestamping'
Maxime Chevallier says:

====================
net: stmmac: Add support for coarse timestamping

This is V2 for coarse timestamping support in stmmac. This version uses a
dedicated devlink param "ts_coarse" to control this mode.

This doesn't conflict with Russell's cleanup of hwif.

Maxime

[1] : https://lore.kernel.org/netdev/20200514102808.31163-1-olivier.dautricourt@orolia.com/

V1: https://lore.kernel.org/netdev/20251015102725.1297985-1-maxime.chevallier@bootlin.com/
====================

Link: https://patch.msgid.link/20251024070720.71174-1-maxime.chevallier@bootlin.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 15:34:36 +01:00
Maxime Chevallier
6920fa0c76 net: stmmac: Add a devlink attribute to control timestamping mode
The DWMAC1000 supports two timestamping configurations that control how
frequency adjustments are made to the ptp_clock, as well as how the
timestamp values are reported.

There was a previous attempt at upstreaming support for configuring this
mode by Olivier Dautricourt and Julien Beraud a few years back [1]

In a nutshell, the timestamping can be either set in fine mode or in
coarse mode.

In fine mode, which is the default, we use the overflow of an accumulator to
trigger frequency adjustments, but by doing so we lose precision on the
timestamps that are produced by the timestamping unit. The main drawback
is that the sub-second increment value, used to generate timestamps, can't be
set to lower than (2 / ptp_clock_freq).
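
As a rough worked example (illustrative numbers): with a 250 MHz ptp clock,
fine mode cannot set the sub-second increment below 2 / 250 MHz = 8 ns,
whereas coarse mode can go down to the clock period, 1 / 250 MHz = 4 ns.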

The "fine" qualification comes from the frequent frequency adjustments we are
able to do, which is perfect for a PTP follower usecase.

In coarse mode, we don't do frequency adjustments based on an
accumulator overflow. We can therefore have very fine sub-second
increment values, allowing for better timestamping precision. However,
this mode works best when the ptp clock frequency is adjusted based on
an external signal, such as a PPS input produced by a GPS clock. This
mode is therefore perfect for a grandmaster use case.

Introduce a driver-specific devlink parameter "ts_coarse" to enable or
disable coarse mode, keeping the "fine" mode as a default.

This can then be changed with:

  devlink dev param set <dev> name ts_coarse value true cmode runtime

The associated documentation is also added.

[1] : https://lore.kernel.org/netdev/20200514102808.31163-1-olivier.dautricourt@orolia.com/

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://patch.msgid.link/20251024070720.71174-3-maxime.chevallier@bootlin.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 15:34:35 +01:00
Maxime Chevallier
792000fbcd net: stmmac: Move subsecond increment configuration in dedicated helper
In preparation for fine/coarse support, let's move the subsecond increment
and addend configuration in a dedicated helper.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/20251024070720.71174-2-maxime.chevallier@bootlin.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 15:34:34 +01:00
Paolo Abeni
d7d5eca4de Merge branch 'net-macb-eyeq5-support'
Théo Lebrun says:

====================
net: macb: EyeQ5 support

This series' goal is adding support to the MACB driver for EyeQ5 GEM.
The specifics for this compatible are:

 - HW cannot add dummy bytes at the start of IP packets for alignment
   purposes. The behavior can be detected using DCFG6 so it isn't
   attached to compatible data.

 - The hardware LSO/TSO is known to be buggy: add a compatible
   capability flag to force disable it.

 - At init, we have to wiggle two syscon registers that configure the
   PHY integration.

   In past attempts [0] we did it in macb_config->init() using a syscon
   regmap. That was far from ideal so now a generic PHY driver
   abstracts that away. We reuse the bp->sgmii_phy field used by some
   compatibles.

   We have to add a phy_set_mode() call as the PHY power on sequence
   depends on whether we do RGMII or SGMII.

[0]: https://lore.kernel.org/lkml/20250627-macb-v2-15-ff8207d0bb77@bootlin.com/

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
---
Changes in v3:
- Drop Fixes: trailer on [2/5]. We don't fix any platform using the
  driver currently.
- Improve [5/5] commit message; add info about how an unconditional
  phy_set_mode_ext() won't break existing platforms.
- Hardbreak 82 characters line in [2/5]; warning by patchwork.
- Trailers:
  - 1x Acked-by: Conor Dooley on [1/5].
  - 2x Reviewed-by: Andrew Lunn on [1/5] and [4/5].
  - 2x Reviewed-by: Maxime Chevallier on [4/5] and [5/5].
- Link to v2: https://lore.kernel.org/r/20251022-macb-eyeq5-v2-0-7c140abb0581@bootlin.com

Changes in v2:
- Drop non net-next patches.
- Re-run get_maintainers.pl to shorten the To/Cc list.
- Rebase upon latest net-next; no changes. Tested on HW.
- Link to v1: https://lore.kernel.org/r/20251021-macb-eyeq5-v1-0-3b0b5a9d2f85@bootlin.com

Past versions of the MACB EyeQ5 patches:
 - March 2025: [PATCH net-next 00/13] Support the Cadence MACB/GEM
   instances on Mobileye EyeQ5 SoCs
   https://lore.kernel.org/lkml/20250321-macb-v1-0-537b7e37971d@bootlin.com/
 - June 2025: [PATCH net-next v2 00/18] Support the Cadence MACB/GEM
   instances on Mobileye EyeQ5 SoCs
   https://lore.kernel.org/lkml/20250627-macb-v2-0-ff8207d0bb77@bootlin.com/
 - August 2025: [PATCH net v3 00/16] net: macb: various fixes & cleanup
   https://lore.kernel.org/lkml/20250808-macb-fixes-v3-0-08f1fcb5179f@bootlin.com/

---
Théo Lebrun (5):
      dt-bindings: net: cdns,macb: add Mobileye EyeQ5 ethernet interface
      net: macb: match skb_reserve(skb, NET_IP_ALIGN) with HW alignment
      net: macb: add no LSO capability (MACB_CAPS_NO_LSO)
      net: macb: rename bp->sgmii_phy field to bp->phy
      net: macb: Add "mobileye,eyeq5-gem" compatible

 .../devicetree/bindings/net/cdns,macb.yaml         | 10 +++
 drivers/net/ethernet/cadence/macb.h                |  6 +-
 drivers/net/ethernet/cadence/macb_main.c           | 94 +++++++++++++++++-----
 3 files changed, 91 insertions(+), 19 deletions(-)
---
base-commit: 61b7ade9ba
change-id: 20251020-macb-eyeq5-fe2c0d1edc75

Best regards,
====================

Link: https://patch.msgid.link/20251023-macb-eyeq5-v3-0-af509422c204@bootlin.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 15:17:56 +01:00
Théo Lebrun
48cf0be9b9 net: macb: Add "mobileye,eyeq5-gem" compatible
Add support for the two GEM instances inside Mobileye EyeQ5 SoCs, using
compatible "mobileye,eyeq5-gem". With it, add a custom init sequence
that must grab a generic PHY and initialise it.

We use bp->phy in both RGMII and SGMII cases. Tell the PHY our mode by
adding a phy_set_mode_ext() call during macb_open(), before
phy_power_on(). We are the first users of bp->phy to use it in
non-SGMII cases.
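
A minimal sketch of that call order in macb_open(), using the generic PHY
API (error handling trimmed):

  /* Both calls are no-ops returning 0 when bp->phy is NULL, so they are
   * safe on platforms without a generic PHY. */
  ret = phy_set_mode_ext(bp->phy, PHY_MODE_ETHERNET, bp->phy_interface);
  if (ret)
          return ret;

  ret = phy_power_on(bp->phy);
  if (ret)
          return ret;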

The phy_set_mode_ext() call is made unconditionally. It cannot cause
issues on platforms where !bp->phy or !bp->phy->ops->set_mode as, in
those cases, the call is a no-op (returning zero). From reading
upstream DTS, we can figure out that no platform has a bp->phy and a
PHY driver that has a .set_mode() implementation:
 - cdns,zynqmp-gem: no DTS upstream.
 - microchip,mpfs-macb: microchip/mpfs.dtsi, &mac0..1, no PHY attached.
 - xlnx,versal-gem: xilinx/versal-net.dtsi, &gem0..1, no PHY attached.
 - xlnx,zynqmp-gem: xilinx/zynqmp.dtsi, &gem0..3, PHY attached to
   drivers/phy/xilinx/phy-zynqmp.c which has no .set_mode().

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251023-macb-eyeq5-v3-5-af509422c204@bootlin.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 15:17:54 +01:00
Théo Lebrun
3f7e51cd5f net: macb: rename bp->sgmii_phy field to bp->phy
The bp->sgmii_phy field is initialised at probe by init_reset_optional()
if bp->phy_interface == PHY_INTERFACE_MODE_SGMII. It gets used by:
 - zynqmp_config: "cdns,zynqmp-gem" or "xlnx,zynqmp-gem" compatibles.
 - mpfs_config: "microchip,mpfs-macb" compatible.
 - versal_config: "xlnx,versal-gem" compatible.

Make name more generic as EyeQ5 requires the PHY in SGMII & RGMII cases.

Drop "for ZynqMP SGMII mode" comment that is already a lie, as it gets
used on Microchip platforms as well. And soon it won't be SGMII-only.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251023-macb-eyeq5-v3-4-af509422c204@bootlin.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 15:17:54 +01:00
Théo Lebrun
7a3d209145 net: macb: add no LSO capability (MACB_CAPS_NO_LSO)
LSO is runtime-detected using the PBUF_LSO field inside register DCFG6.
Allow disabling that feature if it is broken by using bp->caps coming
from match data.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251023-macb-eyeq5-v3-3-af509422c204@bootlin.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 15:17:54 +01:00
Théo Lebrun
ae7a9585ea net: macb: match skb_reserve(skb, NET_IP_ALIGN) with HW alignment
If HW is RSC capable, it cannot add dummy bytes at the start of IP
packets. The alignment (i.e. the number of dummy bytes) is configured using the
RBOF field inside the NCFGR register.

On the software side, the skb_reserve(skb, NET_IP_ALIGN) call must only
be done if those dummy bytes are added by the hardware; notice the
skb_reserve() is done AFTER writing the address to the device.

We cannot do the skb_reserve() call BEFORE writing the address because
the address field ignores the low 2/3 bits. Conclusion: in some cases,
we risk not being able to respect the NET_IP_ALIGN value (which is
picked based on unaligned CPU access performance).

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251023-macb-eyeq5-v3-2-af509422c204@bootlin.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 15:17:54 +01:00
Théo Lebrun
c51aa14be9 dt-bindings: net: cdns,macb: add Mobileye EyeQ5 ethernet interface
Add "cdns,eyeq5-gem" as compatible for the integrated GEM block inside
Mobileye EyeQ5 SoCs. It is different from other compatibles in two main
ways: (1) it requires a generic PHY and (2) it is better to keep TCP
Segmentation Offload (TSO) disabled.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251023-macb-eyeq5-v3-1-af509422c204@bootlin.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 15:17:54 +01:00
Li Qiang
3df2849667 wifi: iwlwifi: mld: add null check for kzalloc() in iwl_mld_send_proto_offload()
Add a missing NULL pointer check after kzalloc() in
iwl_mld_send_proto_offload(). Without this check, a failed
allocation could lead to a NULL dereference.
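
The pattern being added, generically (not the driver's exact variable
names):

  cmd = kzalloc(cmd_size, GFP_KERNEL);
  if (!cmd)
          return -ENOMEM;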

Fixes: d1e879ec60 ("wifi: iwlwifi: add iwlmld sub-driver")
Signed-off-by: Li Qiang <liqiang01@kylinos.cn>
Link: https://patch.msgid.link/20251017041128.1379715-1-liqiang01@kylinos.cn
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
2025-10-28 16:17:27 +02:00
Emmanuel Grumbach
75dd87e3f1 wifi: iwlwifi: mld: check for NULL pointer after kmalloc
Coverity complained that we didn't add a NULL check for the link we
allocate.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.d1f958160c5a.Icc891c14c633c3b8625372680fdc67ca33c83cc7@changeid
2025-10-28 16:17:27 +02:00
Johannes Berg
5ee10092eb wifi: iwlwifi: cfg: fix a few device names
There are going to be some devices called BN203 instead of
BN201, adjust the names accordingly.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.2ad0f42ffb57.I2c7864d33f0d7d3dc49381949571c4ce620a9723@changeid
2025-10-28 16:17:27 +02:00
Nidhish A N
9e69bcb527 wifi: iwlwifi: mld: Move EMLSR prints to IWL_DL_EHT
Modify EMLSR debug prints to use IWL_DL_EHT instead
of IWL_DL_INFO. This will allow better communication
with validation as they might enable only IWL_DL_EHT
or IWL_DL_INFO as required.

Add prints to log attempt to switch links when
missed beacons exceed threshold.

Print both link ids and missed beacons when in EMLSR
mode.

Signed-off-by: Nidhish A N <nidhish.a.n@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.3bfc2bc8f410.I405ab2aa81af1ba0ea5eaff343eae1778f2035d9@changeid
2025-10-28 16:17:27 +02:00
Emmanuel Grumbach
7ed47d4294 wifi: iwlwifi: disable EHT if the device doesn't allow it
We have a few devices that don't allow EHT. Make sure we reflect this
towards mac80211 so that we won't try to enable it.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.71121f4e5557.I49e2329d4121f9e52d0889156d0c3e8778e27d88@changeid
2025-10-28 16:17:27 +02:00
Johannes Berg
0d0e8149c6 wifi: iwlwifi: bump core version for BZ/SC/DR
Start supporting Core 101 FW on these devices.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.23300d52cd8b.I6aad50aed469d7734c165086796dfa9cdf9d81bd@changeid
2025-10-28 16:17:27 +02:00
Johannes Berg
bd8a6e46e6 wifi: iwlwifi: mld: use FW_CHECK on bad ROC notification
If the firmware sends a ROC notification after the driver
cancelled it, we can get into this WARN_ON(). Don't do that,
use IWL_FW_CHECK() instead.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Tested-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.48aff2c8922e.Ie27b21eb26b67c8010d13ce9590751cad417d1ad@changeid
2025-10-28 16:17:27 +02:00
Johannes Berg
3e24ba621b wifi: iwlwifi: mvm/mld: report non-HT frames as 20 MHz
Non-HT frames can only be encoded in 20 MHz, however, they
could be duplicated on all/some of the subchannels (mostly
used for RTS/CTS), in which case the firmware will report
an estimate of the overall used bandwidth based on energy
detected. This could be confusing so don't report it that
way, always use 20 MHz for non-HT/legacy frames instead.

Note that currently the value doesn't appear to be used by
mac80211, it never checks the bandwidth field for legacy
encodings.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Benjamin Berg <benjamin.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.23e6695039ca.I3da7c542bde6de4362755f200248dbcc12aa246e@changeid
2025-10-28 16:17:27 +02:00
Johannes Berg
50d0cafec6 wifi: iwlwifi: bump core version for BZ/SC/DR
Start supporting Core 100 FW on those devices.
In addition, the move to the new Core scheme (instead of API scheme)
will start Core 100 and not 99, as planned. Adjust for that as well.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.00c28b4259da.Idd6281cd647f1b33f2572a8c107c3a3228e03665@changeid
2025-10-28 16:17:26 +02:00
Johannes Berg
f67cf9aaae wifi: iwlwifi: fix build when mvm/mld not configured
When neither mvm nor mld are configured, we don't have the
iwl_bz_mac_cfg symbol and thus cannot check for it. But in
that case the relevant device entries aren't and cannot be
present, so just ifdef the test code for that.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202509170625.BAJBe7Bi-lkp@intel.com/
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.615810979e7b.I9a215f955bb3208d99239be8496d19e0f186b4d0@changeid
2025-10-28 16:17:26 +02:00
Miri Korenblit
7906c61a8f wifi: iwlwifi: mld: check the validity of noa_len
Validate iwl_probe_resp_data_notif::noa_attr::len_low since we are using
its value to determine the noa_len, which is later used for the NoA
attribute.

Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.b127a2b57e8c.I7ccaf118d236fb39da5da351b95ad9b37b825bc2@changeid
2025-10-28 16:17:26 +02:00
Emmanuel Grumbach
d24076e075 wifi: iwlwifi: stop checking the firmware's error pointer
It is not very clear what values we put in min_umac_error_event_table.
For Ma (Meteor Lake), this value is wrong and we get the print:
Not valid error log pointer ...

Just remove the check.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.a64d0803150f.Ie2db385f68e17fb0adcdcb16e5bf0125289e177d@changeid
2025-10-28 16:17:26 +02:00
Emmanuel Grumbach
8377e92a3a wifi: iwlwifi: be more chatty when we fail to find a wifi7 device
All wifi7 devices need CONFIG_IWLMLD to be enabled.
If we can't support the wifi7 device and the module is not enabled,
complain to the user.
The check in iwl_req_fw_callback is then no longer required.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.071dae9a5de2.I1603085bac5a796442faa75982f8675647becfec@changeid
2025-10-28 16:17:26 +02:00
Nidhish A N
7f59fadbcb wifi: iwlwifi: fw: remove support of several iwl_lari_config_change_cmd versions
We only need versions 1, 6, 8 and 12.
Remove versions 2, 3, 4, 5, 7, 9, 10, 11.

Signed-off-by: Nidhish A N <nidhish.a.n@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.5c3de58594e8.I6e6f8707eb66a2b540fb19303c06393f13e1f68e@changeid
2025-10-28 16:17:26 +02:00
Johannes Berg
38f7950619 wifi: iwlwifi: mld: include raw PHY notification in radiotap
This is useful for debugging and can also be used to see
anything that isn't encoded in radiotap (yet.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.5fe26e9061f2.Iefb45e3a6a2a62ff3247db4de3777059d390af95@changeid
2025-10-28 16:17:26 +02:00
Johannes Berg
92e87cee46 wifi: iwlwifi: mld: update to new sniffer API
This will break current sniffer functionality for firmware
versions that don't have the new API, but supporting both
would be very complex. Convert the code to use only the new
sniffer notification.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019114304.85b75a084a2f.I4a18b2043703c1f9a8f55c108dcaaeca7891e19c@changeid
2025-10-28 16:17:26 +02:00
Miri Korenblit
d676e01357 wifi: iwlwifi: mld: set wiphy::iftype_ext_capab dynamically
Instead of having a static const array for each possible combination of
features, build the extended capabilities dynamically.

With this we will also stop setting EHT capabilities when it might
actually be disabled.

Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250915113137.b3c03b56d5a3.I38eaf8ebaf3256e78b4643bef7e3a54aeb4989df@changeid
2025-10-28 16:17:26 +02:00
Emmanuel Grumbach
355431679a wifi: iwlwifi: mld: support get/set_antenna
This allows to set the antennas from user space.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250915113137.5a45baf9513c.I5912e6b6d9a9ae6530d0ac45e9517d07f98b8d05@changeid
2025-10-28 16:17:26 +02:00
Miri Korenblit
d852e72d94 wifi: iwlwifi: mvm: cleanup unsupported phy command versions
The iwlmvm op mode is used for pre-EHT hardware. That hardware doesn't have
wider OFDMA, so phy command versions 5+ (which added support for wider
OFDMA) are not supported. Remove support for them.

This means that we also don't need to set the
IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW, as we don't care about the ap
chandef anyway.

Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250915113137.e7e26fe71132.I8ec95ff86521661118782ecee1be20ef6e8e48e1@changeid
2025-10-28 16:17:26 +02:00
Johannes Berg
58a4ebe316 wifi: iwlwifi: fix remaining kernel-doc warnings
Fix the remaining kernel-doc warnings across the driver.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250915113137.f94b6d4ef142.I91007eed4cf37125ca7a012f2021615b4fa9eb66@changeid
2025-10-28 16:17:21 +02:00
Johannes Berg
ef56bbed4c wifi: iwlwifi: tests: check listed PCI IDs have configs
Add a test that checks, for the old pre-CNVI devices, that
PCI IDs listed in the PCI IDs table will also match in the
config table. Newer ones we test against our database of
devices, but the current database doesn't go back that far,
so at least this checks against the PCI IDs the driver has.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250915113137.eb728b270d46.Ie5754f4201b807eb0d55feb339a728fc0b42e8bf@changeid
2025-10-28 16:05:18 +02:00
Johannes Berg
30d47d8fe7 wifi: iwlwifi: cfg: add new device names
Add a couple of device names so that these new devices will
be shown correctly.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250915113137.1cbc0251532f.I6183a6a08a7998e598042a50c7d7a6b82f9fa58e@changeid
2025-10-28 16:05:18 +02:00
Miri Korenblit
ba85816979 wifi: iwlwifi: iwlmld is always used for wifi7 devices
iwlmld has been used since API 97 for wifi7 devices.
Since APIs < 97 are no longer supported on such devices,
we can remove the API check and always load iwlmld for the wifi7
devices.

Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250915113137.45ab33fcdc00.Ia3a40b687b75c872cf7e7a19331a014bccf5f2d6@changeid
2025-10-28 16:05:18 +02:00
Johannes Berg
eade5cacc9 wifi: iwlwifi: mvm: move rate conversions to utils.c
These aren't really related to rate scaling, they're just
firmware API functions. Try to keep rs.c more for scaling
and move these out.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250915113137.547129c7732e.I12c40876537722680d069b4bb5fc058206ba63d4@changeid
2025-10-28 16:05:17 +02:00
Miri Korenblit
5c0251598f wifi: iwlwifi: mld: reschedule check_tpt_wk also not in EMLSR
When the throughput count reaches the threshold, EMLSR is no longer
blocked by throughput.
This doesn't mean that EMLSR will be activated immediately, since there
might be other reasons that block EMLSR.

When the throughput blocker is not set, check_tpt_wk should run every 5
seconds and check if the throughput blocker should be set (if the
throughput counter dropped).
If not, it should reschedule itself.

In the current code, the worker reschedules itself only if we are in
EMLSR. This is wrong, since we might be in a case where the throughput
blocker is not set but we are not in EMLSR, and then we will never check
the throughput counters again (and block EMLSR if needed).

Fix this by rescheduling the worker also when EMLSR is not active.

Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250915113137.2a9cf2b2529d.I8284c0da9597e4c963e38ae133384f6f42044499@changeid
2025-10-28 16:05:17 +02:00
Miri Korenblit
9dc6e9dfdf wifi: iwlwifi: mld: remove support for alive notif version 6
The last FW API that supports version 6 is 97. Since this API is no
longer supported on any device that loads iwlmld, we can remove support
for it.

Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250915113137.4923c981b0bf.Iff598c6d109fdbf0d5a5bab59d53485478ecc125@changeid
2025-10-28 16:05:17 +02:00
Miri Korenblit
37ff03b356 wifi: iwlwifi: align the name of iwl_alive_ntf_v6 to the convention
This struct rperesent versions 6 and 7. The convention is to name an API
struct with the last version it represent, so rename to
iwl_alive_ntf_v7.

Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250915113137.81240e1d4df3.I2c1264a49b9f0fc160f960cf3c5dc4cedf6ceb6d@changeid
2025-10-28 16:05:17 +02:00
Chaitanya Kulkarni
e48886b9d6 blktrace: for ftrace use correct trace format ver
The ftrace blktrace path allocates buffers and writes trace events but
was using the wrong recording function. After
commit 4d8bc7bd4f ("blktrace: move ftrace blk_io_tracer to blk_io_trace2"),
the ftrace interface was moved to use blk_io_trace2 format, but
__blk_add_trace() still called record_blktrace_event() which writes in
blk_io_trace (v1) format.

This causes critical data corruption:

- blk_io_trace (v1) has 32-bit 'action' field at offset 28
- blk_io_trace2 (v2) has 32-bit 'pid' at offset 28 and 64-bit 'action'
  at offset 32
- When record_blktrace_event() writes to a v2 buffer:
  * Writing pid (offset 32 in v1) corrupts the v2 action field
  * Writing action (offset 28 in v1) corrupts the v2 pid field
  * The 64-bit action is truncated to 32-bit via lower_32_bits()

Fix by:
1. Adding version switch to select correct format (v1 vs v2)
2. Calling the appropriate recording function based on the version (see the sketch below)
3. Defaulting to v2 for ftrace (as intended by commit 4d8bc7bd4f)
4. Adding WARN_ONCE for unexpected version values
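
A rough sketch of that dispatch (the v2 function name is hypothetical;
this is not the literal kernel diff):

  switch (version) {
  case 1:
          /* legacy 48-byte blk_io_trace record */
          record_blktrace_event(t, pid, action);
          break;
  case 2:
          /* 64-byte blk_io_trace2 record with a 64-bit action field */
          record_blktrace_event2(t, pid, action);
          break;
  default:
          WARN_ONCE(1, "unexpected blktrace version %u\n", version);
          break;
  }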

Without this patch :-
linux-block (for-next) # sh reproduce_blktrace_bug.sh
              dd-14242   [033] d..1.  3903.022308: Unknown action 36a2
              dd-14242   [033] d..1.  3903.022333: Unknown action 36a2
              dd-14242   [033] d..1.  3903.022365: Unknown action 36a2
              dd-14242   [033] d..1.  3903.022366: Unknown action 36a2
              dd-14242   [033] d..1.  3903.022369: Unknown action 36a2

The action field is corrupted because:
  - ftrace allocated blk_io_trace2 buffer (64 bytes)
  - But called record_blktrace_event() (writes v1, 48 bytes)
  - Field offsets don't match, causing corruption

The hex value shown (0x36a2) is actually a PID, not an action code!

linux-block (for-next) #
With this patch :-
linux-block (for-next) # sh reproduce_blktrace_bug.sh
Trace output looks correct:

              dd-2420    [019] d..1.    59.641742: 251,0    Q  RS 0 + 8 [dd]
              dd-2420    [019] d..1.    59.641775: 251,0    G  RS 0 + 8 [dd]
              dd-2420    [019] d..1.    59.641784: 251,0    P   N [dd]
              dd-2420    [019] d..1.    59.641785: 251,0    U   N [dd] 1
              dd-2420    [019] d..1.    59.641788: 251,0    D  RS 0 + 8 [dd]

Fixes: 4d8bc7bd4f ("blktrace: move ftrace blk_io_tracer to blk_io_trace2")
Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-28 07:56:06 -06:00
Chaitanya Kulkarni
4a0940bdca blktrace: use debug print to report dropped events
The WARN_ON_ONCE introduced in
commit f9ee38bbf7 ("blktrace: add block trace commands for zone operations")
triggers kernel warnings when zone operations are traced with blktrace
version 1. This can spam the kernel log during normal operation with
zoned block devices when userspace is using the legacy blktrace
protocol.

Currently the blktrace implementation drops the newly added REQ_OP_ZONE_XXX
operations when the blktrace userspace version is set to 1.

Remove the WARN_ON_ONCE and quietly filter these events. Add a
rate-limited debug message to help diagnose potential issues without
flooding the kernel log. The debug message can be enabled via dynamic
debug when needed for troubleshooting.

This approach is more appropriate as encountering zone operations with
blktrace v1 is an expected condition that should be handled gracefully
rather than warned about, since users may be running older blktrace
userspace tools that only support version 1 of the protocol.

With this patch :-
linux-block (for-next) # git log -1
commit c8966006a0971d2b4bf94c0426eb7e4407c6853f (HEAD -> for-next)
Author: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Date:   Mon Oct 27 19:26:53 2025 -0700

    blktrace: use debug print to report dropped events
linux-block (for-next) # cdblktests
blktests (master) # ./check blktrace
blktrace/001 (blktrace zone management command tracing)      [passed]
    runtime  3.805s  ...  3.889s
blktests (master) # dmesg  -c
blktests (master) #  echo "file kernel/trace/blktrace.c +p" > /sys/kernel/debug/dynamic_debug/control
blktests (master) # ./check blktrace
blktrace/001 (blktrace zone management command tracing)      [passed]
    runtime  3.889s  ...  3.881s
blktests (master) # dmesg  -c
[   77.826237] blktrace: blktrace v1 cannot trace zone operation 0x1000190001
[   77.826260] blktrace: blktrace v1 cannot trace zone operation 0x1000190004
[   77.826282] blktrace: blktrace v1 cannot trace zone operation 0x1001490007
[   77.826288] blktrace: blktrace v1 cannot trace zone operation 0x1001890008
[   77.826343] blktrace: blktrace v1 cannot trace zone operation 0x1000190001
[   77.826347] blktrace: blktrace v1 cannot trace zone operation 0x1000190004
[   77.826350] blktrace: blktrace v1 cannot trace zone operation 0x1001490007
[   77.826354] blktrace: blktrace v1 cannot trace zone operation 0x1001890008
[   77.826373] blktrace: blktrace v1 cannot trace zone operation 0x1000190001
[   77.826377] blktrace: blktrace v1 cannot trace zone operation 0x1000190004
blktests (master) #  echo "file kernel/trace/blktrace.c -p" > /sys/kernel/debug/dynamic_debug/control
blktests (master) # ./check blktrace
blktrace/001 (blktrace zone management command tracing)      [passed]
    runtime  3.881s  ...  3.824s
blktests (master) # dmesg  -c
blktests (master) #

Reported-by: syzbot+153e64c0aa875d7e4c37@syzkaller.appspotmail.com
Fixes: f9ee38bbf7 ("blktrace: add block trace commands for zone operations")
Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-28 07:55:40 -06:00
Alexandra Winter
968822086b dibs: Use subsys_initcall()
In the case of built-in modules, the order of module_init() calls is
derived from the Makefiles.

Use subsys_initcall() for the dibs module, to make sure dibs_init() is
executed before dibs clients like smc and dibs devices like ism are
initialized. This way, future dibs client or dibs device modules can use
module_init() without the risk of getting the order in the Makefiles wrong.
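
Roughly, the change boils down to switching the initcall level (sketch,
not the literal diff):

  #include <linux/init.h>

  static int __init dibs_init(void)
  {
          /* register the dibs core before any client or device driver */
          return 0;
  }

  /* subsys_initcall() runs at an earlier initcall level than the default
   * module_init()/device_initcall() for built-in code. */
  subsys_initcall(dibs_init);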

Reported-by: Mete Durlu <meted@linux.ibm.com>
Signed-off-by: Alexandra Winter <wintera@linux.ibm.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://patch.msgid.link/20251023150636.3995476-2-wintera@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 13:42:31 +01:00
Alexandra Winter
182663bbff dibs: Remove reset of static vars in dibs_init()
'clients' and 'max_client' are static variables and therefore don't need to
be initialized.

Reported-by: Mete Durlu <meted@linux.ibm.com>
Signed-off-by: Alexandra Winter <wintera@linux.ibm.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Link: https://patch.msgid.link/20251023150636.3995476-1-wintera@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 13:42:31 +01:00
Paolo Abeni
51f322550b Merge branch 'net-mlx5-add-balance-id-support-for-lag-multiplane-groups'
Tariq Toukan says:

====================
net/mlx5: Add balance ID support for LAG multiplane groups

This series adds balance ID support for MLX5 LAG in multiplane
configurations.

See detailed description by Mark below [1].

[1]
The problem: In complex multiplane LAG setups, we need finer control over LAG
groups. Currently, devices with the same system image GUID are treated
identically, but hardware now supports per-multiplane-group balance IDs that
let us differentiate between them. On such systems the system image GUID
isn't enough to decide which devices should be part of which LAG.

The solution: Extend the system image GUID with a balance ID byte when the
hardware supports it. This gives us the granularity we need without breaking
existing deployments.

What this series does:

1. Clean up some duplicate code while we're here
2. Rework the system image GUID infrastructure to handle variable lengths
3. Update PTP clock pairing to use the new approach
4. Restructure capability setting to make room for the new feature
5. Actually implement the balance ID support

The key insight is in patch 5: we only append the balance ID when both
capabilities are present, so older hardware and software continue to work
exactly as before. For newer setups, you get the extra byte that enables
per-multiplane-group load balancing.

This has been tested with both old and new hardware configurations.
====================

Link: https://patch.msgid.link/1761211020-925651-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 11:11:39 +01:00
Mark Bloch
20d78ead94 net/mlx5: Add balance ID support for LAG multiplane groups
Implement balance ID support for multiplane LAG configurations. This
feature enables per-multiplane group load balancing by extending the
software system image GUID with a balance ID component.

Key implementations:
- Enable lag_per_mp_group capability when supported by hardware.
- Append load_balance_id to software system image GUID when conditions
  are met.
- Increase MLX5_SW_IMAGE_GUID_MAX_BYTES from 8 to 9 to accommodate the
  extra byte.

The balance ID is appended to the system image GUID only when both
load_balance_id and lag_per_mp_group capabilities are available, ensuring
backward compatibility while enabling enhanced LAG functionality.
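
Conceptually (names follow the commit text, not the literal driver code):

  /* Extend the 8-byte system image GUID with the balance-ID byte only
   * when both capabilities are reported, so older setups keep the plain
   * GUID and keep matching exactly as before. */
  memcpy(sw_guid, hw_system_image_guid, 8);
  len = 8;
  if (cap_load_balance_id && cap_lag_per_mp_group)
          sw_guid[len++] = load_balance_id;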

This enhancement allows for more granular load balancing control in complex
multi-plane LAG deployments, improving network performance and flexibility.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drori <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761211020-925651-6-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 11:11:27 +01:00
Mark Bloch
075e85a126 net/mlx5: Refactor HCA cap 2 setting
Refactor HCA capability 2 setting logic to be more structured and
conditional. Move the sw_vhca_id_valid setting inside proper conditional
checks and prepare the function for additional capability settings.

The refactoring:
- Always copy current capabilities to set_hca_cap buffer.
- Apply sw_vhca_id_valid setting only when conditions are met.
- Improve code readability and maintainability.

This cleanup prepares the handle_hca_cap_2() function for the upcoming
balance ID capability setting.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drori <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761211020-925651-5-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 11:11:27 +01:00
Mark Bloch
cd36818c34 net/mlx5: Refactor PTP clock devcom pairing
Refactor PTP clock device component pairing to use the clock identity
buffer instead of casting it to a u64 key. This change leverages the new
software system image GUID infrastructure.

Changes include:
- Pass identity buffer to mlx5_shared_clock_register().
- Use memcpy for identity buffer in devcom matching attributes.
- Remove intermediate u64 key conversion.
- Add BUILD_BUG_ON to ensure identity size fits in match key.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Shay Drori <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761211020-925651-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 11:11:27 +01:00
Mark Bloch
7718f2a8b8 net/mlx5: Add software system image GUID infrastructure
Replace direct hardware system image GUID usage with a new software
system image GUID function that supports variable-length identifiers.

Key changes:
- Add mlx5_query_nic_sw_system_image_guid() function with length parameter.
- Update all callsites to use the new function and buffer/length approach.
- Modify mapping contexts to use byte arrays instead of u64 keys.
- Update devcom matching to support variable-length keys.
- Change mlx5_same_hw_devs() to use buffer comparison instead of u64.

This refactoring prepares the infrastructure for balance ID support,
which requires extending the system image GUID with additional data.
The change maintains backward compatibility while enabling future
enhancements.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Shay Drori <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761211020-925651-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 11:11:27 +01:00
Mark Bloch
211de28b1c net/mlx5: Use common mlx5_same_hw_devs function
Refactor duplicate hardware device comparison code to use the common
mlx5_same_hw_devs() function instead of reimplementing system GUID
comparison logic in multiple places.

This cleanup eliminates code duplication in:
- Bridge representor device comparison.
- TC hardware device comparison.

Using the centralized function improves maintainability and ensures
consistent behavior across the driver.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Shay Drori <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1761211020-925651-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 11:11:27 +01:00
Paolo Abeni
0bc4059cc5 Merge branch 'implement-more-features-for-txgbe-devices'
Jiawen Wu says:

====================
Implement more features for txgbe devices

Based on the features supported by the hardware, implement RX desc merge and
TX head write-back for AML devices, and support RSC offload for AML and SP
devices.
====================

Link: https://patch.msgid.link/20251023014538.12644-1-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 10:44:19 +01:00
Jiawen Wu
eaed177706 net: txgbe: support RSC offload
Support enabling and disabling RSC for txgbe devices.

Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251023014538.12644-4-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 10:44:17 +01:00
Jiawen Wu
eb57b16d90 net: txgbe: support TX head write-back mode
TX head write-back mode is supported on AML devices. When it is enabled,
the hardware no longer writes the DD bit of each descriptor one by one, but
instead writes the pointer of the completion descriptor back to the head_wb
address.

Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251023014538.12644-3-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 10:44:17 +01:00
Jiawen Wu
a71e367773 net: txgbe: support RX desc merge mode
RX descriptor merge mode is supported on AML devices. When it is
enabled, the hardware processes the RX descriptors in batches.

Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251023014538.12644-2-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-28 10:44:16 +01:00
Steffen Klassert
7197e080de Merge branch 'xfrm: IPsec hardware offload performance improvements'
Jianbo Liu says:

====================
This patch series optimizes IPsec crypto offload performance by
addressing a lock contention bottleneck that appears when RSS is used.

The first patch refactors xfrm_input() to avoid a costly
unlock/relock cycle.

The second patch builds on this by removing a redundant replay check,
which is unnecessary for the synchronous hardware path.
====================

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-10-28 07:46:28 +01:00
Xu Kuohai
f9db3a3822 selftests/bpf/benchs: Add overwrite mode benchmark for BPF ring buffer
Add a --rb-overwrite option to benchmark the BPF ring buffer in overwrite mode.
Since overwrite mode is not yet supported by libbpf on the consumer side, also
add a --rb-bench-producer option to benchmark the producer directly without a
consumer.

Benchmarks on an x86_64 and an arm64 CPU are shown below for reference.

- AMD EPYC 9654 (x86_64)

Ringbuf, multi-producer contention in overwrite mode, no consumer
=================================================================
rb-prod nr_prod 1    32.180 ± 0.033M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 2    9.617 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 3    8.810 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 4    9.272 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 8    9.173 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 12   3.086 ± 0.032M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 16   2.945 ± 0.021M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 20   2.519 ± 0.021M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 24   2.545 ± 0.021M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 28   2.363 ± 0.024M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 32   2.357 ± 0.021M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 36   2.267 ± 0.011M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 40   2.284 ± 0.020M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 44   2.215 ± 0.025M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 48   2.193 ± 0.023M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 52   2.208 ± 0.024M/s (drops 0.000 ± 0.000M/s)

- HiSilicon Kunpeng 920 (arm64)

Ringbuf, multi-producer contention in overwrite mode, no consumer
=================================================================
rb-prod nr_prod 1    14.478 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 2    21.787 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 3    6.045 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 4    5.352 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 8    4.850 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 12   3.542 ± 0.016M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 16   3.509 ± 0.021M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 20   3.171 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 24   3.154 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 28   2.974 ± 0.015M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 32   3.167 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 36   2.903 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 40   2.866 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 44   2.914 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 48   2.806 ± 0.012M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 52   2.840 ± 0.012M/s (drops 0.000 ± 0.000M/s)

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251018035738.4039621-4-xukuohai@huaweicloud.com
2025-10-27 19:47:32 -07:00
Xu Kuohai
8f7a86ecde selftests/bpf: Add overwrite mode test for BPF ring buffer
Add overwrite mode test for BPF ring buffer. The test creates a BPF ring
buffer in overwrite mode, then repeatedly reserves and commits records
to check if the ring buffer works as expected both before and after
overwriting occurs.

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251018035738.4039621-3-xukuohai@huaweicloud.com
2025-10-27 19:46:32 -07:00
Xu Kuohai
feeaf1346f bpf: Add overwrite mode for BPF ring buffer
When the BPF ring buffer is full, a new event cannot be recorded until one
or more old events are consumed to make enough space for it. In cases such
as fault diagnostics, where recent events are more useful than older ones,
this mechanism may lead to critical events being lost.

So add overwrite mode for BPF ring buffer to address it. In this mode, the
new event overwrites the oldest event when the buffer is full.

The basic idea is as follows:

1. producer_pos tracks the next position to record a new event. When there
   is enough free space, producer_pos is simply advanced by the producer to
   make space for the new event.

2. To avoid waiting for the consumer when the buffer is full, a new variable,
   overwrite_pos, is introduced for the producer. It points to the oldest event
   committed in the buffer. It is advanced by the producer to discard one or
   more of the oldest events to make space for the new event when the buffer
   is full.

3. pending_pos tracks the oldest event yet to be committed. producer_pos never
   wraps past pending_pos, so multiple producers never write to the same
   position at the same time.

The following example diagrams show how it works in a 4096-byte ring buffer.

1. At first, {producer,overwrite,pending,consumer}_pos are all set to 0.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                                                                       |
   |                                                                       |
   |                                                                       |
   +-----------------------------------------------------------------------+
   ^
   |
   |
producer_pos = 0
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

2. Now reserve a 512-byte event A.

   There is enough free space, so A is allocated at offset 0. And producer_pos
   is advanced to 512, the end of A. Since A is not submitted, the BUSY bit is
   set.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                                                              |
   |   A    |                                                              |
   | [BUSY] |                                                              |
   +-----------------------------------------------------------------------+
   ^        ^
   |        |
   |        |
   |    producer_pos = 512
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

3. Reserve event B, size 1024.

   B is allocated at offset 512 with BUSY bit set, and producer_pos is advanced
   to the end of B.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                            |
   |   A    |        B        |                                            |
   | [BUSY] |      [BUSY]     |                                            |
   +-----------------------------------------------------------------------+
   ^                          ^
   |                          |
   |                          |
   |                   producer_pos = 1536
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

4. Reserve event C, size 2048.

   C is allocated at offset 1536, and producer_pos is advanced to 3584.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   | [BUSY] |      [BUSY]     |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^                                                              ^
   |                                                              |
   |                                                              |
   |                                                    producer_pos = 3584
   |
overwrite_pos = 0
pending_pos = 0
consumer_pos = 0

5. Submit event A.

   The BUSY bit of A is cleared. B becomes the oldest event to be committed, so
   pending_pos is advanced to 512, the start of B.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   |        |      [BUSY]     |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^        ^                                                     ^
   |        |                                                     |
   |        |                                                     |
   |   pending_pos = 512                                  producer_pos = 3584
   |
overwrite_pos = 0
consumer_pos = 0

6. Submit event B.

   The BUSY bit of B is cleared, and pending_pos is advanced to the start of C,
   which is now the oldest event to be committed.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |        |                 |                                   |        |
   |    A   |        B        |                 C                 |        |
   |        |                 |               [BUSY]              |        |
   +-----------------------------------------------------------------------+
   ^                          ^                                   ^
   |                          |                                   |
   |                          |                                   |
   |                     pending_pos = 1536               producer_pos = 3584
   |
overwrite_pos = 0
consumer_pos = 0

7. Reserve event D, size 1536 (3 * 512).

   There are 2048 bytes that no producer is writing between producer_pos (currently
   3584, wrapping around the buffer) and pending_pos, so D is allocated at offset
   3584, and producer_pos is advanced by 1536 (from 3584 to 5120).

   Since event D will overwrite all bytes of event A and the first 512 bytes of
   event B, overwrite_pos is advanced to the start of event C, the oldest event
   that is not overwritten.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                 |        |                                   |        |
   |      D End      |        |                 C                 | D Begin|
   |      [BUSY]     |        |               [BUSY]              | [BUSY] |
   +-----------------------------------------------------------------------+
   ^                 ^        ^
   |                 |        |
   |                 |   pending_pos = 1536
   |                 |   overwrite_pos = 1536
   |                 |
   |             producer_pos=5120
   |
consumer_pos = 0

8. Reserve event E, size 1024.

   Although there are 512 bytes that no producer is writing between producer_pos
   and pending_pos, E cannot be reserved, as it would overwrite the first 512
   bytes of event C, which is still being written.

9. Submit event C and D.

   pending_pos is advanced to the end of D.

   0       512      1024    1536     2048     2560     3072     3584       4096
   +-----------------------------------------------------------------------+
   |                 |        |                                   |        |
   |      D End      |        |                 C                 | D Begin|
   |                 |        |                                   |        |
   +-----------------------------------------------------------------------+
   ^                 ^        ^
   |                 |        |
   |                 |   overwrite_pos = 1536
   |                 |
   |             producer_pos=5120
   |             pending_pos=5120
   |
consumer_pos = 0
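
To make the reservation rule above concrete, here is a minimal C sketch of the
overwrite-mode space check. It is only an illustration of the positions
described in this message, not the actual kernel implementation; the function
name is made up, and the BUSY-bit handling and position advancement are
omitted.

  #include <stdbool.h>
  #include <stdint.h>

  /* Can a record of rec_size bytes be reserved? producer_pos may wrap
   * past overwrite_pos (discarding old committed events), but it must
   * never wrap past pending_pos, because that region still belongs to
   * records some producer is writing.
   */
  static bool rb_overwrite_can_reserve(uint64_t producer_pos,
                                       uint64_t pending_pos,
                                       uint64_t rb_size,
                                       uint64_t rec_size)
  {
          return producer_pos + rec_size - pending_pos <= rb_size;
  }

With the values from step 8 above (producer_pos = 5120, pending_pos = 1536,
rb_size = 4096, rec_size = 1024) the check fails, which is why event E cannot
be reserved.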

The performance data for overwrite mode will be provided in a follow-up
patch that adds overwrite-mode benchmarks.

A sample of performance data for non-overwrite mode, collected on an x86_64
CPU and an arm64 CPU, before and after this patch, is shown below. As we can
see, no obvious performance regression occurs.

- x86_64 (AMD EPYC 9654)

Before:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.623 ± 0.027M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  15.812 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  7.871 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  6.703 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  2.896 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 2.054 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 1.864 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 1.580 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 1.484 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 1.369 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 1.316 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 1.272 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 1.239 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 1.226 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 1.213 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 1.193 ± 0.001M/s (drops 0.000 ± 0.000M/s)

After:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.845 ± 0.036M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  15.889 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  8.155 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  6.708 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  2.918 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 2.065 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 1.870 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 1.582 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 1.482 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 1.372 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 1.323 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 1.264 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 1.236 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 1.209 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 1.189 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 1.165 ± 0.002M/s (drops 0.000 ± 0.000M/s)

- arm64 (HiSilicon Kunpeng 920)

Before:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.310 ± 0.623M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  9.947 ± 0.004M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  6.634 ± 0.011M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  4.502 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  3.888 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 3.372 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 3.189 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 2.998 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 3.086 ± 0.018M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 2.845 ± 0.004M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 2.815 ± 0.008M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 2.771 ± 0.009M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 2.814 ± 0.011M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 2.752 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 2.695 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 2.710 ± 0.006M/s (drops 0.000 ± 0.000M/s)

After:

Ringbuf, multi-producer contention
==================================
rb-libbpf nr_prod 1  11.283 ± 0.550M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 2  9.993 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 3  6.898 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 4  5.257 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 8  3.830 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 12 3.528 ± 0.013M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 16 3.265 ± 0.018M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 20 2.990 ± 0.007M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 24 2.929 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 28 2.898 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 32 2.818 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 36 2.789 ± 0.012M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 40 2.770 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 44 2.651 ± 0.007M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 48 2.669 ± 0.005M/s (drops 0.000 ± 0.000M/s)
rb-libbpf nr_prod 52 2.695 ± 0.009M/s (drops 0.000 ± 0.000M/s)

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251018035738.4039621-2-xukuohai@huaweicloud.com
2025-10-27 19:42:39 -07:00
Jui-Peng Tsai
a62b654125 wifi: rtw89: improve scan time on 6 GHz band
Reduce scan time for all supported channels from 4.4s to 3.5s.

If NL80211_SCAN_FLAG_COLOCATED_6GHZ is set in the scan request, only scan PSC
channels and the channels from the RNR element found on the 2.4/5 GHz
channels. When the firmware supports parsing the RNR element from a received
beacon or probe response, offload the decision about non-PSC channels to the
firmware; the driver does not need to add non-PSC channels to the scan list.
If NL80211_SCAN_FLAG_COLOCATED_6GHZ is not set, scan all supported channels.

Signed-off-by: Jui-Peng Tsai <emma_tsai@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251021133402.15467-9-pkshih@realtek.com
2025-10-28 09:52:57 +08:00
Chin-Yen Lee
e139b1c1f0 wifi: rtw89: restart hardware to recover firmware if power-save becomes abnormal
If power-save related functions fail, such as failing to send the null
packet or getting no response from firmware, WiFi will become unstable.
Actively trigger the SER function to reset the firmware/driver and
recover from abnormal states, including:

 - firmware failed to ACK entering PS mode
 - firmware failed to ACK leaving PS mode
 - the check that the PS H2C command was received by firmware failed
 - failed to leave PS state

Signed-off-by: Chin-Yen Lee <timlee@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251021133402.15467-8-pkshih@realtek.com
2025-10-28 09:51:28 +08:00
Kuan-Chung Chen
a48ae54a67 wifi: rtw89: 8852c: fix ADC oscillation in 160MHz affecting RX performance
When operating in 160 MHz, the ADC may oscillate and affect AGC, leading
to unstable RX quality. This issue can be resolved by ensuring proper
RF filter bandwidth switching to avoid ADC oscillation.

Signed-off-by: Kuan-Chung Chen <damon.chen@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251021133402.15467-7-pkshih@realtek.com
2025-10-28 09:51:16 +08:00
Zong-Zhe Yang
0ac5ead00b wifi: rtw89: regd: apply ACPI policy even if country code is programmed
There are regulatory-related policies based on the BIOS/ACPI configuration,
e.g. a distro decides to disable some bands or some channels. Even if the chip
has a programmed country code, these policies should still be applied within
the regulatory notifier.

Signed-off-by: Zong-Zhe Yang <kevin_yang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251021133402.15467-6-pkshih@realtek.com
2025-10-28 09:49:49 +08:00
Zong-Zhe Yang
438c9178cd wifi: rtw89: support EHT rate pattern via bitrate mask
When setting the bitrate mask, e.g. using 'iw set bitrates', there are some
designated patterns to be recognized, called rate patterns. When a rate
pattern is matched, treat the setting as requesting a fixed rate. Now, add
support for recognizing EHT rates.

Signed-off-by: Zong-Zhe Yang <kevin_yang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251021133402.15467-5-pkshih@realtek.com
2025-10-28 09:49:36 +08:00
Zong-Zhe Yang
e79382ab03 wifi: rtw89: mlo: handle needed H2C when link switching is requested by stack
To switch links, FW needs H2C commands to indicate which link is on or off.
Originally, these H2C commands were considered only when the link switching
was initiated by the driver. But in some cases, e.g. ml_reconf or TTLM, link
switching can be initiated by the stack. Hence, plumb these H2C commands into
the ieee80211_ops callbacks.

Signed-off-by: Zong-Zhe Yang <kevin_yang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251021133402.15467-4-pkshih@realtek.com
2025-10-28 09:48:06 +08:00
Ping-Ke Shih
f44a9b14a7 wifi: rtw89: use skb_dequeue() for queued ROC packets to prevent racing
The TX task can enqueue a ROC skb while other tasks dequeue it. Using
skb_queue_walk_safe() without locking can cause races.
Use skb_dequeue(), which takes the queue lock, instead.
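
A generic sketch of the locked-dequeue pattern (illustrative only; the queue
name and the handler are placeholders, not rtw89 symbols):

  struct sk_buff *skb;

  /* skb_dequeue() takes the queue spinlock internally, so concurrent
   * enqueues from the TX task are safe.
   */
  while ((skb = skb_dequeue(&roc_queue)) != NULL)
          handle_roc_skb(skb);    /* placeholder for the ROC handler */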

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251021133402.15467-3-pkshih@realtek.com
2025-10-28 09:47:55 +08:00
Ping-Ke Shih
b47d748110 wifi: rtw89: splice C2H events queue to local to prevent racing
The RX task enqueues C2H events and forks a C2H work to handle them, but
the work uses skb_queue_walk_safe() without a lock, causing potential
races. Use skb_queue_splice() and its friends with the spin lock held to
splice the queue to a local one, and then still use skb_queue_walk_safe()
to iterate all events.
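
A generic sketch of the splice-to-local pattern (illustrative only; the queue
name and the handler are placeholders, not rtw89 symbols):

  struct sk_buff_head local;
  struct sk_buff *skb, *tmp;
  unsigned long flags;

  __skb_queue_head_init(&local);

  /* Move all queued C2H events to a private list under the queue lock. */
  spin_lock_irqsave(&c2h_queue.lock, flags);
  skb_queue_splice_init(&c2h_queue, &local);
  spin_unlock_irqrestore(&c2h_queue.lock, flags);

  /* The local list is not shared, so it can be walked without the lock. */
  skb_queue_walk_safe(&local, skb, tmp) {
          __skb_unlink(skb, &local);
          handle_c2h_event(skb);          /* placeholder for the handler */
          dev_kfree_skb_any(skb);
  }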

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20251021133402.15467-2-pkshih@realtek.com
2025-10-28 09:45:35 +08:00
Lad Prabhakar
19ab0a22ef dt-bindings: net: phy: vsc8531: Convert to DT schema
Convert the VSC8531 Gigabit Ethernet PHY binding to DT schema format. While
at it, add a compatible string for the VSC8541 PHY, which is very similar
to the VSC8531 PHY and is already supported in the kernel. The VSC8541 PHY
is present on the Renesas RZ/T2H EVK.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://patch.msgid.link/20251025064850.393797-1-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:21:32 -07:00
Jens Axboe
101e596e74 io_uring/fdinfo: cap SQ iteration at max SQ entries
A previous commit changed the logic around how SQ entries are iterated,
and as a result introduced a few bugs. One is that it fully trusts the SQ
head and tail, which are exposed to user space. Another is that it fails to
increment the SQ head if the SQ index is out of range.

Fix both of those up, reverting to the previous logic of how to
iterate SQ entries.

Link: https://lore.kernel.org/io-uring/68ffdf18.050a0220.3344a1.039e.GAE@google.com/
Fixes: 1cba30bf9f ("io_uring: add support for IORING_SETUP_SQE_MIXED")
Reported-by: syzbot+10a9b495f54a17b607a6@syzkaller.appspotmail.com
Tested-by: syzbot+10a9b495f54a17b607a6@syzkaller.appspotmail.com
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-27 19:19:13 -06:00
Eric Dumazet
0ae1ac7335 tcp: remove one ktime_get() from recvmsg() fast path
Each time some payload is consumed by user space (recvmsg() and friends),
TCP calls tcp_rcv_space_adjust() to run DRS algorithm to check
if an increase of sk->sk_rcvbuf is needed.

This function is based on time sampling, and currently calls
tcp_mstamp_refresh(tp), which is a wrapper around ktime_get_ns().

ktime_get_ns() has a high cost on some platforms.
100+ cycles for rdtscp on AMD EPYC Turin for instance.

We do not have to refresh tp->tcp_mstamp; using the last cached value
is enough. We only need to refresh it from __tcp_cleanup_rbuf()
if an ACK must be sent (this is a rare event).
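
A rough sketch of the idea in tcp_rcv_space_adjust(), simplified for
illustration (this is not the actual diff; only the surrounding names are
taken from the existing TCP code as I understand it):

  static void tcp_rcv_space_adjust(struct sock *sk)
  {
          struct tcp_sock *tp = tcp_sk(sk);
          int time;

          /* Before: tcp_mstamp_refresh(tp) was called here, i.e. one
           * ktime_get_ns() per recvmsg(). After: rely on the cached
           * tp->tcp_mstamp, refreshed from __tcp_cleanup_rbuf() only
           * when an ACK has to be sent.
           */
          time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);

          /* ... remainder of the DRS algorithm is unchanged ... */
  }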

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251024120707.3516550-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:15:38 -07:00
Yue Haibing
6f147c8328 net/sched: Remove unused typedef psched_tdiff_t
Since commit 051d442098 ("net/sched: Retire CBQ qdisc")
this is not used anymore.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://patch.msgid.link/20251024025145.4069583-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:05:54 -07:00
Jakub Kicinski
c5a644d254 Merge branch 'sctp-avoid-redundant-initialisation-in-sctp_accept-and-sctp_do_peeloff'
Kuniyuki Iwashima says:

====================
sctp: Avoid redundant initialisation in sctp_accept() and sctp_do_peeloff().

When sctp_accept() and sctp_do_peeloff() allocate a new socket,
somehow sk_alloc() is used, and the new socket goes through full
initialisation, but most of the fields are overwritten later.

  1)
  sctp_accept()
  |- sctp_v[46]_create_accept_sk()
  |  |- sk_alloc()
  |  |- sock_init_data()
  |  |- sctp_copy_sock()
  |  `- newsk->sk_prot->init() / sctp_init_sock()
  |
  `- sctp_sock_migrate()
     `- sctp_copy_descendant(newsk, oldsk)

  sock_init_data() initialises struct sock, but many fields are
  overwritten by sctp_copy_sock(), which inherits fields of struct
  sock and inet_sock from the parent socket.

  sctp_init_sock() fully initialises struct sctp_sock, but later
  sctp_copy_descendant() inherits most fields from the parent's
  struct sctp_sock by memcpy().

  2)
  sctp_do_peeloff()
  |- sock_create()
  |  |
  |  ...
  |      |- sk_alloc()
  |      |- sock_init_data()
  |  ...
  |    `- newsk->sk_prot->init() / sctp_init_sock()
  |
  |- sctp_copy_sock()
  `- sctp_sock_migrate()
     `- sctp_copy_descendant(newsk, oldsk)

  sock_create() creates a brand new socket, but sctp_copy_sock()
  and sctp_sock_migrate() overwrite most of the fields.

So, sk_alloc(), sock_init_data(), sctp_copy_sock(), and
sctp_copy_descendant() can be replaced with a single function
like sk_clone_lock().

This series does the conversion and removes TODO comment added
by commit 4a997d49d9 ("tcp: Save lock_sock() for memcg in
inet_csk_accept().").

Tested accept() and SCTP_SOCKOPT_PEELOFF and both work properly.

  socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP) = 3
  bind(3, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
  listen(3, -1)                           = 0
  getsockname(3, {sa_family=AF_INET, sin_port=htons(49460), sin_addr=inet_addr("127.0.0.1")}, [16]) = 0
  socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP) = 4
  connect(4, {sa_family=AF_INET, sin_port=htons(49460), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
  accept(3, NULL, NULL)                   = 5
  ...
  socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP) = 3
  bind(3, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
  listen(3, -1)                           = 0
  getsockname(3, {sa_family=AF_INET, sin_port=htons(48240), sin_addr=inet_addr("127.0.0.1")}, [16]) = 0
  socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP) = 4
  connect(4, {sa_family=AF_INET, sin_port=htons(48240), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
  getsockopt(3, SOL_SCTP, SCTP_SOCKOPT_PEELOFF, "*\0\0\0\5\0\0\0", [8]) = 5

v1: https://lore.kernel.org/20251021214422.1941691-1-kuniyu@google.com
====================

Link: https://patch.msgid.link/20251023231751.4168390-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:05:02 -07:00
Kuniyuki Iwashima
71068e2e1b sctp: Remove sctp_copy_sock() and sctp_copy_descendant().
Now, sctp_accept() and sctp_do_peeloff() use sk_clone(), and
we no longer need sctp_copy_sock() and sctp_copy_descendant().

Let's remove them.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251023231751.4168390-9-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:59 -07:00
Kuniyuki Iwashima
b7ddb55f31 sctp: Use sctp_clone_sock() in sctp_do_peeloff().
sctp_do_peeloff() calls sock_create() to allocate and initialise
struct sock, inet_sock, and sctp_sock, but later sctp_copy_sock()
and sctp_sock_migrate() overwrite most fields.

What sctp_do_peeloff() does is more like accept().

Let's use sock_create_lite() and sctp_clone_sock().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251023231751.4168390-8-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:58 -07:00
Kuniyuki Iwashima
c49ed521f1 sctp: Remove sctp_pf.create_accept_sk().
sctp_v[46]_create_accept_sk() are no longer used.

Let's remove sctp_pf.create_accept_sk().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251023231751.4168390-7-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:58 -07:00
Kuniyuki Iwashima
16942cf4d3 sctp: Use sk_clone() in sctp_accept().
sctp_accept() calls sctp_v[46]_create_accept_sk() to allocate a new
socket and calls sctp_sock_migrate() to copy fields from the parent
socket to the new socket.

sctp_v4_create_accept_sk() allocates sk by sk_alloc(), initialises
it by sock_init_data(), and copies a bunch of fields from the parent
socket by sctp_copy_sock().

sctp_sock_migrate() calls sctp_copy_descendant() to copy most fields
in sctp_sock from the parent socket by memcpy().

These can be simply replaced by sk_clone().

Let's consolidate sctp_v[46]_create_accept_sk() to sctp_clone_sock()
with sk_clone().

We will reuse sctp_clone_sock() for sctp_do_peeloff() and then remove
sctp_copy_descendant().

Note that sock_reset_flag(newsk, SOCK_ZAPPED) is not copied to
sctp_clone_sock() as sctp does not use SOCK_ZAPPED at all.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251023231751.4168390-6-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:58 -07:00
Kuniyuki Iwashima
151b98d10e net: Add sk_clone().
sctp_accept() would use sk_clone_lock(), but it is called with the
parent socket locked, and sctp_sock_migrate() acquires the child lock
later.

Let's add a no-lock version of sk_clone_lock().

Note that lockdep complains if we simply use bh_lock_sock_nested().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251023231751.4168390-5-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:57 -07:00
Kuniyuki Iwashima
b7185792f8 sctp: Don't call sk->sk_prot->init() in sctp_v[46]_create_accept_sk().
sctp_accept() calls sctp_v[46]_create_accept_sk() to allocate a new
socket and calls sctp_sock_migrate() to copy fields from the parent
socket to the new socket.

sctp_v[46]_create_accept_sk() calls sctp_init_sock() to initialise
sctp_sock, but most fields are overwritten by sctp_copy_descendant()
called from sctp_sock_migrate().

Things done in sctp_init_sock() but not in sctp_sock_migrate() are
the following:

  1. Copy sk->sk_gso
  2. Copy sk->sk_destruct (sctp_v6_init_sock())
  3. Allocate sctp_sock.ep
  4. Initialise sctp_sock.pd_lobby
  5. Count sk_sockets_allocated_inc(), sock_prot_inuse_add(),
     and SCTP_DBG_OBJCNT_INC()

Let's do these in sctp_copy_sock() and sctp_sock_migrate() and avoid
calling sk->sk_prot->init() in sctp_v[46]_create_accept_sk().

Note that sk->sk_destruct is already copied in sctp_copy_sock().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251023231751.4168390-4-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:57 -07:00
Kuniyuki Iwashima
2d4df59aae sctp: Don't copy sk_sndbuf and sk_rcvbuf in sctp_sock_migrate().
sctp_sock_migrate() is called from 2 places.

1) sctp_accept() calls sp->pf->create_accept_sk() before
   sctp_sock_migrate(), and sp->pf->create_accept_sk() calls
   sctp_copy_sock().

2) sctp_do_peeloff() also calls sctp_copy_sock() before
   sctp_sock_migrate().

sctp_copy_sock() copies sk_sndbuf and sk_rcvbuf from the
parent socket.

Let's not copy the two fields in sctp_sock_migrate().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251023231751.4168390-3-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:57 -07:00
Kuniyuki Iwashima
622e8838a2 sctp: Defer SCTP_DBG_OBJCNT_DEC() to sctp_destroy_sock().
SCTP_DBG_OBJCNT_INC() is called only when sctp_init_sock()
returns 0 after successfully allocating sctp_sk(sk)->ep.

OTOH, SCTP_DBG_OBJCNT_DEC() is called in sctp_close().

The code seems to expect that the socket is always exposed
to userspace once SCTP_DBG_OBJCNT_INC() is incremented, but
there is a path where the assumption is not true.

In sctp_accept(), sctp_sock_migrate() could fail after
sctp_init_sock().

Then, sk_common_release() does not call inet_release() nor
sctp_close().  Instead, it calls sk->sk_prot->destroy().

Let's move SCTP_DBG_OBJCNT_DEC() from sctp_close() to
sctp_destroy_sock().

Fixes: 1da177e4c3 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/20251023231751.4168390-2-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:56 -07:00
Jakub Kicinski
8b2ee2df6a Merge branch 'convert-net-drivers-to-ndo_hwtstamp-api-part-2'
Vadim Fedorenko says:

====================
convert net drivers to ndo_hwtstamp API part 2

This is part 2 of patchset to convert drivers which support HW
timestamping to use .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks.
The new API uses netlink to communicate with user space and has some
test coverage.
====================
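
For reference, the two struct net_device_ops callbacks targeted by this
conversion have roughly the following shape (paraphrased; see
include/linux/netdevice.h for the authoritative signatures):

  int (*ndo_hwtstamp_get)(struct net_device *dev,
                          struct kernel_hwtstamp_config *kernel_config);
  int (*ndo_hwtstamp_set)(struct net_device *dev,
                          struct kernel_hwtstamp_config *kernel_config,
                          struct netlink_ext_ack *extack);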

Link: https://patch.msgid.link/20251023220457.3201122-1-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:39 -07:00
Vadim Fedorenko
329021eeae net: hns3: add hwtstamp_get/hwtstamp_set ops
Add .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks to the HNS3 framework
to support HW timestamp configuration via netlink, and adapt hns3pf to
use the .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Jijie Shao <shaojijie@huawei.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251023220457.3201122-7-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:36 -07:00
Vadim Fedorenko
87e1b590f7 net: renesas: rswitch: convert to ndo_hwtstamp API
Convert the driver to use the .ndo_hwtstamp_set()/.ndo_hwtstamp_get() callbacks.
rswitch_eth_ioctl() becomes a pure phy_do_ioctl_running(), so remove it and
replace the .ndo_eth_ioctl callback with phy_do_ioctl_running().

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251023220457.3201122-6-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:36 -07:00
Vadim Fedorenko
faac57cddf net: ravb: convert to ndo_hwtstamp API
Convert the driver to use the .ndo_hwtstamp_set()/.ndo_hwtstamp_get() callbacks.
ravb_do_ioctl() becomes a pure phy_do_ioctl_running(), so remove it and
replace it in the callbacks.

Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251023220457.3201122-5-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:36 -07:00
Vadim Fedorenko
38efb0ba3c ionic: convert to ndo_hwtstamp API
Convert driver to use .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks.
ionic_eth_ioctl() becomes empty, remove it.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Brett Creeley <brett.creeley@amd.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251023220457.3201122-4-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:36 -07:00
Vadim Fedorenko
7a07dc723f mlx4: convert to ndo_hwtstamp API
Convert driver to use .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks.
mlx4_en_ioctl() becomes empty, remove it.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251023220457.3201122-3-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:36 -07:00
Vadim Fedorenko
a5c12b060e octeontx2: convert to ndo_hwtstamp API
Convert driver to use .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks.
otx2_ioctl() becomes empty, remove it.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251023220457.3201122-2-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:04:36 -07:00
Dan Carpenter
05e090620b net: airoha: Fix a copy and paste bug in probe()
This code has a copy and paste bug where it accidentally checks "if (err)"
instead of checking if "xsi_rsts" is NULL. Also, as a free bonus, I
changed the allocation from kzalloc() to kcalloc(), which is a kernel
hardening measure to protect against integer overflows.
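
The intended shape of the fixed allocation and check is roughly the following
(illustrative; the element-count variable name is a placeholder):

  xsi_rsts = kcalloc(num_xsi, sizeof(*xsi_rsts), GFP_KERNEL);
  if (!xsi_rsts)
          return -ENOMEM;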

Fixes: 5863b4e065 ("net: airoha: Add airoha_eth_soc_data struct")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/aPtht6y5DRokn9zv@stanley.mountain
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:03:30 -07:00
Jakub Kicinski
bbfa5e7c8d Merge tag 'batadv-next-pullrequest-20251024' of https://git.open-mesh.org/linux-merge
Simon Wunderlich says:

====================
This cleanup patchset includes the following patches:

 - bump version strings, by Simon Wunderlich

 - use skb_crc32c() instead of skb_seq_read(), by Sven Eckelmann

* tag 'batadv-next-pullrequest-20251024' of https://git.open-mesh.org/linux-merge:
  batman-adv: use skb_crc32c() instead of skb_seq_read()
  batman-adv: Start new development cycle
====================

Link: https://patch.msgid.link/20251024092315.232636-1-sw@simonwunderlich.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 18:02:38 -07:00
Jakub Kicinski
f4e52b326e Merge branch 'phy-mscc-fix-ptp-for-vsc8574-and-vsc8572'
Horatiu Vultur says:

====================
phy: mscc: Fix PTP for VSC8574 and VSC8572

The first patch will update the PHYs VSC8584, VSC8582, VSC8575 and VSC856X
to use PHY_ID_MATCH_EXACT because only rev B exists for these PHYs.
But the PHYs VSC8574 and VSC8572 exist in revs A, B, C, D and E.
This is just a preparation for the second patch to allow the VSC8574 and
VSC8572 to use the function vsc8584_probe().

We want to use vsc8584_probe() for VSC8574 and VSC8572 because this
function does the correct PTP initialization. This change is in the second
patch.
====================

Link: https://patch.msgid.link/20251023191350.190940-1-horatiu.vultur@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 17:58:05 -07:00
Horatiu Vultur
ea5df88aec phy: mscc: Fix PTP for VSC8574 and VSC8572
The PTP initialization is two-step. The first part is the functions
vsc8584_ptp_probe_once() and vsc8584_ptp_probe() at probe time, which
initialize the locks and queues and create the PTP device. The second part is
the function vsc8584_ptp_init() at config_init() time, which initializes
PTP in the HW.

For VSC8574 and VSC8572, the PTP initialization is incomplete. It is
missing the first part but performs the second part, meaning that
ptp_clock_register() is never called.

There is no crash without the first part when enabling PTP, but this is
unexpected because some PHYs have PTP functionality exposed by the
driver and some don't, even though they share the same PTP clock.

Fixes: 774626fa44 ("net: phy: mscc: Add PTP support for 2 more VSC PHYs")
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Link: https://patch.msgid.link/20251023191350.190940-3-horatiu.vultur@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 17:58:02 -07:00
Horatiu Vultur
1bc80d6730 phy: mscc: Use PHY_ID_MATCH_EXACT for VSC8584, VSC8582, VSC8575, VSC856X
As the PHYs VSC8584, VSC8582, VSC8575 and VSC856X exist only as rev B,
we can use PHY_ID_MATCH_EXACT to match exactly on revision B of the PHY.
Because of this change there is no need to check for a revision other than
rev B in the function vsc8584_probe(), as we already know that this will
never happen.
These changes are a preparation for the next patch, because in that patch
we will make the PHYs VSC8574 and VSC8572 use vsc8584_probe(), and
these PHYs have multiple revisions.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Link: https://patch.msgid.link/20251023191350.190940-2-horatiu.vultur@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 17:58:01 -07:00
Petr Machata
d10920607f selftests: bridge_mdb: Add a test for MDB flush on snooping disable
Check that non-permanent MDB entries are removed as IGMP / MLD snooping is
disabled.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/9420dfbcf26c8e1134d31244e9e7d6a49d677a69.1761228273.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 17:57:21 -07:00
Petr Machata
68800bbf58 net: bridge: Flush multicast groups when snooping is disabled
When forwarding multicast packets, the bridge takes MDB into account when
IGMP / MLD snooping is enabled. Currently, when snooping is disabled, the
MDB is retained, even though it is not used anymore.

At the same time, while snooping is disabled, the IGMP / MLD
control packets are obviously ignored, and after snooping is reenabled,
the administrator has to assume the MDB is out of sync. In particular, missed
join and leave messages would lead to traffic being forwarded to the wrong
interfaces.

Keeping the MDB entries around thus serves no purpose, and just takes
memory. Note also that disabling per-VLAN snooping does actually flush the
relevant MDB entries.

This patch flushes non-permanent MDB entries as global snooping is
disabled.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/5e992df1bb93b88e19c0ea5819e23b669e3dde5d.1761228273.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 17:57:21 -07:00
Wilfred Mallawa
5f30bc4706 selftests: tls: add tls record_size_limit test
Test that outgoing plaintext records respect the TLS_TX_MAX_PAYLOAD_LEN limit
set using setsockopt(). The limit is set to 128; thus, in all received
records, the plaintext must not exceed this amount.

Also test that setting a new record size limit whilst a pending open
record exists is handled correctly by discarding the request.

Suggested-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Link: https://patch.msgid.link/20251022001937.20155-2-wilfred.opensource@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 16:13:43 -07:00
Wilfred Mallawa
82cb5be6ad net/tls: support setting the maximum payload size
During a handshake, an endpoint may specify a maximum record size limit.
Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
maximum record size. Meaning that, the outgoing records from the kernel
can exceed a lower size negotiated during the handshake. In such a case,
the TLS endpoint must send a fatal "record_overflow" alert [1], and
thus the record is discarded.

Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
support. For these devices, supporting TLS record size negotiation is
necessary because the maximum TLS record size supported by the controller
is less than the default 16KB currently used by the kernel.

Currently, there is no way to inform the kernel of such a limit. This patch
adds support to a new setsockopt() option `TLS_TX_MAX_PAYLOAD_LEN` that
allows for setting the maximum plaintext fragment size. Once set, outgoing
records are no larger than the size specified. This option can be used to
specify the record size limit.

[1] https://www.rfc-editor.org/rfc/rfc8449
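
A minimal userspace sketch of how the new option could be used, assuming the
TLS_TX_MAX_PAYLOAD_LEN constant from this patch is exposed via <linux/tls.h>
and takes an unsigned int optval (both assumptions; error handling trimmed):

  #include <sys/socket.h>
  #include <linux/tls.h>

  /* Cap outgoing TLS records at 'limit' bytes of plaintext on an
   * already-established kTLS socket.
   */
  static int tls_set_tx_max_payload(int fd, unsigned int limit)
  {
          return setsockopt(fd, SOL_TLS, TLS_TX_MAX_PAYLOAD_LEN,
                            &limit, sizeof(limit));
  }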

Signed-off-by: Wilfred Mallawa <wilfred.mallawa@wdc.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Link: https://patch.msgid.link/20251022001937.20155-1-wilfred.opensource@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-27 16:13:42 -07:00
Alexei Starovoitov
ff880798de Merge branch 'bpf-introduce-file-dynptr'
Mykyta Yatsenko says:

====================
bpf: Introduce file dynptr

From: Mykyta Yatsenko <yatsenko@meta.com>

This series adds a new dynptr kind, file dynptr, which enables BPF
programs to perform safe reads from files in a structured way.
Initial motivations include:
 * Parsing the executable’s ELF to locate thread-local variable symbols
 * Capturing stack traces when frame pointers are disabled

By leveraging the existing dynptr abstraction, we reuse the verifier’s
lifetime/size checks and keep the API consistent with existing dynptr
read helpers.

Technical details:
1. Reuses the existing freader library to read files a folio at a time.
2. bpf_dynptr_slice() and bpf_dynptr_read() always copy data from folios
into a program-provided buffer; zero-copy access is intentionally not
supported to keep it simple.
3. Reads may sleep if the requested folios are not in the page cache.
4. A few verifier changes are required:
  * Support dynptr destruction in kfuncs
  * Add kfunc address substitution based on whether the program runs in
  a sleepable or non-sleepable context.

Testing:
The final patch adds a selftest that validates that the BPF program reads the
same data as userspace, that page faults are enabled in a sleepable context,
and that they are disabled in a non-sleepable one.

Changelog:
---
v4 -> v5
v4: https://lore.kernel.org/all/20251021200334.220542-1-mykyta.yatsenko5@gmail.com/
 * Inlined and removed kfunc_call_imm(), run overflow check for call_imm
 only if !bpf_jit_supports_far_kfunc_call().

v3 -> v4
v3: https://lore.kernel.org/bpf/20251020222538.932915-1-mykyta.yatsenko5@gmail.com/
 * Remove ringbuf usage from selftests
 * bpf_dynptr_set_null(ptr) when discarding file dynptr
 * call kfunc_call_imm() in specialize_kfunc() only, removed
 call from add_kfunc_call()

v2 -> v3
v2: https://lore.kernel.org/bpf/20251015161155.120148-1-mykyta.yatsenko5@gmail.com/
 * Add negative tests
 * Rewrote tests to use LSM for bpf_get_task_exe_file()
 * Move call_imm overflow check into kfunc_call_imm()

v1 -> v2
v1: https://lore.kernel.org/bpf/20251003160416.585080-1-mykyta.yatsenko5@gmail.com/
 * Remove ELF parsing selftest
 * Expanded u32 -> u64 refactoring, changes in include/uapi/linux/bpf.h
 * Removed freader.{c,h}, instead move freader definitions into
 buildid.h.
 * Small refactoring of the multiple folios reading algorithm
 * Directly return error after unmark_stack_slots_dynptr().
 * Make kfuncs receive trusted arguments.
 * Remove enum bpf_is_sleepable, use bool instead
 * Remove unnecessary sorting from specialize_kfunc()
 * Remove bool kfunc_in_sleepable_ctx; field from the struct
 bpf_insn_aux_data, rely on non_sleepable field introduced by Kumar
 * Refactor selftests, do madvise(...MADV_PAGEOUT) for all pages read by
 the test
 * Introduce the test for non-sleepable case, verify it fails with -EFAULT
====================

Link: https://lore.kernel.org/r/20251026203853.135105-1-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-27 09:56:28 -07:00
Mykyta Yatsenko
784cdf9315 selftests/bpf: add file dynptr tests
Introducing selftests for validating file-backed dynptr works as
expected.
 * validate implementation supports dynptr slice and read operations
 * validate destructors should be paired with initializers
 * validate sleepable progs can page in.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-11-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-27 09:56:27 -07:00
Mykyta Yatsenko
2c52e8943a bpf: dispatch to sleepable file dynptr
File dynptr reads may sleep when the requested folios are not in
the page cache. To avoid sleeping in non-sleepable contexts while still
supporting valid sleepable use, given that dynptrs are non-sleepable by
default, enable sleeping only when bpf_dynptr_from_file() is invoked
from a sleepable context.

This change:
  * Introduces a sleepable constructor: bpf_dynptr_from_file_sleepable()
  * Overrides the non-sleepable constructor with the sleepable one if it is
  always called in a sleepable context

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-10-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-27 09:56:27 -07:00
Mykyta Yatsenko
d869d56ca8 bpf: verifier: refactor kfunc specialization
Move kfunc specialization (function address substitution) to later stage
of verification to support a new use case, where we need to take into
consideration whether kfunc is called in sleepable context.

Minor refactoring in add_kfunc_call(), making sure that if function
fails, kfunc desc is not added to tab->descs (previously it could be
added or not, depending on what failed).

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-9-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-27 09:56:27 -07:00
Mykyta Yatsenko
e3e36edb1b bpf: add kfuncs and helpers support for file dynptrs
Add support for file dynptr.

Introduce struct bpf_dynptr_file_impl to hold internal state for file
dynptrs, with 64-bit size and offset support.

Introduce lifecycle management kfuncs:
  - bpf_dynptr_from_file() for initialization
  - bpf_dynptr_file_discard() for destruction

Extend existing helpers to support file dynptrs in:
  - bpf_dynptr_read()
  - bpf_dynptr_slice()

Write helpers (bpf_dynptr_write() and bpf_dynptr_data()) are not
modified, as file dynptr is read-only.
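
A hypothetical BPF-side usage sketch of these kfuncs; the exact signatures and
return-value conventions are assumptions made for illustration, and only the
kfunc and helper names come from this series:

  /* Read the first bytes of a file into a local buffer. 'f' is a
   * trusted struct file pointer obtained elsewhere (e.g. from
   * bpf_get_task_exe_file(), as in the selftests).
   */
  struct bpf_dynptr dptr;
  char buf[64];

  if (!bpf_dynptr_from_file(f, 0, &dptr)) {          /* assumed signature */
          bpf_dynptr_read(buf, sizeof(buf), &dptr, 0, 0);
          bpf_dynptr_file_discard(&dptr);            /* releases the dynptr */
  }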

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-8-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-27 09:56:27 -07:00
Mykyta Yatsenko
8d8771dc03 bpf: add plumbing for file-backed dynptr
Add the necessary verifier plumbing for the new file-backed dynptr type.
Introduce two kfuncs for its lifecycle management:
 * bpf_dynptr_from_file() for initialization
 * bpf_dynptr_file_discard() for destruction

Currently there is no mechanism for a kfunc to release a dynptr; this patch
adds one:
 * Dynptr release function sets meta->release_regno
 * Call unmark_stack_slots_dynptr() if meta->release_regno is set and
 dynptr ref_obj_id is set as well.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-7-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-27 09:56:27 -07:00
Mykyta Yatsenko
9cba966f1c bpf: verifier: centralize const dynptr check in unmark_stack_slots_dynptr()
Move the const dynptr check into unmark_stack_slots_dynptr() so callers
don’t have to duplicate it. This puts the validation next to the code
that manipulates dynptr stack slots and allows upcoming changes to reuse
it directly.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-6-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-27 09:56:27 -07:00
Mykyta Yatsenko
5a5fff604f lib/freader: support reading more than 2 folios
freader_fetch currently reads from at most two folios. When a read spans
into a third folio, the overflow bytes are copied adjacent to the second
folio’s data instead of being handled as a separate folio.
This patch modifies the fetch algorithm to support reading from many folios.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Reviewed-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251026203853.135105-5-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-27 09:56:27 -07:00
Mykyta Yatsenko
76e4fed847 lib: move freader into buildid.h
Move struct freader and prototypes of the functions operating on it into
the buildid.h.

This allows reusing freader outside buildid, e.g. for file dynptr
support added later.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Link: https://lore.kernel.org/r/20251026203853.135105-4-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-27 09:56:27 -07:00
Mykyta Yatsenko
531b87d865 bpf: widen dynptr size/offset to 64 bit
Dynptr currently caps size and offset at 24 bits, which isn’t sufficient
for file-backed use cases; even 32 bits can be limiting. Refactor dynptr
helpers/kfuncs to use 64-bit size and offset, ensuring consistency
across the APIs.

This change does not affect internals of xdp, skb or other dynptrs,
which continue to behave as before. Also it does not break binary
compatibility.

The widening enables large-file access support via dynptr, implemented
in the next patches.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-3-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-27 09:56:26 -07:00
Mykyta Yatsenko
a61a257ff5 selftests/bpf: remove unnecessary kfunc prototypes
Remove unnecessary kfunc prototypes from test programs; these are
provided by vmlinux.h.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251026203853.135105-2-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-27 09:56:26 -07:00
Jianbo Liu
b427c0c3bc xfrm: Skip redundant replay recheck for the hardware offload path
The xfrm_replay_recheck() function was introduced to handle the issues
arising from asynchronous crypto algorithms.

The crypto offload path is now effectively synchronous, as it holds
the state lock throughout its operation. This eliminates the race
condition, making the recheck an unnecessary overhead. This patch
improves performance by skipping the redundant call when
crypto_done is true.

Additionally, the sequence number assignment is moved to an earlier
point in the function. This improves performance by reducing lock
contention and places the logic at a more appropriate point, as the
full sequence number (including the higher-order bits) can be
determined as soon as the packet is received.
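
A simplified sketch of the idea in xfrm_input(); the crypto_done
condition follows the description above rather than the final diff:

        /* Only asynchronous crypto can race the earlier replay check; when
         * the packet already completed crypto offload (crypto_done), the
         * state lock was held throughout, so the recheck can be skipped.
         */
        if (!crypto_done && xfrm_replay_recheck(x, skb, seq)) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR);
                goto drop_unlock;
        }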

Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-10-27 10:35:51 +01:00
Jianbo Liu
10a1186194 xfrm: Refactor xfrm_input lock to reduce contention with RSS
With newer NICs like mlx5 supporting RSS for IPsec crypto offload,
packets for a single Security Association (SA) are scattered across
multiple CPU cores for parallel processing. The xfrm_state spinlock
(x->lock) is held for each packet during xfrm processing.

When multiple connections or flows share the same SA, this parallelism
causes high lock contention on x->lock, creating a performance
bottleneck and limiting scalability.

The original xfrm_input() function exacerbated this issue by releasing
and immediately re-acquiring x->lock. For hardware crypto offload
paths, this unlock/relock sequence is unnecessary and introduces
significant overhead. This patch refactors the function to relocate
the type_offload->input_tail call for the offload path, performing all
necessary work while continuously holding the lock. This reordering is
safe, since packets which don't pass the checks below will still fail
them with the new code.

Performance testing with iperf using multiple parallel streams over a
single IPsec SA shows significant improvement in throughput as the
number of queues (and thus CPU cores) increases:

+-----------+---------------+--------------+-----------------+
| RX queues | Before (Gbps) | After (Gbps) | Improvement (%) |
+-----------+---------------+--------------+-----------------+
|         2 |          32.3 |         34.4 |             6.5 |
|         4 |          34.4 |         40.0 |            16.3 |
|         6 |          24.5 |         38.3 |            56.3 |
|         8 |          23.1 |         38.3 |            65.8 |
|        12 |          18.1 |         29.9 |            65.2 |
|        16 |          16.0 |         25.2 |            57.5 |
+-----------+---------------+--------------+-----------------+

Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2025-10-27 10:24:30 +01:00
Roopni Devanathan
88de08348a wifi: cfg80211: Add parameters to radio-specific debugfs directories
In the multi-radio wiphy architecture, where a single wiphy can have
multiple radios tied to it, radio-specific configuration parameters
and global wiphy parameters are maintained for the entire physical
device and are common to all radios. However, each radio in a wiphy can
have a different value for each radio configuration parameter, such as
the RTS threshold. With the current debugfs directory structure, the
values of global wiphy configuration parameters can be viewed, but the
values of individual radio configuration parameters cannot.

To address this, maintain a separate entry for each radio
configuration parameter, e.g. the RTS threshold, in the corresponding
radio-specific debugfs directory. This way, radio-specific configuration
parameters can be maintained along with the global wiphy configuration
parameters. Whenever a value is changed for one radio, the values
for the rest of the radios in the wiphy and the global wiphy parameter
value remain intact.

Sample output:
/# iw phy#0 set rts 100 radio 1
/# iw phy#0 set rts 468 radio 0
/# cat /sys/kernel/debug/ieee80211/phy0/rts_threshold
-1
/# cat /sys/kernel/debug/ieee80211/phy0/radio0/radio_rts_threshold
468
/# cat /sys/kernel/debug/ieee80211/phy0/radio1/radio_rts_threshold
100

/# iw phy#0 set rts 500
/# cat /sys/kernel/debug/ieee80211/phy0/rts_threshold
500
/# cat /sys/kernel/debug/ieee80211/phy0/radio0/radio_rts_threshold
500
/# cat /sys/kernel/debug/ieee80211/phy0/radio1/radio_rts_threshold
500

Signed-off-by: Roopni Devanathan <quic_rdevanat@quicinc.com>
Link: https://patch.msgid.link/20251024044649.483557-3-quic_rdevanat@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-27 09:18:41 +01:00
Roopni Devanathan
7cc986c04a wifi: cfg80211: Add debugfs support for multi-radio wiphy
In the multi-radio wiphy architecture, where a single wiphy can have
multiple radios tied to it, radio-specific configuration parameters
and global wiphy parameters are maintained for the entire physical
device and are common to all radios. However, each radio in a wiphy can
have a different value for each radio configuration parameter, like the
RTS threshold. With the current debugfs directory structure, the values
of global wiphy configuration parameters can be viewed, but the values
of individual radio configuration parameters cannot, as radio-specific
configuration parameters are not maintained separately.

To address this, in addition to maintaining the global wiphy configuration
parameters common to all radios, create a separate debugfs directory
for each radio in a wiphy and maintain the parameters corresponding to
that radio in this directory.

In the implementation, maintain a dentry in wiphy_radio_cfg, the
structure holding the per-radio configuration of a wiphy. Create
separate directories representing each radio within the phy#X directory
in debugfs during wiphy registration.

Sample directory structure with this change:
ls /sys/kernel/debug/ieee80211/phy0/radio
radio0/ radio1/ radio2/

Signed-off-by: Roopni Devanathan <quic_rdevanat@quicinc.com>
Link: https://patch.msgid.link/20251024044649.483557-2-quic_rdevanat@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-27 09:18:41 +01:00
Sarika Sharma
cc18fffa3a wifi: mac80211: fix missing RX bitrate update for mesh forwarding path
Currently, RX bitrate statistics are not updated for packets received
on the mesh forwarding path during fast RX processing. This results in
incomplete RX rate tracking in station dump outputs for mesh scenarios.

Update ieee80211_invoke_fast_rx() to record the RX rate using
sta_stats_encode_rate() and store it in the last_rate field of
ieee80211_sta_rx_stats when RX_QUEUED is returned from
ieee80211_rx_mesh_data(). This ensures that RX bitrate is properly
accounted for in both RSS and non-RSS paths.
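
A rough sketch of the described update in ieee80211_invoke_fast_rx(),
assuming the usual per-CPU/plain rx-stats selection; the surrounding
code is not shown in this log:

        /* mesh data was consumed on the forwarding path; record the rate
         * so station-dump RX bitrate stays accurate (RSS and non-RSS alike)
         */
        if (res == RX_QUEUED)
                stats->last_rate = sta_stats_encode_rate(status);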

Signed-off-by: Sarika Sharma <sarika.sharma@oss.qualcomm.com>
Link: https://patch.msgid.link/20251024043627.1640447-1-sarika.sharma@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-27 09:18:19 +01:00
Lachlan Hodges
bca76b875d wifi: cfg80211: default S1G chandef width to 1MHz
When management frames are passed down to be transmitted by usermode,
NL80211_ATTR_CHANNEL_WIDTH is often not used, as the frame is implied to be
transmitted at the control width. This can lead to errors during chandef
validation as the offsets from the channel center are wrong. Ensure we
initialise S1G chandefs to a width of 1 MHz rather than 20 MHz.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20251021061201.235754-1-lachlan.hodges@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-27 09:18:08 +01:00
Lachlan Hodges
ad55aa3ad8 wifi: mac80211: get probe response chan via ieee80211_get_channel_khz
Make use of ieee80211_get_channel_khz() rather than the MHz counterpart
to ensure probe responses received on an S1G channel pass the check.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20251021061051.235258-1-lachlan.hodges@morsemicro.com
[modify indentation]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-27 09:17:43 +01:00
Johannes Berg
8f24be7088 wifi: mac80211: reset CRC valid after CSA
While waiting for a beacon after CSA, reset the CRC valid flag
so that the next beacon is handled even if it happens to
be identical to the last one on the old channel. This is an
AP bug either way, but it's better to disconnect cleanly
than to have lingering CSA state.

In the iwlwifi instantiation of this problem, mac80211 is
ignoring the beacon but the firmware creates a new CSA,
and then crashes later because mac80211/driver didn't do
anything about it.
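
A minimal sketch of the reset, assuming it is done where mac80211 starts
waiting for the post-CSA beacon (ifmgd is the interface's managed-mode
data; the field name is taken from struct ieee80211_if_managed):

        /* forget the CRC of the last beacon seen on the old channel so an
         * identical beacon received after CSA is not ignored
         */
        ifmgd->beacon_crc_valid = false;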

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20251019115024.521ad9c6b87d.I86376900df3d3423185b75bf63358c29f33a5eb6@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-27 09:17:11 +01:00
Aditya Kumar Singh
a3b16dfe79 wifi: mac80211_hwsim: advertise puncturing feature support
If userspace provides a puncturing bitmap via the NL80211_ATTR_PUNCT_BITMAP
attribute, the kernel with mac80211_hwsim driver currently rejects the
command with the error: "driver doesn't support puncturing", because the
driver does not advertise support for this feature.

At present, the following hwsim test cases utilize puncturing, but the
bitmap is not sent to the kernel. Instead, the puncturing information is
conveyed only through the beacon data:
 * eht_5ghz_80mhz_puncturing_override_1
 * eht_5ghz_80mhz_puncturing_override_2
 * eht_5ghz_80mhz_puncturing_override_3

A future change in hostapd will begin configuring the puncturing bitmap
explicitly, which will cause these test cases to fail unless the driver
advertises support.

To address this, update mac80211_hwsim driver to advertise puncturing
feature support.

Signed-off-by: Aditya Kumar Singh <aditya.kumar.singh@oss.qualcomm.com>
Link: https://patch.msgid.link/20251017-hwsim_set_punct_feature_bit-v1-1-3be1bb3450c0@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-27 09:17:00 +01:00
Ryder Lee
a392cde88d wifi: cfg80211/mac80211: validate radio frequency range for monitor mode
In multi-radio devices, it is possible to have an MLD AP and a monitor
interface active at the same time. In such cases, monitor mode may not
be able to specify a fixed channel and could end up capturing frames
from all radios, including those outside the intended frequency bands.

This patch adds frequency validation for monitor mode. Received frames
are now only processed if their frequency falls within the allowed ranges
of the radios specified by the interface's radio_mask.

This prevents monitor mode from capturing frames outside the supported radios.

Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Link: https://patch.msgid.link/700b8284e845d96654eb98431f8eeb5a81503862.1758647858.git.ryder.lee@mediatek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-27 09:16:23 +01:00
Rosen Penev
428ea708b7 wifi: rt2x00: check retval for of_get_mac_address
of_get_mac_address can return -EPROBE_DEFER when nvmem is not probed yet
for whatever reason. In this case, nvmem mac assignments will not work.

Based on the function path, this change only has effect for rt2800soc.c
and rt2800pci.c. The former tends to use nvmem for assignments.
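
A hedged sketch of the retval handling; the surrounding helper name is
illustrative, only of_get_mac_address() and the -EPROBE_DEFER
propagation come from the commit:

static int rt2x00lib_read_mac(struct device_node *np, u8 *addr)
{
        int ret = of_get_mac_address(np, addr);

        /* nvmem may not be probed yet; defer so the assignment still works */
        if (ret == -EPROBE_DEFER)
                return ret;

        return 0;
}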

Signed-off-by: Rosen Penev <rosenp@gmail.com>
Acked-by: Stanislaw Gruszka <stf_xl@wp.pl>
Link: https://patch.msgid.link/20251014050833.46377-1-rosenp@gmail.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
2025-10-27 09:16:12 +01:00
Adithya Jayachandran
eea31f21dc {rdma,net}/mlx5: Query vports mac address from device
Before this patch, during either switchdev or legacy mode enablement, we
cleared the mac address of vports between changes. This change allows us
to preserve the vports' mac address between eswitch mode changes.

Vports hold information for VFs/SFs such as the permanent mac address.
The VF/SF mac can be set either by the iproute vf interface or the devlink
function interface. For no obvious reason we reset it to 0 on switchdev/legacy
mode changes; this patch fixes that, to align with other vport
information that is never reset, e.g. GUID, MTU, promisc mode, etc.

Signed-off-by: Adithya Jayachandran <ajayachandra@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Acked-by: Leon Romanovsky <leon@kernel.org> # RDMA
2025-10-24 20:16:01 -07:00
Jakub Kicinski
bfe62db542 Merge branch 'dwmac-support-for-rockchip-rk3506'
Heiko Stuebner says:

====================
DWMAC support for Rockchip RK3506

Some cleanups to the DT binding for Rockchip variants of the dwmac
and adding the RK3506 support on top.

As well as the driver glue needed for setting up the correct RMII
speed settings.
====================

Link: https://patch.msgid.link/20251023111213.298860-1-heiko@sntech.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 19:07:48 -07:00
Heiko Stuebner
384d842632 MAINTAINERS: add dwmac-rk glue driver to the main Rockchip entry
The dwmac-rk glue driver is currently not caught by the general maintainer
entry for Rockchip SoCs, so add it explicitly, similar to the i2c driver.

The binding document in net/rockchip-dwmac.yaml already gets caught by
the wildcard match.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251023111213.298860-6-heiko@sntech.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 19:07:37 -07:00
David Wu
2010163a8e ethernet: stmmac: dwmac-rk: Add RK3506 GMAC support
Add the needed glue blocks for the RK3506-specific setup.

The RK3506 dwmac only supports up to 100 Mbit/s with an RMII PHY,
but no RGMII.

Signed-off-by: David Wu <david.wu@rock-chips.com>
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Link: https://patch.msgid.link/20251023111213.298860-5-heiko@sntech.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 19:07:37 -07:00
Heiko Stuebner
4a667bec74 dt-bindings: net: rockchip-dwmac: Add compatible string for RK3506
Rockchip RK3506 has two Ethernet controllers based on Synopsys DWC
Ethernet QoS IP.

Add compatible string for the RK3506 variant.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Link: https://patch.msgid.link/20251023111213.298860-4-heiko@sntech.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 19:07:37 -07:00
Heiko Stuebner
e774c91dca dt-bindings: net: snps,dwmac: Sync list of Rockchip compatibles
A number of dwmac variants from Rockchip SoCs have turned up in the
Rockchip-specific binding, but not in the main list in snps,dwmac.yaml
which as the comment indicates is needed for accurate matching.

So add the missing rk3528, rk3568 and rv1126 to the main list.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Link: https://patch.msgid.link/20251023111213.298860-3-heiko@sntech.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 19:07:37 -07:00
Heiko Stuebner
32dd679b88 dt-bindings: net: snps,dwmac: move rk3399 line to its correct position
Move the rk3399 compatible to its alphabetically correct position.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Link: https://patch.msgid.link/20251023111213.298860-2-heiko@sntech.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 19:07:37 -07:00
Jakub Kicinski
568656789c Merge branch 'net-ravb-soc-specific-configuration'
Lad Prabhakar says:

====================
net: ravb: SoC-specific configuration

This series addresses several issues in the Renesas Ethernet AVB (ravb)
driver related to SoC-specific resource configuration.

The series includes the following changes:

- Make DBAT entry count configurable per SoC
The number of descriptor base address table (DBAT) entries is not uniform
across all SoCs. Pass this information via the hardware info structure and
allocate resources accordingly.

- Allocate correct number of queues based on SoC support
Use the per-SoC configuration to determine whether a network control queue
is available, and allocate queues dynamically to match the SoC's
capability.

v2: https://lore.kernel.org/20251017151830.171062-1-prabhakar.mahadev-lad.rj@bp.renesas.com
====================

Link: https://patch.msgid.link/20251023112111.215198-1-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 19:05:57 -07:00
Lad Prabhakar
3912e804ff net: ravb: Allocate correct number of queues based on SoC support
Use the per-SoC match data flag `nc_queues` to decide how many TX/RX
queues to allocate. If the SoC does not provide a network-control queue,
fall back to a single TX/RX queue. Obtain the match data before calling
alloc_etherdev_mqs() so the allocation is sized correctly.
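
A short sketch of the sizing logic in probe, assuming the new
`nc_queues` flag in struct ravb_hw_info gates the network-control queue
(names other than alloc_etherdev_mqs() follow the commit message):

        /* size the TX/RX queue arrays from the per-SoC capability */
        num_q = info->nc_queues ? NUM_TX_QUEUE : 1;
        ndev = alloc_etherdev_mqs(sizeof(struct ravb_private), num_q, num_q);
        if (!ndev)
                return -ENOMEM;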

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://patch.msgid.link/20251023112111.215198-3-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 19:04:34 -07:00
Lad Prabhakar
9078e6c5f1 net: ravb: Make DBAT entry count configurable per-SoC
Avoid wasting coherent DMA memory by allocating the descriptor base
address table sized for the actual number of DBAT/CDARq entries supported
by the SoC. Some platforms (for example GBETH) only provide two CDARq
entries; previously the driver always allocated space for 22 entries which
needlessly consumed memory on those systems.

Pass the per-SoC dbat_entry_num via struct ravb_hw_info and use it for
allocation and initialization in probe. This sizes the table correctly and
removes the unnecessary memory overhead on SoCs with fewer DBAT entries.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://patch.msgid.link/20251023112111.215198-2-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 19:04:34 -07:00
Oliver Neukum
c09b183dc1 net: usb: usbnet: coding style for functions
Functions are not to have blanks between names
and parameter lists. Remove them.

Signed-off-by: Oliver Neukum <oneukum@suse.com>
Link: https://patch.msgid.link/20251023100136.909118-1-oneukum@suse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 19:02:29 -07:00
Jakub Kicinski
fa6e6cd2fa Merge branch 'net-stmmac-pcs-support-part-2'
Russell King says:

====================
net: stmmac: pcs support part 2

This is the next part of stmmac PCS support. Not much here, other than
dealing with what remains of the interrupts, which are the PCS AN
complete and PCS Link interrupts; these are just cleared, with the
accounting updated.

Currently, they are enabled at core init time, but if we have an
implementation that supports multiple PHY interfaces, we want to
enable only the appropriate interrupts.

I also noticed that stmmac_fpe_configure_pmac() modifies the
interrupt mask at run time. As a prerequisite, we need a way
to ensure that we don't have different threads modifying the
interrupt settings at the same time. So, the first patch introduces
a new function and a spinlock which must be held when manipulating
the interrupt enable/mask state.

The second patch adds the PCS bits for enabling the PCS AN and PCS
link interrupts when the PCS is in-use.
====================

Link: https://patch.msgid.link/aPn5YVeUcWo4CW3c@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:56:37 -07:00
Russell King (Oracle)
eed68edac5 net: stmmac: add support for controlling PCS interrupts
Add support to the PCS instance for controlling the PCS interrupts
depending on whether the PCS is used.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrtp-0000000BMYs-3bhI@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:56:34 -07:00
Russell King (Oracle)
442a8c68f0 net: stmmac: add stmmac_mac_irq_modify()
Add a function to allow interrupts to be enabled and disabled in a
core independent manner.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrtk-0000000BMYm-3CV5@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:56:34 -07:00
Jakub Kicinski
86b66cb8d1 Merge branch 'net-add-phylink-managed-wol-and-convert-stmmac'
Russell King says:

====================
net: add phylink managed WoL and convert stmmac

This series is implementing the thoughts of Andrew, Florian and myself
to improve the quality of Wake-on-Lan (WoL) implementations.

This changes nothing for MAC drivers that do not wish to participate in
this, but if they do, then they gain the benefit of phylink configuring
WoL at the point closest to the media as possible.

We first need to solve the problem that the multitude of PHY drivers
report their device supports WoL, but are not capable of waking the
system. Correcting this is fundamental to choosing where WoL should be
enabled - a mis-reported WoL support can render WoL completely
ineffective.

The only PHY driver which uses the driver model's wakeup support is
drivers/net/phy/broadcom.c (and, until recently, realtek). This means
we have the opportunity for PHY drivers to be _correctly_ converted
to use this method of signalling wake-up capability only when they can
actually wake the system, thus providing a way for phylink to
know whether to use PHY-based WoL at all.

However, a PHY driver not implementing that logic doesn't become a
blocker to MACs wanting to convert. In full, the logic is:

- phylink supports a flag, wol_phy_legacy, which forces phylink to use
  the PHY-based WoL even if the MDIO device is not marked as wake-up
  capable.

- when wol_phy_legacy is not set, we check whether the PHY MDIO device
  is wake-up capable. If it is, we offer the WoL request to the PHY.

- if wol_phy_legacy is not set and the PHY is not wake-up capable,
  we do not offer the WoL request to the PHY.

In both cases, after setting any PHY based WoL, we remove the options
that the PHY now reports are enabled from the options mask, and offer
these (if any) to the MAC. The MAC will get a "mac_set_wol()" method
call when any settings change.

Phylink maintains the WoL state for the MAC, so there's no need for
a "mac_get_wol()" method. There may be the need to set the initial
state but this is not supported at present.

I've also added support for doing the PHY speed-up/speed-down at
suspend/resume time depending on the WoL state, which takes another
issue off the MAC authors' hands.

Lastly, with phylink now having the full picture for WoL, the
"mac_wol" argument for phylink_suspend() becomes redundant, and for
MAC drivers that implement mac_set_wol(), the value passed becomes
irrelevant.
====================

Link: https://patch.msgid.link/aPnyW54J80h9DmhB@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:52:10 -07:00
Russell King (Oracle)
d65cb2e27e net: stmmac: convert to phylink managed WoL PHY speed
Convert stmmac to use phylink's management of the PHY speed when
Wake-on-Lan is enabled.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrRH-0000000BLzm-3JjF@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:52:08 -07:00
Russell King (Oracle)
6911308d7d net: stmmac: convert to phylink-managed Wake-on-Lan
Convert stmmac to use phylink-managed Wake-on-Lan support. To achieve
this, we implement the .mac_wol_set() method, which simply configures
the driver model's struct device wakeup for stmmac, and sets the
priv->wolopts appropriately.

When STMMAC_FLAG_USE_PHY_WOL is set, in the stmmac world this means to
only use the PHY's WoL support and ignore the MAC's WoL capabilities.
To preserve this behaviour, we enable phylink's legacy mode, and avoid
telling phylink that the MAC has any WoL support. This achieves the
same functionality for this case.

When STMMAC_FLAG_USE_PHY_WOL is not set, we provide the MAC's WoL
capabilities to phylink, which then allows phylink to choose between
the PHY and MAC for WoL depending on their individual capabilities
as described in the phylink commit. This only augments the WoL
functionality with PHYs that declare to the driver model that they are
wake-up capable. Currently, very few PHY drivers support this.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrRC-0000000BLzg-2tA4@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:52:08 -07:00
Russell King (Oracle)
dc1a2a9ce5 net: phylink: add phylink managed wake-on-lan PHY speed control
Some drivers, e.g. stmmac, use the speed_up()/speed_down() APIs to
gain additional power saving during Wake-on-LAN where the PHY is
managing the state.

Add support to phylink for this, which can be enabled by the MAC
driver. Only change the PHY speed if the PHY is configured for
wake-up, but without any wake-up on the MAC side, as MAC side
means changing the configuration once the negotiation has
completed.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrR7-0000000BLza-2PjK@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:52:08 -07:00
Russell King (Oracle)
b79fbd86c8 net: phylink: add phylink managed MAC Wake-on-Lan support
Add core phylink managed Wake-on-Lan support, which is enabled when the
MAC driver fills in the new .mac_wol_set() method that this commit
creates.

When this feature is disabled, phylink acts as it has in the past,
merely passing the ethtool WoL calls to phylib whenever a PHY exists.
No other new functionality provided by this commit is enabled.

When this feature is enabled, a more intelligent approach is used.
Phylink will first pass WoL options to the PHY, read them back, and
attempt to set any options that were not set at the PHY at the MAC.

Since we have PHY drivers that report they support WoL, and accept WoL
configuration even though they aren't wired up to be capable of waking
the system, we need a way to differentiate between PHYs that think
they support WoL and those which actually do. As PHY drivers do not
make use of the driver model's wake-up infrastructure, but could, we
use this to determine whether PHY drivers can participate. This gives
a path forward where, as MAC drivers are converted to this, it
encourages PHY drivers to also be converted.

Phylink will also ignore the mac_wol argument to phylink_suspend() as
it now knows the WoL state at the MAC.

MAC drivers are expected to record/configure the Wake-on-Lan state in
their .mac_set_wol() method, and deal appropriately with it in their
suspend/resume methods. The driver model provides assistance to set the
IRQ wake support which may assist driver authors in achieving the
necessary configuration.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrR2-0000000BLzU-1xYL@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:52:08 -07:00
Russell King (Oracle)
b344bfacf1 net: phy: add phy_may_wakeup()
Add phy_may_wakeup() which uses the driver model's device_may_wakeup()
when the PHY driver has marked the device as wakeup capable in the
driver model, otherwise use phy_drv_wol_enabled().

Replace the sites that used to call phy_drv_wol_enabled() with this
as checking the driver model will be more efficient than checking the
WoL state.

Export phy_may_wakeup() so that phylink can use it.
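
A minimal sketch of the described helper, assuming it sits next to the
existing phy_drv_wol_enabled() logic in phylib and uses the
phy_can_wakeup() helper added in the previous patch:

bool phy_may_wakeup(struct phy_device *phydev)
{
        /* prefer the driver model's wakeup state when the PHY driver has
         * declared the device wakeup capable
         */
        if (phy_can_wakeup(phydev))
                return device_may_wakeup(&phydev->mdio.dev);

        /* otherwise fall back to the WoL options the PHY reports as enabled */
        return phy_drv_wol_enabled(phydev);
}
EXPORT_SYMBOL_GPL(phy_may_wakeup);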

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrQx-0000000BLzO-1RLt@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:52:07 -07:00
Russell King (Oracle)
330ce8ffc1 net: phy: add phy_can_wakeup()
Add phy_can_wakeup() to report whether the PHY driver has marked the
PHY device as being wake-up capable as far as the driver model is
concerned.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/E1vBrQs-0000000BLzI-0w3U@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:52:07 -07:00
Dust Li
f0773d0b41 smc: rename smc_find_ism_store_rc to reflect broader usage
The function smc_find_ism_store_rc() is used to record the reason
why a suitable device (either ISM or RDMA) could not be found.
However, its name suggests it is ISM-specific, which is misleading.

Rename it to better reflect its actual usage.

No functional changes.

Signed-off-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251023020012.69609-1-dust.li@linux.alibaba.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:48:03 -07:00
Julia Lawall
d0d2203b9a strparser: fix typo in comment
The name frags_list doesn't appear in the kernel.
It should be frag_list as in the next sentence.

Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr>
Link: https://patch.msgid.link/20251023013051.1728388-1-Julia.Lawall@inria.fr
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:47:08 -07:00
Alessandro Zanni
13cb6ac5b5 selftest: net: prevent use of uninitialized variable
Fix to avoid the usage of the `ret` variable uninitialized in the
following macro expansions.

It solves the following warning:

In file included from netlink-dumps.c:21:
netlink-dumps.c: In function ‘dump_extack’:
../kselftest_harness.h:788:35: warning: ‘ret’ may be used uninitialized [-Wmaybe-uninitialized]
  788 |                         intmax_t  __exp_print = (intmax_t)__exp; \
      |                                   ^~~~~~~~~~~
../kselftest_harness.h:631:9: note: in expansion of macro ‘__EXPECT’
  631 |         __EXPECT(expected, #expected, seen, #seen, ==, 0)
      |         ^~~~~~~~
netlink-dumps.c:169:9: note: in expansion of macro ‘EXPECT_EQ’
  169 |         EXPECT_EQ(ret, FOUND_EXTACK);
      |         ^~~~~~~~~

The issue can be reproduced, building the tests, with the command:
make -C tools/testing/selftests TARGETS=net
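
A minimal illustration of the fix, assuming `ret` only needs a defined
initial value before the code paths that may skip assigning it:

        /* initialize so EXPECT_EQ(ret, FOUND_EXTACK) never reads garbage */
        int ret = 0;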

Signed-off-by: Alessandro Zanni <alessandro.zanni87@gmail.com>
Link: https://patch.msgid.link/20251023205354.28249-1-alessandro.zanni87@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 18:43:37 -07:00
Jakub Kicinski
6128687625 Merge branch 'neighbour-convert-rtm_getneightbl-and-rtm_setneightbl-to-rcu'
Kuniyuki Iwashima says:

====================
neighbour: Convert RTM_GETNEIGHTBL and RTM_SETNEIGHTBL to RCU.

Patch 1 & 2 are prep for RCU conversion for RTM_GETNEIGHTBL.

Patch 3 & 4 converts RTM_GETNEIGHTBL and RTM_SETNEIGHTBL to RCU.

Patch 5 converts the neighbour table rwlock to the plain spinlock.
====================

Link: https://patch.msgid.link/20251022054004.2514876-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 17:57:28 -07:00
Kuniyuki Iwashima
3064d0fe02 neighbour: Convert rwlock of struct neigh_table to spinlock.
Only neigh_for_each() and neigh_seq_start/stop() are on the
reader side of neigh_table.lock.

Let's convert rwlock to the plain spinlock.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251022054004.2514876-6-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 17:57:20 -07:00
Kuniyuki Iwashima
55a6046b48 neighbour: Convert RTM_SETNEIGHTBL to RCU.
neightbl_set() fetches neigh_tables[] and updates attributes under
write_lock_bh(&tbl->lock), so RTNL is not needed.

neigh_table_clear() synchronises RCU only, and rcu_dereference_rtnl()
protects nothing here.

If we released RCU after fetching neigh_tables[], there would be no
synchronisation to block neigh_table_clear() further, so RCU is held
until the end of the function.

Another option would be to protect neigh_tables[] user with SRCU
and add synchronize_srcu() in neigh_table_clear().

But, holding RCU should be fine as we hold write_lock_bh() for the
rest of neightbl_set() anyway.

Let's perform RTM_SETNEIGHTBL under RCU and drop RTNL.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251022054004.2514876-5-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 17:57:20 -07:00
Kuniyuki Iwashima
4ae34be500 neighbour: Convert RTM_GETNEIGHTBL to RCU.
neightbl_dump_info() calls these functions for each neigh_tables[]
entry:

  1. neightbl_fill_info() for tbl->parms
  2. neightbl_fill_param_info() for tbl->parms_list (except tbl->parms)

Both functions rely on the table lock (read_lock_bh(&tbl->lock))
and RTNL is not needed.

Let's fetch the table under RCU and convert RTM_GETNEIGHTBL to RCU.

Note that the first entry of tbl->parms_list is tbl->parms.list and
embedded in neigh_table, so list_next_entry() is safe.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251022054004.2514876-4-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 17:57:20 -07:00
Kuniyuki Iwashima
35d7c70870 neighbour: Annotate access to neigh_parms fields.
NEIGH_VAR() is read locklessly in the fast path, and IPv6 ndisc uses
NEIGH_VAR_SET() locklessly.

The next patch will convert neightbl_dump_info() to RCU.

Let's annotate accesses to neigh_param with READ_ONCE() and WRITE_ONCE().

Note that ndisc_ifinfo_sysctl_change() uses &NEIGH_VAR() and we cannot
use '&' with READ_ONCE(), so NEIGH_VAR_PTR() is introduced.

Note also that NEIGH_VAR_INIT() does not need WRITE_ONCE() as it is before
parms is published.  Also, the only user hippi_neigh_setup_dev() is no
longer called since commit e3804cbebb ("net: remove COMPAT_NET_DEV_OPS"),
which looks wrong, but probably no one uses HIPPI and RoadRunner.
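
An illustrative shape of the annotated accessors, assuming the macros
keep their current layout in include/net/neighbour.h (the actual patch
may differ in detail):

#define NEIGH_VAR(p, attr)          READ_ONCE((p)->data[NEIGH_VAR_ ## attr])
#define NEIGH_VAR_SET(p, attr, val) WRITE_ONCE((p)->data[NEIGH_VAR_ ## attr], val)
/* '&' cannot be applied to a READ_ONCE() expression, hence a pointer variant */
#define NEIGH_VAR_PTR(p, attr)      (&(p)->data[NEIGH_VAR_ ## attr])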

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251022054004.2514876-3-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 17:57:20 -07:00
Kuniyuki Iwashima
06d6322280 neighbour: Use RCU list helpers for neigh_parms.list writers.
We will convert RTM_GETNEIGHTBL to RCU soon, where we traverse
tbl->parms_list under RCU in neightbl_dump_info().

Let's use RCU list helper for neigh_parms in neigh_parms_alloc()
and neigh_parms_release().

neigh_table_init() uses the plain list_add() for the default
neigh_parm that is embedded in the table and not yet published.

Note that neigh_parms_release() already uses call_rcu() to free
neigh_parms.
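
A small sketch of the writer-side change, assuming the existing
tbl->lock is still held around the list updates:

        /* neigh_parms_alloc(): publish the new parms for RCU readers */
        list_add_rcu(&p->list, &tbl->parms_list);

        /* neigh_parms_release(): unlink; parms are already freed via call_rcu() */
        list_del_rcu(&parms->list);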

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251022054004.2514876-2-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-24 17:57:20 -07:00
Przemek Kitszel
1390b8b3d2 ice: remove duplicate call to ice_deinit_hw() on error paths
Current unwinding code on error paths of ice_devlink_reinit_up() and
ice_probe() have manual call to ice_deinit_hw() (which is good, as there
is also manual call to ice_hw_init() there), which is then duplicated
(and was prior current series) in ice_deinit_dev().

Fix the above by removing ice_deinit_hw() from ice_deinit_dev().
Add a (now missing) call in ice_remove().

Reported-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://lore.kernel.org/intel-wired-lan/20250717-jk-ddp-safe-mode-issue-v1-1-e113b2baed79@intel.com/
Fixes: 4d3f59bfa2 ("ice: split ice_init_hw() out from ice_init_dev()")
Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-24 13:27:20 -07:00
Przemek Kitszel
8a37f9e2ff ice: move ice_deinit_dev() to the end of deinit paths
ice_deinit_dev() takes care of turning off adminq processing, which is
much needed during driver teardown (remove, reset, error path). Move it
to the very end where applicable.
For example, ice_deinit_hw() called after adminq deinit slows rmmod on
my two-card setup by about 60 seconds.

ice_init_dev() and ice_deinit_dev() scopes were reduced by previous
commits of the series, with a final touch of extracting ice_init_dev_hw()
out now (there is no deinit counterpart).

Note that the ice_service_task_stop() call removed from ice_remove() is now
placed in ice_deinit_dev() (stopping twice makes no sense).

Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-24 13:27:20 -07:00
Przemek Kitszel
c2fb9398f7 ice: extract ice_init_dev() from ice_init()
Extract ice_init_dev() from ice_init(), to allow service task and IRQ
scheme teardown to be put after clearing SW constructs in the subsequent
commit.

Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-24 13:27:20 -07:00
Przemek Kitszel
ef825bdb46 ice: move ice_init_pf() out of ice_init_dev()
Move ice_init_pf() out of ice_init_dev().
Do the same for the deinit counterpart.

Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-24 13:27:20 -07:00
Przemek Kitszel
e3bf1cdde7 ice: move udp_tunnel_nic and misc IRQ setup into ice_init_pf()
Move the udp_tunnel_nic setup and the ice_req_irq_msix_misc() call into
ice_init_pf(), removing some redundancy in the former while moving.

Move ice_free_irq_msix_misc() call into ice_deinit_pf(), to mimic
the above in terms of needed cleanup. Guard it via emptiness check,
to keep the allowance of half-initialized pf being cleaned up.

Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-24 13:27:20 -07:00
Przemek Kitszel
71430451f8 ice: ice_init_pf: destroy mutexes and xarrays on memory alloc failure
Unroll actions of ice_init_pf() when it fails.
ice_deinit_pf() happens to be perfect to call here.

Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-24 13:24:00 -07:00
Przemek Kitszel
2fe18288fc ice: move ice_init_interrupt_scheme() prior ice_init_pf()
Move ice_init_interrupt_scheme() prior to ice_init_pf().
To enable the move, ice_set_pf_caps() was moved out of ice_init_pf()
to the caller (ice_init_dev()), and placed prior to the IRQ scheme init.

The move makes deinit order of ice_deinit_dev() and failure-path of
ice_init_pf() match (at least in terms of not calling
ice_clear_interrupt_scheme() and ice_deinit_pf() in opposite ways).

The new order aligns with findings made by Jakub Buchocki in
the commit 24b454bc35 ("ice: Fix ice module unload").

Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-24 11:10:45 -07:00
Przemek Kitszel
806c4f32a8 ice: move service task start out of ice_init_pf()
Move the service task start out of ice_init_pf(). Do the analogous with deinit.
The service task is needed up to the very end of driver removal; a later commit
of the series will move it later on the execution timeline.

Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-24 11:10:44 -07:00
Przemek Kitszel
c35c178fcd ice: enforce RTNL assumption of queue NAPI manipulation
Instead of making assumptions in comments, move them into code.
Also be more precise: RTNL must be held only when there is
NAPI, and we have VSIs without NAPI that call ice_vsi_clear_napi_queues()
during rmmod.
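
A hedged sketch of what "move the assumption into code" can look like;
the exact condition used by the patch is not shown in this log:

        /* NAPI (un)registration needs RTNL; VSIs without a netdev/NAPI don't */
        if (vsi->netdev)
                ASSERT_RTNL();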

Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2025-10-24 11:10:44 -07:00
Sarika Sharma
197498315d wifi: ath12k: Assert base_lock is held before allocating REO update element
Add a lockdep assertion to verify that ab->base_lock is held prior to
allocating a REO update element in ath12k_dp_prepare_reo_update_elem().
This helps detect potential concurrency issues during development and
improves code robustness.
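
The assertion itself is a one-liner; a sketch of its placement in
ath12k_dp_prepare_reo_update_elem() (context simplified):

        /* callers must already hold ab->base_lock around the REO update list */
        lockdep_assert_held(&ab->base_lock);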
Compile tested only.

Signed-off-by: Sarika Sharma <sarika.sharma@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251021112204.323242-1-sarika.sharma@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-24 07:45:02 -07:00
Baochen Qiang
a41281f651 wifi: ath12k: restore register window after global reset
The hardware target implements an address space larger than the PCI BAR can
map. In order to be able to access the whole target address space, the
BAR space is split into 4 segments, of which the last 3, called windows,
can be dynamically mapped to the desired area. This is achieved by
updating the WINDOW_REG_ADDRESS register with the appropriate window value.
Currently, each time a register beyond WINDOW_START is accessed, the host
calculates the window value and caches it after the window update; this
way, the next time a register falling in the same window is accessed, the
host knows that the window is already correct and no additional update
is needed.

However, this mechanism breaks after a global reset is triggered in
ath12k_pci_soc_global_reset(), because with a global reset the hardware
resets the WINDOW_REG_ADDRESS register, hence the window is no longer
properly mapped. Currently the host does nothing about this, and as a
result a subsequent register access may not work as expected if it falls
in the same window as before.

Although no obvious issue is seen now, it is better to fix this to avoid
future problems. The fix is done by restoring the window register after
global reset.

Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284.1-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3

Fixes: d889913205 ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices")
Signed-off-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251017-ath12k-reset-window-cache-v1-1-29e0e751deed@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-24 07:45:02 -07:00
Wei Zhang
f7746cfcdb wifi: ath12k: add support for BSS color change
Add support for handling BSS color collision events reported by firmware.

There are two scenarios where a BSS color collision may be detected:
1. The AP's MAC detects the collision directly, and firmware reports a
   BSS color collision event to the host.
2. A STA associated with the AP detects the collision. The notification
   frame from the peer is routed directly to the AP firmware, which handles
   it and sends the BSS color collision event to the host.

Add logic to parse and handle such events, and pass the data
up to mac80211.

Unlike CSA, firmware does not provide an offload mechanism for BSS color
change. Instead, the color change process is triggered via beacon offload
TX completion events sent by firmware.

The BSS color feature is enabled depending on a service flag advertised by
firmware, based on which the color change functionality is invoked.

This change builds upon the following ath11k patch.
commit 886433a984 ("ath11k: add support for BSS color change")

Tested-on: WCN7850 hw2.0 PCI WLAN.IOE_HMT.1.1-00011-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1

Signed-off-by: Wei Zhang <wei.zhang@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251017060100.1751692-1-wei.zhang@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-24 07:45:02 -07:00
Jakub Kicinski
f0a24b2547 Merge branch 'net-dsa-lantiq_gswip-use-regmap-for-register-access'
Daniel Golle says:

====================
net: dsa: lantiq_gswip: use regmap for register access

This series refactors the lantiq_gswip driver to utilize the regmap API
for register access, replacing the previous approach of open-coding
register operations.

Using regmap paves the way for supporting different busses to access the
switch registers, for example it makes it easier to use an MDIO-based
method required to access the registers of the MaxLinear GSW1xx series
of dedicated switch ICs.

Apart from that, the use of regmap improves readability and
maintainability of the driver by standardizing register access.

Wherever possible, changes were made using Coccinelle semantic patches,
sometimes adjusting whitespace and adding line breaks when needed.
The remaining changes, which were not done using semantic patches, are
small and should be easy to review and verify.

The whole series has been
Tested-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
====================

Link: https://patch.msgid.link/cover.1761045000.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 18:53:08 -07:00
Daniel Golle
b0911b9e01 net: dsa: lantiq_gswip: harmonize gswip_mii_mask_*() parameters
The 'clear' parameter of gswip_mii_mask_cfg() and gswip_mii_mask_pcdu()
is inconsistent with the semantics of regmap_write_bits(), which also
applies the mask to the value to be written.
Change the mask/set semantics of gswip_mii_mask_cfg() and
gswip_mii_mask_pcdu() to follow the regmap_write_bits() pattern.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Acked-by: Hauke Mehrtens <hauke@hauke-m.de>
Link: https://patch.msgid.link/218854236c97a152af071852bda83d02ff2dd918.1761045000.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 18:53:06 -07:00
Daniel Golle
1d88358303 net: dsa: lantiq_gswip: optimize regmap_write_bits() statements
Further optimize the previous naive conversion of the *_mask() accessor
functions to regmap_write_bits by manually removing redundant mask
operands.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Acked-by: Hauke Mehrtens <hauke@hauke-m.de>
Link: https://patch.msgid.link/fce2f964b22fe3efc234c664b1e50de28dddf512.1761045000.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 18:53:06 -07:00
Daniel Golle
748b0aebd4 net: dsa: lantiq_gswip: replace *_mask() functions with regmap API
Use coccinelle to replace all uses of *_mask() with an equivalent call
to regmap_write_bits().

// Replace gswip_switch_mask with regmap_write_bits
@@
expression priv, clear, set, offset;
@@
- gswip_switch_mask(priv, clear, set, offset)
+ regmap_write_bits(priv->gswip, offset, clear | set, set)

// Replace gswip_mdio_mask with regmap_write_bits
@@
expression priv, clear, set, offset;
@@
- gswip_mdio_mask(priv, clear, set, offset)
+ regmap_write_bits(priv->mdio, offset, clear | set, set)

// Replace gswip_mii_mask with regmap_write_bits
@@
expression priv, clear, set, offset;
@@
- gswip_mii_mask(priv, clear, set, offset)
+ regmap_write_bits(priv->mii, offset, clear | set, set)

Remove the now-unused *_mask() functions.
This naive approach will be further optimized manually in the next commit.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Acked-by: Hauke Mehrtens <hauke@hauke-m.de>
Link: https://patch.msgid.link/258d931386a512b7089924c70073ca7acba71168.1761045000.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 18:53:06 -07:00
Daniel Golle
4cc06901ef net: dsa: lantiq_gswip: manually convert remaining uses of read accessors
Manually convert the remaining uses of the read accessor functions and
remove them now that they are unused.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Acked-by: Hauke Mehrtens <hauke@hauke-m.de>
Link: https://patch.msgid.link/0e2a44b83131b40fc1ee558ed1f536c26e1232ba.1761045000.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 18:53:06 -07:00
Daniel Golle
128f5cf40f net: dsa: lantiq_gswip: convert trivial accessor uses to regmap
Use coccinelle semantic patch to convert all trivial uses of the register
accessor functions to use the regmap API directly.

// Replace gswip_switch_w with regmap_write
@@
expression priv, val, offset;
@@
- gswip_switch_w(priv, val, offset)
+ regmap_write(priv->gswip, offset, val)

// Replace gswip_mdio_w with regmap_write
@@
expression priv, val, offset;
@@
- gswip_mdio_w(priv, val, offset)
+ regmap_write(priv->mdio, offset, val)

// Replace gswip_switch_r in simple assignment - only for u32
@@
expression priv, offset;
u32 var;
@@
- var = gswip_switch_r(priv, offset)
+ regmap_read(priv->gswip, offset, &var)

// Replace gswip_switch_mask with regmap_set_bits when clear is 0
@@
expression priv, set, offset;
@@
- gswip_switch_mask(priv, 0, set, offset)
+ regmap_set_bits(priv->gswip, offset, set)

// Replace gswip_mdio_mask with regmap_set_bits when clear is 0
@@
expression priv, set, offset;
@@
- gswip_mdio_mask(priv, 0, set, offset)
+ regmap_set_bits(priv->mdio, offset, set)

// Replace gswip_switch_mask with regmap_clear_bits when set is 0
@@
expression priv, clear, offset;
@@
- gswip_switch_mask(priv, clear, 0, offset)
+ regmap_clear_bits(priv->gswip, offset, clear)

// Replace gswip_mdio_mask with regmap_clear_bits when set is 0
@@
expression priv, clear, offset;
@@
- gswip_mdio_mask(priv, clear, 0, offset)
+ regmap_clear_bits(priv->mdio, offset, clear)

Remove gswip_switch_w() and gswip_mdio_w() functions as they now no
longer have any users.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Acked-by: Hauke Mehrtens <hauke@hauke-m.de>
Link: https://patch.msgid.link/48a60f386b1bd487c410b1f5fb25ba50ceddc6f7.1761045000.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 18:53:06 -07:00
Daniel Golle
7053597973 net: dsa: lantiq_gswip: convert accessors to use regmap
Use regmap for register access in preparation for supporting the MaxLinear
GSW1xx family of switches connected via MDIO or SPI.
Rewrite the existing accessor read-poll-timeout functions to use calls to
the regmap API for now.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Acked-by: Hauke Mehrtens <hauke@hauke-m.de>
Link: https://patch.msgid.link/535d968bc6319a74bdf76166ef19364ee659285f.1761045000.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 18:53:05 -07:00
Daniel Golle
41b66240e9 net: dsa: lantiq_gswip: clarify GSWIP 2.2 VLAN mode in comment
The comment above writing the default PVID incorrectly states that
"GSWIP 2.2 (GRX300) and later program here the VID directly."
The truth is that even GSWIP 2.2 and newer maintain the behavior of
GSWIP 2.1 unless the VLANMD bit in PCE Global Control Register 1 is
set ("GSWIP2.2 VLAN Mode").
Fix the misleading comment accordingly.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Acked-by: Hauke Mehrtens <hauke@hauke-m.de>
Link: https://patch.msgid.link/018056a575503d9797f3222f71a988e825316be0.1761045000.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 18:53:05 -07:00
Eric Biggers
05774d7e42 tcp: Remove unnecessary null check in tcp_inbound_md5_hash()
The 'if (!key && hash_location)' check in tcp_inbound_md5_hash() implies
that hash_location might be null.  However, later code in the function
dereferences hash_location anyway, without checking for null first.
Fortunately, there is no real bug, since tcp_inbound_md5_hash() is
called only with non-null values of hash_location.

Therefore, remove the unnecessary and misleading null check of
hash_location.  This silences a Smatch static checker warning
(https://lore.kernel.org/netdev/aPi4b6aWBbBR52P1@stanley.mountain/)

Also fix the related comment at the beginning of the function.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com>
Link: https://patch.msgid.link/20251022221209.19716-1-ebiggers@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 18:49:55 -07:00
Bagas Sanjaya
9ff8609265 net: rmnet: Use section heading markup for packet format subsections
Format subsections of "Packet format" section as reST subsections.

Link: https://lore.kernel.org/linux-doc/aO_MefPIlQQrCU3j@horms.kernel.org/
Suggested-by: Simon Horman <horms@kernel.org>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251022025456.19004-2-bagasdotme@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 17:28:44 -07:00
Sunday Adelodun
ec538867a3 net: unix: remove outdated BSD behavior comment in unix_release_sock()
Remove the long-standing comment in unix_release_sock() that described a
behavioral difference between Linux and BSD regarding when ECONNRESET is
sent to connected UNIX sockets upon closure.

As confirmed by testing on macOS (similar to BSD behavior), ECONNRESET
is only observed for SOCK_DGRAM sockets, not for SOCK_STREAM. Meanwhile,
Linux already returns ECONNRESET in cases where a socket is closed with
unread data or is not yet accept()ed. This means the previous comment no
longer accurately describes current behavior and is misleading.

Suggested-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Sunday Adelodun <adelodunolaoluwa@yahoo.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251021195906.20389-1-adelodunolaoluwa@yahoo.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 17:26:16 -07:00
Lorenzo Bianconi
99ad2b6815 net: airoha: Remove code duplication in airoha_regs.h
This patch does not introduce any logical change; it just removes
duplicated code in airoha_regs.h and fixes naming conventions there.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251022-airoha-regs-cosmetics-v2-1-e0425b3f2c2c@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 17:17:57 -07:00
Jakub Kicinski
2b7553db91 Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.18-rc3).

No conflicts or adjacent changes.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-23 10:53:08 -07:00
Jeff Johnson
50cb7ccab8 wifi: ath11k: Correctly use "ab" macro parameter
The checkpatch script is reporting multiple instances of:
	Argument 'x' is not used in function-like macro

Fix these by renaming the argument to match the usage. In the process,
also add parenthesis to the usage to avoid MACRO_ARG_PRECEDENCE
issues.

Compile tested only.

Link: https://patch.msgid.link/20251022-ath11k-bad-macro-arg-v1-1-93a8eadb6191@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-23 10:34:06 -07:00
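As a rough illustration of what checkpatch is asking for here (the struct and
macros below are made up, not the actual ath11k definitions), the fix is to
make the macro body use its named parameter and parenthesize it:

	/* Hypothetical example, not the real ath11k macros. */
	struct demo_ab {
		unsigned long window_base;
		unsigned long window_offset;
	};

	/* Bad: the parameter 'ab' is never used, so checkpatch reports
	 * "Argument 'ab' is not used in function-like macro". */
	#define DEMO_WINDOW_START_BAD(ab)	(0x80000)

	/* Good: the body uses the parameter, wrapped in parentheses to
	 * avoid MACRO_ARG_PRECEDENCE issues with complex arguments. */
	#define DEMO_WINDOW_START(ab)	(((ab)->window_base) + ((ab)->window_offset))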
Aditya Kumar Singh
998c68e96c wifi: ath11k: advertise NL80211_FEATURE_TX_POWER_INSERTION
Now that the driver is capable of inserting Tx power, advertise this
support to the upper layers.

Tested-on: QCN9074 hw1.0 PCI WLAN.HK.2.9.0.1-02146-QCAHKSWPL_SILICONZ-1

Signed-off-by: Aditya Kumar Singh <aditya.kumar.singh@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251017-add_tx_power_insertion_support-v1-4-f08feacfca93@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-23 06:59:21 -07:00
Aditya Kumar Singh
722015690f wifi: ath11k: add support for Tx Power insertion in RRM action frame
For certain action frames, such as the TPC Report IE in the spectrum
management TPC Report action frame and in the Radio Measurement Link
Measurement Report action frame, there is a requirement to fill in the
device's current and max Tx power in the packet.

Add support to populate these fields in the relevant packets.

In software-encrypted cases such as PMF, skip insertion since the packets
are already encrypted and cannot be modified.

Tested-on: QCN9074 hw1.0 PCI WLAN.HK.2.9.0.1-02146-QCAHKSWPL_SILICONZ-1

Signed-off-by: Aditya Kumar Singh <aditya.kumar.singh@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251017-add_tx_power_insertion_support-v1-3-f08feacfca93@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-23 06:59:21 -07:00
Aditya Kumar Singh
c243d5e44f wifi: ath11k: wrap ath11k_mac_op_get_txpower() with lock-aware internal helper
Refactor ath11k_mac_op_get_txpower() by introducing a new internal function
ath11k_mac_handle_get_txpower(), which assumes the caller holds the
appropriate lock. This prepares the codebase for future change where the
internal function may be invoked directly with the lock already acquired,
improving modularity and lock handling consistency.

No functional change intended.

Tested-on: QCN9074 hw1.0 PCI WLAN.HK.2.9.0.1-02146-QCAHKSWPL_SILICONZ-1

Signed-off-by: Aditya Kumar Singh <aditya.kumar.singh@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251017-add_tx_power_insertion_support-v1-2-f08feacfca93@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-23 06:59:20 -07:00
Aditya Kumar Singh
6688728223 wifi: ath11k: relocate some Tx power related functions in mac.c
A forthcoming change necessitates that these functions be defined prior to
their usage. Therefore, relocate them now as a preparatory step for the
upcoming modifications.

Compile tested only.

Signed-off-by: Aditya Kumar Singh <aditya.kumar.singh@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251017-add_tx_power_insertion_support-v1-1-f08feacfca93@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-23 06:59:20 -07:00
Baochen Qiang
4a013ca2d4 wifi: ath11k: fix peer HE MCS assignment
In ath11k_wmi_send_peer_assoc_cmd(), the peer's transmit MCS is sent to
firmware as the receive MCS while the peer's receive MCS is sent as the
transmit MCS, which goes against the firmware's definition.

When connecting to a misbehaving AP that advertises 0xffff (meaning not
supported) for the 160 MHz transmit MCS map, the firmware crashes because
0xffff is assigned to the he_mcs->rx_mcs_set field.

	Ext Tag: HE Capabilities
	    [...]
	    Supported HE-MCS and NSS Set
		[...]
	        Rx and Tx MCS Maps 160 MHz
		    [...]
	            Tx HE-MCS Map 160 MHz: 0xffff

Swap the assignment to fix this issue.

As the HE rate control mask is meant to limit our own transmit MCS, it
needs to go via the he_mcs->rx_mcs_set field. With the aforementioned swap
done, a corresponding change is needed to apply it to the peer's receive MCS.

Tested-on: WCN6855 hw2.1 PCI WLAN.HSP.1.1-03125-QCAHSPSWPL_V1_V2_SILICONZ_LITE-3.6510.41
Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1

Fixes: 61fe43e721 ("ath11k: add support for setting fixed HE rate/gi/ltf")
Signed-off-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251017-ath11k-mcs-assignment-v1-2-da40825c1783@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-23 06:59:20 -07:00
Baochen Qiang
47d0cd6bcc wifi: ath11k: fix VHT MCS assignment
While associating, the firmware needs to know the peer's receive
capability to calculate its own VHT transmit MCS. Currently the host sends
this information to firmware via the mcs->rx_mcs_set field, which is wrong
because the firmware actually takes it from the mcs->tx_mcs_set field. No
failure has been seen due to this so far, most likely because almost all
peers advertise the same capability for both transmit and receive. Swap
the assignment to fix it.

Besides, the rate control mask is meant to limit our own transmit MCS and
hence needs to go via the mcs->tx_mcs_set field. With the aforementioned
swap done, a corresponding change is needed to apply it to the peer's
receive capability rather than the transmit capability.

Tested-on: WCN6855 hw2.1 PCI WLAN.HSP.1.1-03125-QCAHSPSWPL_V1_V2_SILICONZ_LITE-3.6510.41
Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1

Fixes: d5c65159f2 ("ath11k: driver for Qualcomm IEEE 802.11ax devices")
Signed-off-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251017-ath11k-mcs-assignment-v1-1-da40825c1783@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-23 06:59:20 -07:00
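A minimal sketch of the crossed assignment described above, assuming made-up
struct and variable names (the real code builds the WMI peer-assoc command in
ath11k):

	#include <stdint.h>

	/* Illustrative only; not the actual WMI structure layout. */
	struct demo_vht_mcs {
		uint32_t rx_mcs_set;
		uint32_t tx_mcs_set;
	};

	static void demo_fill_peer_vht_mcs(struct demo_vht_mcs *mcs,
					   uint32_t peer_rx_map,
					   uint32_t peer_tx_map)
	{
		/* Per the commit message, firmware reads the peer's receive
		 * capability from tx_mcs_set (and vice versa), so the host
		 * has to cross the assignments rather than mapping rx->rx
		 * and tx->tx. */
		mcs->tx_mcs_set = peer_rx_map;
		mcs->rx_mcs_set = peer_tx_map;
	}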
Horatiu Vultur
61b7ade9ba net: phy: micrel: Add support for non PTP SKUs for lan8814
The lan8814 has 4 different SKUs, and for 2 of them PTP is disabled. All
these SKUs have the same value in registers 2 and 3, meaning they cannot
be differentiated based on device ID. Therefore, check the SKU register
and, based on it, decide whether or not to create a PTP device.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251021070726.3690685-1-horatiu.vultur@microchip.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-23 13:11:55 +02:00
Markus Elfring
e0665df8c5 net: ti: icssg-prueth: Omit a variable reassignment in prueth_netdev_init()
An error code was assigned to a variable and checked accordingly.
This value was passed to a dev_err_probe() call in an if branch.
That function is documented to return the same value it is passed.
Thus delete two redundant variable reassignments.

The source code was transformed by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Link: https://patch.msgid.link/71f7daa3-d4f4-4753-aae8-67040fc8297d@web.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-23 11:17:28 +02:00
Pei Xiao
d550d63d00 eth: fbnic: fix integer overflow warning in TLV_MAX_DATA definition
The TLV_MAX_DATA macro calculates (PAGE_SIZE - 512) which can exceed
the maximum value of a 16-bit unsigned integer on architectures with
large page sizes, causing compiler warnings:

drivers/net/ethernet/meta/fbnic/fbnic_tlv.h:83:24: warning: conversion
from 'long unsigned int' to 'short unsigned int' changes value from
'261632' to '65024' [-Woverflow]

Fix this by explicitly masking the result to 16 bits using bitwise AND
with 0xFFFF, ensuring the value fits within the expected data type
while maintaining the intended behavior for normal page sizes.

This preserves the existing functionality while eliminating the
compiler warning and potential undefined behavior from integer
truncation.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202510190832.3SQkTCHe-lkp@intel.com/
Signed-off-by: Pei Xiao <xiaopei01@kylinos.cn>
Link: https://patch.msgid.link/182b9d0235d044d69d7a57c1296cc6f46e395beb.1761039651.git.xiaopei01@kylinos.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-22 19:21:35 -07:00
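A self-contained reproduction of the pattern, using the 256K page size from
the warning above (the macro and program names are placeholders; the real
definition lives in fbnic_tlv.h):

	#include <stdio.h>

	#define DEMO_PAGE_SIZE	(256 * 1024UL)	/* page size behind the warning */

	/* Before: 261632 does not fit in a u16, so assigning it to a 16-bit
	 * field truncates to 65024 and triggers -Woverflow. */
	#define TLV_MAX_DATA_BAD	(DEMO_PAGE_SIZE - 512)

	/* After: mask to 16 bits explicitly so the value always fits the
	 * 16-bit consumer and the truncation is intentional. */
	#define TLV_MAX_DATA		((DEMO_PAGE_SIZE - 512) & 0xFFFF)

	int main(void)
	{
		printf("%lu -> %lu\n", TLV_MAX_DATA_BAD, TLV_MAX_DATA);
		return 0;
	}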
Yue Haibing
114573962a net/sched: Remove unused inline helper qdisc_from_priv()
Since commit fb38306ceb ("net/sched: Retire ATM qdisc"), this is
not used and can be removed.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://patch.msgid.link/20251021114626.3148894-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-22 19:20:48 -07:00
Gustavo A. R. Silva
10e0378f05 net: spacemit: Avoid -Wflex-array-member-not-at-end warnings
-Wflex-array-member-not-at-end was introduced in GCC-14, and we are
getting ready to enable it, globally.

Use regular arrays instead of flexible-array members (they're not
really needed in this case) in a couple of unions, and fix the
following warnings:

      1 drivers/net/ethernet/spacemit/k1_emac.c:122:42: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
      1 drivers/net/ethernet/spacemit/k1_emac.c:122:32: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
      1 drivers/net/ethernet/spacemit/k1_emac.c:121:42: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]
      1 drivers/net/ethernet/spacemit/k1_emac.c:121:32: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end]

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Acked-by: Vivian Wang <wangruikang@iscas.ac.cn>
Link: https://patch.msgid.link/aPd0YjO-oP60Lgvj@kspp
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-22 19:20:29 -07:00
Russell King (Oracle)
26ab9830be net: stmmac: replace has_xxxx with core_type
Replace the has_gmac, has_gmac4 and has_xgmac ints, of which only one
can be set when matching a core to its driver backend, with an
enumerated type carrying the DWMAC core type.

Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Acked-by: Chen-Yu Tsai <wens@kernel.org>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Mohd Ayaan Anwar <mohd.anwar@oss.qualcomm.com>
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Link: https://patch.msgid.link/E1vB6ld-0000000BIPy-2Qi4@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-22 18:28:57 -07:00
Keith Busch
0ecf0e6748 io_uring/fdinfo: show SQEs for no array setup
The sq_head indicates the index directly in the submission queue when
the IORING_SETUP_NO_SQARRAY option is used, so use that instead of
skipping showing the entries.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 16:09:41 -06:00
Johannes Thumshirn
4ae8efb4f9 blktrace: handle BLKTRACESETUP2 ioctl
Handle the BLKTRACESETUP2 ioctl, requesting an extended version of the
blktrace protocol from user-space.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:06 -06:00
Johannes Thumshirn
3f6722816a blktrace: trace zone write plugging operations
Trace zone write plugging operations on block devices.

As tracing of zoned block commands needs the upper 32bit of the widened
64bit action, only add traces to blktrace if user-space has requested
version 2 of the blktrace protocol.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
1c164fcc1b blktrace: expose ZONE APPEND completions to blktrace
Expose ZONE APPEND completions as a block trace completion action to
blktrace.

As tracing of zoned block commands needs the upper 32bit of the widened
64bit action, only add traces to blktrace if user-space has requested
version 2 of the blktrace protocol.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
f9ee38bbf7 blktrace: add block trace commands for zone operations
Add block trace commands for zone operations. These commands can only be
handled with version 2 of the blktrace protocol. For version 1, warn if a
command that does not fit into the 16 bits reserved for the command in
this version is passed in.

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
4d8bc7bd4f blktrace: move ftrace blk_io_tracer to blk_io_trace2
Move ftrace's blk_io_tracer to the new blk_io_trace2 infrastructure.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
67bfa74d81 blktrace: move trace_note to blk_io_trace2
Move trace_note() to the new blk_io_trace2 infrastructure.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
915bb53860 blktrace: differentiate between blk_io_trace versions
Differentiate between blk_io_trace and blk_io_trace2 when relaying to
user-space depending on which version has been requested by the blktrace
utility.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
c44347d606 blktrace: add definitions for struct blk_io_trace2
Add definitions for the extended version of the blktrace protocol using a
wider action type to be able to record new actions in the kernel.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
113cbd6282 blktrace: pass blk_user_trace2 to setup functions
Pass struct blk_user_trace_setup2 to blktrace_setup_finalize(). This
prepares for the incoming extension of the blktrace protocol with a 64bit
act_mask.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
0d8627cc93 blktrace: add definitions for blk_user_trace_setup2
Add definitions for a version 2 of the blk_user_trace_setup ioctl. This
new ioctl will enable a different struct layout of the binary data passed
to user-space when using a new version of the blktrace utility requesting
the new struct layout.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
42da88a724 blktrace: split do_blk_trace_setup into two functions
Split do_blk_trace_setup into two functions; this is done to prepare for
an incoming new BLKTRACESETUP2 ioctl(2) which can receive extended
parameters from user-space.

Also move the size verification logic to the callers in preparation for
using a new internal structure later.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
370cd70a40 blktrace: change the internal action to 64bit
Change the internal use of the action in blktrace to 64 bits, although for
now only the lower 32 bits will be used.

With the upcoming version 2 of the blktrace user-space protocol the upper
32bit will also be utilized.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
70e3c62b89 blktrace: untangle if/else sequence in __blk_add_trace
Untangle the if/else sequence setting the trace action in
__blk_add_trace() and turn it into a switch statement for better
extensibility.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
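A generic before/after of this kind of refactor (not the actual
__blk_add_trace() code): a chain of mutually exclusive if/else checks becomes
a switch, so new trace actions can be added as extra cases:

	enum demo_op { DEMO_READ, DEMO_WRITE, DEMO_FLUSH, DEMO_ZONE_APPEND };

	static unsigned int demo_pick_action(enum demo_op op)
	{
		unsigned int what;

		switch (op) {
		case DEMO_READ:
			what = 1;
			break;
		case DEMO_WRITE:
			what = 2;
			break;
		case DEMO_FLUSH:
			what = 3;
			break;
		case DEMO_ZONE_APPEND:	/* new cases slot in here */
			what = 4;
			break;
		default:
			what = 0;
			break;
		}
		return what;
	}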
Johannes Thumshirn
04678e72e9 blktrace: split out relaying a blktrace event
Split out the code relaying a blktrace event to user-space using relayfs.

This enables adding a second version supporting a new version of the
protocol.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
472eca5383 blktrace: factor out recording a blktrace event
Factor out the recording of a blktrace event into its own function,
deduplicating the code.

This also enables recording different versions of the blktrace protocol
later on.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Johannes Thumshirn
a65988a0ad blktrace: only calculate trace length once
De-duplicate the calculation of the trace length instead of doing the
calculation twice, once for calling trace_buffer_lock_reserve() and once
for calling relay_reserve().

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:14:05 -06:00
Pavel Begunkov
dde92a5026 io_uring: check for user passing 0 nr_submit
io_submit_sqes() shouldn't be stepping into its main loop when there is
nothing to submit, i.e. nr=0. Fix the zero submission-queue-entries check,
which should come after all user input truncations.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 11:12:54 -06:00
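A sketch of the ordering issue, with invented names (the real logic is in
io_submit_sqes()): the zero check must come after the user-supplied count has
been clamped, otherwise a request that truncates to zero still enters the
loop:

	static unsigned int demo_submit(unsigned int to_submit,
					unsigned int sqes_available)
	{
		unsigned int nr = to_submit;

		if (nr > sqes_available)	/* clamp user input first */
			nr = sqes_available;

		if (!nr)			/* only then check for zero */
			return 0;

		/* ... main submission loop runs here ... */
		return nr;
	}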
Loic Poulain
54be197109 wifi: ath10k: Support for FTM TLV test commands
Existing tools like myftm use 'legacy' test command API. Similarly
to ath11k and ath12k, we want to support raw TLV payload submitted
from the test tool. This requires segmenting the TLV payload and
encapsulating it within a WMI command. The opposite operation needs
to be done upon corresponding event receiving.

Tested-on: WCN3990 hw1.0 WLAN.HL.3.3.7.c2-00931-QCAHLSWMTPLZ-1

Signed-off-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
Link: https://patch.msgid.link/20251020153759.407516-1-loic.poulain@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-22 08:30:52 -07:00
Keith Busch
5c5028ee59 block: rename min_segment_size
Despite its name, the block layer is fine with segments smaller than the
"min_segment_size" limit. The value is an optimization limit indicating
the largest segment that can be used without considering boundary
limits. Smaller segments can take a fast path, so give it a name that
reflects that: max_fast_segment_size.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 07:39:39 -06:00
Keith Busch
1cba30bf9f io_uring: add support for IORING_SETUP_SQE_MIXED
Normal rings support 64b SQEs for posting submissions, while certain
features require the ring to be configured with IORING_SETUP_SQE128, as
they need to convey more information per submission. This, in turn,
makes ALL the SQEs be 128b in size. This is somewhat wasteful and
inefficient, particularly when only certain SQEs need to be of the
bigger variant.

This adds support for setting up a ring with mixed SQE sizes, using
IORING_SETUP_SQE_MIXED. When set up in this mode, SQEs posted to the ring
may be either 64b or 128b in size. If an SQE is 128b in size, then the
opcode will be set to a variant to indicate that this is the case. Any
other non-128b opcode will assume the SQ's default size.

SQEs on these types of mixed rings may also utilize NOP with skip
success set.  This can happen if the ring is one (small) SQE entry away
from wrapping, and an attempt is made to get a 128b SQE. As SQEs must be
contiguous in the SQ ring, a 128b SQE cannot wrap the ring. For this
case, a single NOP SQE should be inserted with the SKIP_SUCCESS flag
set. The kernel will process this as a normal NOP without posting a
CQE.

Signed-off-by: Keith Busch <kbusch@kernel.org>
[axboe: {} style fix and assign sqe before opcode read]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-22 07:34:57 -06:00
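A rough sketch of the wrap-handling rule from the last paragraph, written
against invented helpers rather than the real io_uring/liburing API: if fewer
than two 64b slots remain before the ring wraps, pad with a small NOP whose
completion is skipped, then post the 128b SQE at the start of the ring:

	struct demo_sq {
		unsigned int tail;	/* producer index, in 64b slots */
		unsigned int entries;	/* ring size in 64b slots, power of two */
	};

	static unsigned int demo_slots_before_wrap(const struct demo_sq *sq)
	{
		return sq->entries - (sq->tail & (sq->entries - 1));
	}

	static void demo_queue_128b_sqe(struct demo_sq *sq)
	{
		if (demo_slots_before_wrap(sq) < 2) {
			/* demo_emit_nop_skip_cqe(sq) would post a 64b NOP with
			 * the "skip success CQE" flag set; hypothetical helper. */
			sq->tail += 1;
		}
		sq->tail += 2;	/* the 128b SQE occupies two contiguous slots */
	}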
Jiapeng Chong
962ac5ca99 net: macb: Remove duplicate linux/inetdevice.h header
./drivers/net/ethernet/cadence/macb_main.c: linux/inetdevice.h is included more than once.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=26474
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Acked-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Link: https://patch.msgid.link/20251020014441.2070356-1-jiapeng.chong@linux.alibaba.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:40:04 -07:00
Vadim Fedorenko
91f76771db bnxt_en: support PPS in/out on all pins
Add supported_extts_flags and supported_perout_flags configuration to make
the driver complaint with the latest API.

Initialize channel information to 0 to avoid confusing users, because HW
doesn't actually care about channels.

Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Link: https://patch.msgid.link/20251019225720.898550-1-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:39:47 -07:00
Heiner Kallweit
4a107a0e83 net: stmmac: mdio: use phy_find_first to simplify stmmac_mdio_register
Simplify the code by using phy_find_first().

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20ca4962-9588-40b8-b021-fb349a92e9e5@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:35:51 -07:00
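Roughly, the simplification amounts to replacing a hand-rolled scan of the
MDIO bus with the existing helper (the function below is illustrative, not
the stmmac code):

	#include <linux/errno.h>
	#include <linux/phy.h>

	static int demo_attach_first_phy(struct mii_bus *bus)
	{
		/* phy_find_first() returns the first PHY device found on the
		 * bus, or NULL if none was probed. */
		struct phy_device *phydev = phy_find_first(bus);

		if (!phydev)
			return -ENODEV;

		/* ... connect phydev to the MAC as before ... */
		return 0;
	}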
Shi Hao
1471a274b7 eth: 3c515: replace cleanup_module with __exit
Update the old legacy cleanup_module in this file to an __exit
module function, as per kernel coding practices, and restore the
#ifdef MODULE condition to allow successful compilation as a
built-in driver.

The file still used an old-style cleanup_module which could be
updated to an __exit module function; while its init_module is
already the newer style, the cleanup path was still using the
older form of exit.

To set a proper exit function, replace cleanup_module with
__exit corkscrew_exit_module for consistency with the rest of the
kernel code.

Signed-off-by: Shi Hao <i.shihao.999@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251018052541.124365-1-i.shihao.999@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:28:56 -07:00
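The general pattern the driver is moved to looks roughly like this (symbol
names are placeholders, not the actual 3c515 functions); marking the teardown
with __exit lets the kernel discard it when the driver is built in:

	#include <linux/init.h>
	#include <linux/module.h>

	static int __init demo_corkscrew_init(void)
	{
		/* probe/register the device here */
		return 0;
	}

	static void __exit demo_corkscrew_exit(void)
	{
		/* undo whatever the init function set up */
	}

	module_init(demo_corkscrew_init);
	module_exit(demo_corkscrew_exit);
	MODULE_LICENSE("GPL");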
Jakub Kicinski
9549c74e01 Merge branch 'net-dsa-yt921x-add-support-for-motorcomm-yt921x'
David Yang says:

====================
net: dsa: yt921x: Add support for Motorcomm YT921x

Motorcomm YT921x is a series of ethernet switches developed by Shanghai
Motorcomm Electronic Technology, including:

  - YT9215S / YT9215RB / YT9215SC: 5 GbE phys
  - YT9213NB / YT9214NB: 2 GbE phys
  - YT9218N / YT9218MB: 8 GbE phys

and up to 2 serdes interfaces.

This patch adds basic support for a working DSA switch.
====================

Link: https://patch.msgid.link/20251017060859.326450-1-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:25:35 -07:00
David Yang
0c5480ac96 MAINTAINERS: add entry for Motorcomm YT921x ethernet switch driver
Add a MAINTAINERS entry for the Motorcomm YT921x ethernet switch driver
and its DT binding.

Signed-off-by: David Yang <mmyangfl@gmail.com>
Link: https://patch.msgid.link/20251017060859.326450-5-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:25:30 -07:00
David Yang
186623f4aa net: dsa: yt921x: Add support for Motorcomm YT921x
Motorcomm YT921x is a series of ethernet switches developed by Shanghai
Motorcomm Electronic Technology, including:

  - YT9215S / YT9215RB / YT9215SC: 5 GbE PHYs
  - YT9213NB / YT9214NB: 2 GbE PHYs
  - YT9218N / YT9218MB: 8 GbE PHYs

and up to 2 GMACs.

Driver verified on a stock wireless router with IPQ5018 + YT9215S.

Signed-off-by: David Yang <mmyangfl@gmail.com>
Link: https://patch.msgid.link/20251017060859.326450-4-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:25:30 -07:00
David Yang
ca4709843b net: dsa: tag_yt921x: add support for Motorcomm YT921x tags
Add support for Motorcomm YT921x tags, which include a proper
configurable ethertype field (defaulting to 0x9988).

Signed-off-by: David Yang <mmyangfl@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251017060859.326450-3-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:25:30 -07:00
David Yang
a9dff2b5f7 dt-bindings: net: dsa: yt921x: Add Motorcomm YT921x switch support
The Motorcomm YT921x series is a family of Ethernet switches with up to
8 internal GbE PHYs and up to 2 GMACs.

Signed-off-by: David Yang <mmyangfl@gmail.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://patch.msgid.link/20251017060859.326450-2-mmyangfl@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:25:30 -07:00
Jakub Kicinski
21e45ec8a5 Merge branch 'net-common-feature-compute-for-upper-interface'
Hangbin Liu says:

====================
net: common feature compute for upper interface

Some high-level virtual drivers need to compute features from their
lower devices, but each currently has its own implementation and may
miss some feature computations. This patch set introduces a common function
to compute features for such devices.

Currently, bonding, team, and bridge have been updated to use the new
helper.
====================

Link: https://patch.msgid.link/20251017034155.61990-1-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:08:25 -07:00
Hangbin Liu
0152747a52 net: bridge: use common function to compute the features
Previously, the bridge ignored all feature propagation and DST retention,
handling only the GSO limits explicitly.

By switching to the new helper netdev_compute_master_upper_features(), the
bridge now exposes additional features, depending on the lower devices'
capabilities.

Since br_set_gso_limits() is already covered by the helper, it can be
removed safely.

The bridge has its own way to update needed_headroom, so we don't need to
update it in the helper.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251017034155.61990-5-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:08:23 -07:00
Hangbin Liu
745cd46c2a team: use common function to compute the features
Use the new helper netdev_compute_master_upper_features() to compute the
team device features. This helper performs both the feature computation
and the netdev_change_features() call.

Note that this change replaces the lower-layer traversal currently done
using team->port_list with netdev_for_each_lower_dev(). The change is
safe as `port_list` contains exactly the same elements as
`team->dev->adj_list.lower` and the helper is always invoked under the
RTNL lock.

With this change, the explicit netdev_change_features() in team_add_slave()
can be safely removed, as team_port_add() already takes care of the
notification via netdev_compute_master_upper_features(); the same applies
to team_del_slave().

This also fixes missing computations for MPLS, XFRM, and TSO/GSO partial
features.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251017034155.61990-4-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:08:23 -07:00
Hangbin Liu
d4fde269a9 bonding: use common function to compute the features
Use the new function netdev_compute_master_upper_features() to compute the
bonding features.

Note that bond_compute_features() currently uses bond_for_each_slave()
to traverse the lower devices list, which is just a macro wrapper around
netdev_for_each_lower_private(). We use the similar helper
netdev_for_each_lower_dev() in netdev_compute_master_upper_features() to
iterate over the slave devices, as there is no need to get the private data.

No functional change intended.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251017034155.61990-3-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:08:23 -07:00
Hangbin Liu
28098defc7 net: add a common function to compute features for upper devices
Some high-level software drivers need to compute features from lower
devices, but each has its own implementation and may miss some feature
computations. Let's use one common function to compute features for
these kinds of devices.

The new helper uses the current bond implementation as the reference
one, as the latter already handles all the relevant aspects: netdev
features, TSO limits and dst retention.

Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251017034155.61990-2-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 18:08:23 -07:00
Alok Tiwari
0364ca3309 devlink: region: correct port region lookup to use port_ops
The function devlink_port_region_get_by_name() incorrectly uses
region->ops->name to compare the region name. This has no critical
impact, since ops and port_ops are defined as a union in struct
devlink_region, but per the code's logic it should refer to port_ops here.

There is no functional impact, as ops and port_ops are part of the same
union and name is the first member of both.

Update it to use region->port_ops->name to properly reference
the name of the devlink port region.

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20251020170916.1741808-1-alok.a.tiwari@oracle.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-21 17:40:26 -07:00
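A minimal sketch of why the old code happened to work and what the fix
changes (the union mirrors the description above; everything else is
invented):

	struct demo_region_ops      { const char *name; };
	struct demo_port_region_ops { const char *name; };

	struct demo_region {
		union {
			const struct demo_region_ops      *ops;
			const struct demo_port_region_ops *port_ops;
		};
	};

	static const char *demo_port_region_name(const struct demo_region *r)
	{
		/* was: return r->ops->name; -- correct result only because both
		 * union members start with 'name'; port_ops is the logically
		 * right member for a port region. */
		return r->port_ops->name;
	}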
Anton Protopopov
e7586577b7 libbpf: fix formatting of bpf_object__append_subprog_code
The commit 6c918709bd ("libbpf: Refactor bpf_object__reloc_code")
added the bpf_object__append_subprog_code() with incorrect indentations.
Use tabs instead. (This also makes a consequent commit better readable.)

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20251019202145.3944697-14-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-21 11:20:23 -07:00
Anton Protopopov
2f69c56854 bpf: make bpf_insn_successors to return a pointer
The bpf_insn_successors() function is used to return the successors
of a BPF instruction. So far, an instruction could have 0, 1 or 2
successors. Prepare the verifier code for the introduction of instructions
with more than 2 successors (namely, indirect jumps).

To do this, introduce a new struct, struct bpf_iarray, containing
an array of bpf instruction indexes, and make bpf_insn_successors
return a pointer of that type. The storage for all instructions
is allocated in env->succ, which holds an array of size 2,
to be used for all instructions.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251019202145.3944697-10-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-21 11:20:23 -07:00
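A hedged sketch of the data-structure idea only (field names and the
allocation scheme are guesses, not the verifier code): a small counted array
of instruction indexes that bpf_insn_successors() can hand back by pointer:

	#include <stdlib.h>

	struct demo_iarray {
		int cnt;	/* number of successor instruction indexes */
		int items[];	/* the indexes themselves */
	};

	/* One scratch array of capacity 2 covers the historical 0/1/2-successor
	 * cases; indirect jumps would need a larger, per-instruction allocation. */
	static struct demo_iarray *demo_alloc_succ_scratch(void)
	{
		return calloc(1, sizeof(struct demo_iarray) + 2 * sizeof(int));
	}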
Anton Protopopov
44481e4925 bpf: generalize and export map_get_next_key for arrays
The kernel/bpf/array.c file defines the array_map_get_next_key()
function which finds the next key for array maps. It actually doesn't
use any map fields besides the generic max_entries field. Generalize
it, and export as bpf_array_get_next_key() such that it can be
re-used by other array-like maps.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251019202145.3944697-4-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-21 11:17:25 -07:00
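For reference, the next-key behaviour being generalized looks roughly like
this when written as a standalone function (mirroring the long-standing array
map semantics; not the exported kernel symbol itself):

	#include <errno.h>
	#include <stdint.h>

	static int demo_array_get_next_key(uint32_t max_entries,
					   const uint32_t *key, uint32_t *next_key)
	{
		uint32_t index = key ? *key : UINT32_MAX;

		if (index >= max_entries) {	/* missing or out-of-range key */
			*next_key = 0;		/* iteration starts at slot 0 */
			return 0;
		}
		if (index == max_entries - 1)	/* already at the last slot */
			return -ENOENT;

		*next_key = index + 1;
		return 0;
	}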
Anton Protopopov
f7d72d0b3f bpf: save the start of functions in bpf_prog_aux
Introduce a new subprog_start field in bpf_prog_aux. This field may
be used by JIT compilers wanting to know the real absolute xlated
offset of the function being jitted. The func_info[func_id] may have
served this purpose, but func_info may be NULL, so JIT compilers
can't rely on it.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251019202145.3944697-3-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-21 11:17:25 -07:00
Anton Protopopov
6ea5fc92a0 bpf: fix the return value of push_stack
In [1] Eduard mentioned that on push_stack failure the verifier code
should return -ENOMEM instead of -EFAULT. After checking the other call
sites, I found that the code randomly returns either -ENOMEM or -EFAULT.
This patch unifies the return values for the push_stack (and the similar
push_async_cb) functions such that error codes are always assigned
properly.

  [1] https://lore.kernel.org/bpf/20250615085943.3871208-1-a.s.protopopov@gmail.com

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251019202145.3944697-2-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-21 11:17:25 -07:00
Shardul Bankar
96d31dff3f bpf: Clarify get_outer_instance() handling in propagate_to_outer_instance()
propagate_to_outer_instance() calls get_outer_instance() and uses the
returned pointer to reset and commit stack write marks. Under normal
conditions, update_instance() guarantees that an outer instance exists,
so get_outer_instance() cannot return an ERR_PTR.

However, explicitly checking for IS_ERR(outer_instance) makes this code
more robust and self-documenting. It reduces cognitive load when reading
the control flow and silences potential false-positive reports from
static analysis or automated tooling.

No functional change intended.

Signed-off-by: Shardul Bankar <shardulsb08@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251021080849.860072-1-shardulsb08@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-21 09:39:05 -07:00
Eric Dumazet
3ff9bcecce net: avoid extra access to sk->sk_wmem_alloc in sock_wfree()
UDP TX packets destructor is sock_wfree().

It suffers from a cache line bouncing in sock_def_write_space_wfree().

Instead of reading sk->sk_wmem_alloc after we just did an atomic RMW
on it, use __refcount_sub_and_test() to get the old value for free,
and pass the new value to sock_def_write_space_wfree().

Add __sock_writeable() helper.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251017133712.2842665-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 15:56:21 +02:00
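A simplified sketch of the pattern (not the actual sock_wfree() code): the
__refcount_* variant returns the pre-subtraction value through its extra
argument, so the counter does not have to be read again after the atomic RMW:

	#include <linux/refcount.h>

	static void demo_put_wmem(refcount_t *wmem_alloc, int truesize)
	{
		int old;

		if (!__refcount_sub_and_test(truesize, wmem_alloc, &old)) {
			int remaining = old - truesize;

			/* pass 'remaining' to the write-space check instead of
			 * dereferencing wmem_alloc a second time */
			(void)remaining;
		}
	}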
Paolo Abeni
d1d7998df9 Merge branch 'net-airoha-add-an7583-ethernet-controller-support'
Lorenzo Bianconi says:

====================
net: airoha: Add AN7583 ethernet controller support

Introduce support for the AN7583 ethernet controller in the airoha-eth
driver. The main difference between EN7581 and AN7583 is that the latter
runs a single PPE module while EN7581 runs two of them. Moreover, the PPE
SRAM in the AN7583 SoC is reduced to 8K (while the SRAM is 16K on EN7581).

v2: https://lore.kernel.org/r/20251016-an7583-eth-support-v2-0-ea6e7e9acbdb@kernel.org
v1: https://lore.kernel.org/r/20251015-an7583-eth-support-v1-0-064855f05923@kernel.org
====================

Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-0-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:08 +02:00
Lorenzo Bianconi
e4e5ce823b net: airoha: Add AN7583 SoC support
Introduce support for the AN7583 ethernet controller in the airoha-eth driver.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-13-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:05 +02:00
Lorenzo Bianconi
63f283d36b net: airoha: ppe: Do not use magic numbers in airoha_ppe_foe_get_entry_locked()
Make the size of the entries pointed to by the hwe pointer explicit in
the airoha_ppe_foe_get_entry_locked routine.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-12-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:05 +02:00
Lorenzo Bianconi
9d5b5219f6 net: airoha: Refactor src port configuration in airhoha_set_gdm2_loopback
AN7583 chipset relies on different definitions for source-port
identifier used for hw offloading. In order to support hw offloading
in AN7583 controller, refactor src port configuration in
airhoha_set_gdm2_loopback routine and introduce get_src_port_id
callback.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-11-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:05 +02:00
Lorenzo Bianconi
c71a7a861e net: airoha: Select default ppe cpu port in airoha_dev_init()
Select the default PPE cpu port in the airoha_dev_init routine.
This patch allows distributing the load between the two available cpu
ports (FE_PSE_PORT_CDM1 and FE_PSE_PORT_CDM2) if the device is running a
single PPE module (e.g. 7583), selecting the cpu port based on the QDMA
device in use. For multi-PPE devices (e.g. 7581), assign FE_PSE_PORT_CDM1
to PPE1 and FE_PSE_PORT_CDM2 to PPE2.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-10-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:05 +02:00
Lorenzo Bianconi
620d7b91aa net: airoha: ppe: Flush PPE SRAM table during PPE setup
Rely on the airoha_ppe_foe_commit_sram_entry routine to flush SRAM PPE
table entries. This patch allows moving the PPE SRAM flush to PPE setup
and avoids dumping uninitialized values via debugfs if no entries are
offloaded yet.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-9-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:05 +02:00
Lorenzo Bianconi
306b78f503 net: airoha: ppe: Configure SRAM PPE entries via the cpu
Introduce airoha_ppe_foe_commit_sram_entry routine in order to configure
the SRAM PPE entries directly via the CPU instead of using the NPU APIs.
This is a preliminary patch to enable netfilter flowtable hw offload for
AN7583 SoC.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-8-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:05 +02:00
Lorenzo Bianconi
41139125f5 net: airoha: ppe: Remove airoha_ppe_is_enabled() where not necessary
Now each PPE always has PPE_STATS_NUM_ENTRIES entries, so we do not need
to run the airoha_ppe_is_enabled routine to check whether the hash refers
to PPE1 or PPE2.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-7-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:04 +02:00
Lorenzo Bianconi
5bd1d1fd48 net: airoha: ppe: Move PPE memory info in airoha_eth_soc_data struct
AN7583 SoC runs a single PPE device while EN7581 runs two of them.
Moreover PPE SRAM in AN7583 SoC is reduced to 8K (while SRAM is 16K on
EN7581). Take into account PPE memory layout during PPE configuration.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-6-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:04 +02:00
Lorenzo Bianconi
ef9449f080 net: airoha: Generalize airoha_ppe2_is_enabled routine
Rename airoha_ppe2_is_enabled() to airoha_ppe_is_enabled() and
generalize it in order to check whether each PPE module is enabled.
Rely on the airoha_ppe_is_enabled routine to properly initialize the PPE
for the AN7583 SoC, since AN7583 does not support PPE2.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-5-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:04 +02:00
Lorenzo Bianconi
5863b4e065 net: airoha: Add airoha_eth_soc_data struct
Introduce the airoha_eth_soc_data struct to contain the differences
between the various SoCs. Move the XSI reset names into
airoha_eth_soc_data. This is a preliminary patch to enable AN7583
ethernet controller support in the airoha-eth driver.

Co-developed-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-4-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:04 +02:00
Lorenzo Bianconi
15f357cd45 net: airoha: Add airoha_ppe_get_num_stats_entries() and airoha_ppe_get_num_total_stats_entries()
Introduce the airoha_ppe_get_num_stats_entries and
airoha_ppe_get_num_total_stats_entries routines in order to make the code
that checks whether CONFIG_NET_AIROHA_FLOW_STATS is enabled or disabled
more readable.
Modify the airoha_ppe_foe_get_flow_stats_index routine signature to rely
on airoha_ppe_get_num_total_stats_entries().

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-3-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:04 +02:00
Lorenzo Bianconi
6d5b601d52 net: airoha: ppe: Dynamically allocate foe_check_time array in airoha_ppe struct
This is a preliminary patch to properly enable PPE support for AN7583
SoC.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-2-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:04 +02:00
Lorenzo Bianconi
51538c0c9d dt-bindings: net: airoha: Add AN7583 support
Introduce AN7583 ethernet controller support to Airoha EN7581
device-tree bindings. The main difference between EN7581 and AN7583 is
the number of reset lines required by the controller (AN7583 does not
require hsi-mac).

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20251017-an7583-eth-support-v3-1-f28319666667@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 13:07:04 +02:00
Paolo Abeni
3dc835a66e Merge branch 'networking-docs-section-headings-cleanup'
Bagas Sanjaya says:

====================
networking docs section headings cleanup

Just two net docs patches cleaning up section headings. The shortlog below
should be self-explanatory.
====================

Link: https://patch.msgid.link/20251017064525.28836-2-bagasdotme@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 10:57:47 +02:00
Bagas Sanjaya
97aa8ecb57 net: 6pack: Demote "How to turn on 6pack support" section heading
"How to turn on 6pack support" is a subsection of "Building and
installing the 6pack driver". Yet, the former is at the same heading
level as the latter, so it is listed as a section in the networking docs
toctree.

Demote it to subsection.

Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Link: https://patch.msgid.link/20251017064525.28836-4-bagasdotme@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 10:57:45 +02:00
Bagas Sanjaya
122d696c17 net: nfc: Format userspace interface subsection headings
The subsection headings of "Userspace interface" are written as normal
all-caps paragraphs. Properly format them as reST section headings.

Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Link: https://patch.msgid.link/20251017064525.28836-3-bagasdotme@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-21 10:57:45 +02:00
Bitterblue Smith
9b567039e5 wifi: rtl8xxxu: Use correct power off sequence for RTL8192CU
RTL8192CU disappears and reappears when rtl8xxxu is unloaded:

usbcore: deregistering interface driver rtl8xxxu
wlp3s0f3u2: deauthenticating from ... by local choice (Reason: 3=DEAUTH_LEAVING)
usb 1-2: rtl8xxxu_active_to_emu: Disabling MAC timed out
usb 1-2: USB disconnect, device number 7
usb 1-2: disconnecting
usb 1-2: new high-speed USB device number 8 using xhci_hcd
usb 1-2: New USB device found, idVendor=0bda, idProduct=8178, bcdDevice= 2.00
usb 1-2: New USB device strings: Mfr=1, Product=2, SerialNumber=3
usb 1-2: Product: 802.11n WLAN Adapter
usb 1-2: Manufacturer: Realtek
usb 1-2: SerialNumber: 00e04c000001

This is because rtl8xxxu is using the power off sequence for RTL8723AU.
Add the correct power off sequence for RTL8192CU.

rtl8xxxu_power_off(), rtl8xxxu_active_to_emu(), and
rtl8xxxu_emu_to_disabled() are now only used for RTL8723AU, so move
them to 8723a.c and rename them to have the "rtl8723au" prefix.

Tested only with RTL8192CU.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/b9d3f137-12ce-4bd9-8ada-3b8874bc3615@gmail.com
2025-10-21 14:44:44 +08:00
Bitterblue Smith
fb5183aa65 wifi: rtl8xxxu: Dump the efuse right after reading it
Someone reported a problem with their TP-Link WN722N (RTL8188EUS):

usb 1-1: This Realtek USB WiFi dongle (0x2357:0x010c) is untested!
usb 1-1: Please report results to Jes.Sorensen@gmail.com
usb 1-1: Fatal - failed to parse EFuse
rtl8xxxu: probe of 1-1:1.0 failed with error -22

This error can happen when the first two bytes of the efuse don't have
the expected magic value of 0x8129. In a situation like this it could
be useful to see the contents of the efuse.

Dump the efuse right after reading it, before trying to parse it.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/1f544dc4-17f6-4401-995d-5cb4567b82fa@gmail.com
2025-10-21 14:44:33 +08:00
Bitterblue Smith
7aeb8c118e wifi: rtl8xxxu: Report the signal strength only if it's known
These chips don't report the signal strength for many (any?) data
frames. When the signal strength is not known, set RX_FLAG_NO_SIGNAL_VAL
in order to avoid reporting a signal strength of 0.

Tested with RTL8192FU and RTL8192CU.

Signed-off-by: Bitterblue Smith <rtl8821cerfe2@gmail.com>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/cdbe1a18-f6f1-4575-bad9-e85db8101247@gmail.com
2025-10-21 14:44:22 +08:00
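The mac80211 side of this looks roughly as follows (simplified; the
surrounding rtl8xxxu RX path is omitted): when the descriptor carried no
signal value, set the flag instead of reporting 0:

	#include <net/mac80211.h>

	static void demo_fill_rx_signal(struct ieee80211_rx_status *rx_status,
					bool signal_valid, s8 signal_dbm)
	{
		if (signal_valid)
			rx_status->signal = signal_dbm;
		else
			rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL;
	}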
Yu-Chun Lin
8b795fe429 wifi: rtw89: Replace hardcoded strings with helper functions
Replace hardcoded strings with 'str_on_off()', 'str_enable_disable()',
and 'str_read_write()'.

The change improves readability.

Signed-off-by: Yu-Chun Lin <eleanor15x@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20250930000545.2192692-1-eleanor15x@gmail.com
2025-10-21 13:51:40 +08:00
Zenm Chen
b377dcd9a2 wifi: rtw88: Add USB ID 2001:3329 for D-Link AC13U rev. A1
Add USB ID 2001:3329 for D-Link AC13U rev. A1 which is a RTL8812CU-based
Wi-Fi adapter.

Compile tested only.

Cc: stable@vger.kernel.org # 6.6.x
Signed-off-by: Zenm Chen <zenmchen@gmail.com>
Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20250929035719.6172-2-zenmchen@gmail.com
2025-10-21 13:38:19 +08:00
Zenm Chen
3f9553f65d wifi: rtl8xxxu: Add USB ID 2001:3328 for D-Link AN3U rev. A1
Add USB ID 2001:3328 for D-Link AN3U rev. A1 which is a RTL8192FU-based
Wi-Fi adapter.

Compile tested only.

Cc: stable@vger.kernel.org # 6.6.x
Signed-off-by: Zenm Chen <zenmchen@gmail.com>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Link: https://patch.msgid.link/20250929035719.6172-1-zenmchen@gmail.com
2025-10-21 13:33:25 +08:00
Jakub Kicinski
ebc742edc9 Merge tag 'linux-can-next-for-6.19-20251017' of git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next
Marc Kleine-Budde says:

====================
pull-request: can-next 2025-10-17

The first patch is by me and adds support for an optional reset to the
m_can drivers.

Vincent Mailhol's patch targets all drivers and removes the
can_change_mtu() function, since the netdev's min and max MTU are
populated.

Markus Schneider-Pargmann contributes 4 patches to the m_can driver to
add am62 wakeup support.

The last 7 patches are by me and provide various cleanups to the m_can
driver.

* tag 'linux-can-next-for-6.19-20251017' of git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next:
  can: m_can: m_can_get_berr_counter(): don't wake up controller if interface is down
  can: m_can: m_can_tx_submit(): remove unneeded sanity checks
  can: m_can: m_can_class_register(): remove error message in case devm_kzalloc() fails
  can: m_can: m_can_interrupt_enable(): use m_can_write() instead of open coding it
  net: m_can: convert dev_{dbg,info,err} -> netdev_{dbg,info,err}
  can: m_can: hrtimer_callback(): rename to m_can_polling_timer()
  can: m_can: m_can_init_ram(): make static
  can: m_can: Support pinctrl wakeup state
  can: m_can: Return ERR_PTR on error in allocation
  can: m_can: Map WoL to device_set_wakeup_enable
  dt-bindings: can: m_can: Add wakeup properties
  can: treewide: remove can_change_mtu()
  can: m_can: add support for optional reset
====================

Link: https://patch.msgid.link/20251017150819.1415685-1-mkl@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 18:36:17 -07:00
Aswin Karuvally
38516e3fa4 s390/iucv: Convert sprintf/snprintf to scnprintf
Convert sprintf/snprintf calls to scnprintf to better align with the
kernel development community practices [1].

Link: https://lwn.net/Articles/69419 [1]
Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
Signed-off-by: Aswin Karuvally <aswin@linux.ibm.com>
Signed-off-by: Alexandra Winter <wintera@linux.ibm.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251017094954.1402684-1-wintera@linux.ibm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 18:35:21 -07:00
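A generic before/after for this kind of conversion (buffer and format are
placeholders, not the iucv code); scnprintf() is bounded by the buffer length
and returns the number of characters actually written, excluding the trailing
NUL:

	#include <linux/kernel.h>

	static int demo_fill_name(char *buf, size_t len, const char *name, int id)
	{
		/* before: return sprintf(buf, "%s%d", name, id);  -- unbounded */
		return scnprintf(buf, len, "%s%d", name, id);
	}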
Jakub Kicinski
baa515ef82 Merge branch 'intel-wired-lan-driver-updates-2025-10-15-ice-iavf-ixgbe-i40e-e1000e'
Jacob Keller says:

====================
Intel Wired LAN Driver Updates 2025-10-15 (ice, iavf, ixgbe, i40e, e1000e) [part]

Jacob revives one-year-old work from Jesse Brandeburg to implement the
standardized statistics interfaces from ethtool in the ice driver.

Vitaly introduces a new private flag to control the K1 power state of ICH
network controllers supported by the e1000e driver. This flag has been
extensively discussed on the list and deemed the best available option to
provide a field workaround without impacting the many configurations that
have no issues with the K1 power state.
====================

Link: https://patch.msgid.link/20251016-jk-iwl-next-2025-10-15-v2-0-ff3a390d9fc6@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 18:34:14 -07:00
Vitaly Lifshits
3c7bf5af21 e1000e: Introduce private flag to disable K1
The K1 state reduces power consumption on ICH family network controllers
during idle periods, similarly to L1 state on PCI Express NICs. Therefore,
it is recommended and enabled by default.
However, on some systems it has been observed to have adverse side
effects, such as packet loss. It has been established through debugging that
the problem may be due to firmware misconfiguration of specific systems,
interoperability with certain link partners, or marginal electrical
conditions of specific units.

These problems typically cannot be fixed in the field, and generic
workarounds to resolve the side effects on all systems, while keeping K1
enabled, were found infeasible.
Therefore, add the option for users to globally disable K1 idle state on
the adapter.

Additionally, disable K1 by default for MTL and later platforms, due to
issues reported with the current configuration.

Link: https://lore.kernel.org/intel-wired-lan/CAMqyJG3LVqfgqMcTxeaPur_Jq0oQH7GgdxRuVtRX_6TTH2mX5Q@mail.gmail.com/
Link: https://lore.kernel.org/intel-wired-lan/20250626153544.1853d106@onyx.my.domain/
Link: https://lore.kernel.org/intel-wired-lan/Z_z9EjcKtwHCQcZR@mail-itl/
Link: https://github.com/QubesOS/qubes-issues/issues/9896
Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/2115393

Signed-off-by: Vitaly Lifshits <vitaly.lifshits@intel.com>
Reviewed-by: Timo Teräs <timo.teras@iki.fi>
Tested-by: Timo Teräs <timo.teras@iki.fi>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Dima Ruinskiy <dima.ruinskiy@intel.com>
Tested-by: Avraham Koren <Avrahamx.koren@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251016-jk-iwl-next-2025-10-15-v2-14-ff3a390d9fc6@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 18:31:26 -07:00
Jesse Brandeburg
a308ea9721 ice: refactor to use helpers
Use the ice_netdev_to_pf() helper in more places and remove a bunch of
boilerplate code. Not every instance could be replaced due to use of the
netdev_priv() output or the vsi variable within a bunch of functions.
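
For reference, the helper in question has roughly this shape (sketched
from memory, not part of this patch):

  static inline struct ice_pf *ice_netdev_to_pf(struct net_device *netdev)
  {
          struct ice_netdev_priv *np = netdev_priv(netdev);

          return np->vsi->back;
  }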

Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251016-jk-iwl-next-2025-10-15-v2-12-ff3a390d9fc6@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 18:31:26 -07:00
Jesse Brandeburg
71462475d0 ice: implement transmit hardware timestamp statistics
The kernel now has common statistics for transmit timestamps, so
implement them in the ice driver.

use via
ethtool -I -T eth0

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Hariprasad Kelam <hkelam@marvell.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251016-jk-iwl-next-2025-10-15-v2-11-ff3a390d9fc6@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 18:31:25 -07:00
Jesse Brandeburg
4368d5fe02 ice: add tracking of good transmit timestamps
As a prerequisite to implementing timestamp statistics, start tracking
successful PTP timestamps. A trace event already existed, but add a
counter as well so it can be displayed by the next patch.

The good count is a u64 as it is much more likely to be incremented. The
existing error stats remain u32 as before; they are incremented less
often, so they will wrap less.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251016-jk-iwl-next-2025-10-15-v2-10-ff3a390d9fc6@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 18:31:25 -07:00
Jesse Brandeburg
20ae87514a ice: implement ethtool standard stats
Add support for MAC/pause/RMON stats. This enables reporting hardware
statistics in a common way via:

ethtool -S eth0 --all-groups
and
ethtool --include-statistics --show-pause eth0

While doing so, add support for one new stat, receive length error
(RLEC), which is extremely unlikely to happen since most L2 frames have
a type/length field specifying a "type", and raw ethernet frames aren't
used much any longer.

NOTE: I didn't implement Ctrl aka control frame stats because the
hardware doesn't seem to implement support.

Reviewed-by: Marcin Szycik <marcin.szycik@linux.intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251016-jk-iwl-next-2025-10-15-v2-9-ff3a390d9fc6@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 18:31:25 -07:00
Jesse Brandeburg
98c2f0b42e net: docs: add missing features that can have stats
While trying to figure out ethtool -I | --include-statistics, I noticed
some docs got missed when implementing commit 0e9c127729 ("ethtool:
add interface to read Tx hardware timestamping statistics").

Fix up the docs to match the kernel code, and while there, sort them in
alphabetical order.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251016-jk-iwl-next-2025-10-15-v2-8-ff3a390d9fc6@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 18:31:25 -07:00
Ankit Garg
c30fd916c4 gve: Consolidate and persist ethtool ring changes
Refactor the ethtool ring parameter configuration logic to address two
issues: unnecessary queue resets and lost configuration changes when
the interface is down.

Previously, `gve_set_ringparam` could trigger multiple queue
destructions and recreations for a single command, as different settings
(e.g., header split, ring sizes) were applied one by one. Furthermore,
if the interface was down, any changes made via ethtool were discarded
instead of being saved for the next time the interface was brought up.

This patch centralizes the configuration logic. Individual functions
like `gve_set_hsplit_config` are modified to only validate and stage
changes in a temporary config struct.

The main `gve_set_ringparam` function now gathers all staged changes
and applies them as a single, combined configuration:
1.  If the interface is up, it calls `gve_adjust_config` once.
2.  If the interface is down, it saves the settings directly to the
    driver's private struct, ensuring they persist and are used when
    the interface is brought back up.
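
A minimal sketch of this flow (struct and helper names below are
hypothetical, not the actual gve symbols):

  static int set_ringparam_sketch(struct net_device *dev,
                                  const struct ring_cfg *staged)
  {
          struct priv *priv = netdev_priv(dev);

          /* All settings were validated and staged by the callers above. */
          if (netif_running(dev))
                  return apply_config_once(priv, staged); /* single reset */

          /* Interface down: persist so the next open picks it up. */
          priv->cfg = *staged;
          return 0;
  }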

Signed-off-by: Ankit Garg <nktgrg@google.com>
Reviewed-by: Harshitha Ramamurthy <hramamurthy@google.com>
Reviewed-by: Jordan Rhee <jordanrhee@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Link: https://patch.msgid.link/20251017012614.3631351-1-joshwash@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:43:23 -07:00
Eric Dumazet
a5cd3a60aa net: shrink napi_skb_cache_{put,get}() and napi_skb_cache_get_bulk()
Following loop in napi_skb_cache_put() is unrolled by the compiler
even if CONFIG_KASAN is not enabled:

for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
	kasan_mempool_unpoison_object(nc->skb_cache[i],
				kmem_cache_size(net_hotdata.skbuff_cache));

This sequence is repeated 32 times, for a total of 384 bytes.

	48 8b 3d 00 00 00 00 	net_hotdata.skbuff_cache,%rdi
	e8 00 00 00 00       	call   kmem_cache_size

This is because kmem_cache_size() is not an inline and not const,
and kasan_unpoison_object_data() is an inline function.

Cache kmem_cache_size() result in a variable, so that
the compiler can remove dead code (and variable) when/if
CONFIG_KASAN is unset.
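
A sketch of the idea, based on the description above rather than the
exact diff:

  /* Evaluate kmem_cache_size() once instead of on every unrolled
   * iteration; the local is unused (dead) when KASAN is disabled.
   */
  u32 size = kmem_cache_size(net_hotdata.skbuff_cache);

  for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
          kasan_mempool_unpoison_object(nc->skb_cache[i], size);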

After this patch, napi_skb_cache_put() is inlined in its callers,
and we avoid one kmem_cache_size() call in napi_skb_cache_get()
and napi_skb_cache_get_bulk().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://patch.msgid.link/20251016182911.1132792-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:37:57 -07:00
Jakub Kicinski
6ae022f8ac Merge branch 'convert-net-drivers-to-ndo_hwtstamp-api-part-1'
Vadim Fedorenko says:

====================
convert net drivers to ndo_hwtstamp API part 1

This is part 1 of patchset to convert drivers which support HW
timestamping to use .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks.
The new API uses netlink to communicate with user-space and has some
test coverage. Part 2 will contain another 6 patches from v1 of the
series.
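
For context, the callbacks being adopted have roughly the following shape
in struct net_device_ops (paraphrased; see the tree for the authoritative
prototypes):

  int (*ndo_hwtstamp_get)(struct net_device *dev,
                          struct kernel_hwtstamp_config *kernel_config);
  int (*ndo_hwtstamp_set)(struct net_device *dev,
                          struct kernel_hwtstamp_config *kernel_config,
                          struct netlink_ext_ack *extack);
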
There are some drivers left with old ioctl interface after this series:
- mlx5 driver will shortly be converted by NVIDIA folks
- TI netcp ethss driver, which needs a separate series that I'll post
  after this one.
====================

Link: https://patch.msgid.link/20251016152515.3510991-1-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:30:28 -07:00
Vadim Fedorenko
dc34040654 funeth: convert to ndo_hwtstamp API
Convert driver to use .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks.
.ndo_eth_ioctl() implementation becomes empty, remove it.

Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251016152515.3510991-8-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:30:26 -07:00
Vadim Fedorenko
d8db98db0d tsnep: convert to ndo_hwtstatmp API
Convert to .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks.
After conversion, the rest of tsnep_netdev_ioctl() becomes pure
phy_do_ioctl_running(), so remove tsnep_netdev_ioctl() and replace
it with phy_do_ioctl_running() in .ndo_eth_ioctl.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Gerhard Engleder <gerhard@engleder-embedded.com>
Link: https://patch.msgid.link/20251016152515.3510991-7-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:30:26 -07:00
Vadim Fedorenko
a6a64bb411 cxgb4: convert to ndo_hwtstamp API
Convert to use .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks.

There is some change in the logic as well. Previously, the driver was
storing the newly requested configuration regardless of whether it was
applied or not. In case of a request validation error, an inconsistent
configuration would be returned by the driver. The new logic stores the
configuration only if it was properly validated and applied. This brings
consistency between reported and actual configuration.
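
The ordering change can be pictured like this (names are illustrative,
not the actual cxgb4 code):

  err = validate_hwtstamp_request(adapter, cfg);
  if (err)
          return err;             /* previously stored config stays reported */

  err = apply_hwtstamp_request(adapter, cfg);
  if (err)
          return err;

  adapter->hwtstamp_cfg = *cfg;   /* store only after success */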

Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20251016152515.3510991-6-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:30:26 -07:00
Vadim Fedorenko
8a15a84e80 net: atlantic: convert to ndo_hwtstamp API
Convert driver to .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks.
.ndo_eth_ioctl() becomes empty so remove it. Also simplify code with no
functional changes.

Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251016152515.3510991-5-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:30:26 -07:00
Vadim Fedorenko
149cfae711 amd-xgbe: convert to ndo_hwtstamp callbacks
Convert driver to use .ndo_hwtstamp_get()/.ndo_hwtstamp_set() callbacks.
.ndo_eth_ioctl() becomes an empty function, so remove it.

Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251016152515.3510991-4-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:30:26 -07:00
Vadim Fedorenko
b8fa98ea4a ti: icssg: convert to ndo_hwtstamp API
Convert driver to use .ndo_hwtstamp_get()/.ndo_hwtstamp_set() API.
.ndo_eth_ioctl() implementation becomes pure phy_do_ioctl(), so remove
it from the common module, remove the exported symbol and replace the ndo callback.

Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251016152515.3510991-3-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:30:25 -07:00
Vadim Fedorenko
ed5d5928bd net: ti: am65-cpsw: move hw timestamping to ndo callback
Migrate driver to new API for HW timestamping.

Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251016152515.3510991-2-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:30:25 -07:00
Jakub Kicinski
6117dc5c88 Merge branch 'net-stmmac-phylink-pcs-conversion'
Russell King says:

====================
net: stmmac: phylink PCS conversion

This series is radical - it takes the brave step of ripping out much of
the existing PCS support code and throwing it all away.

I have discussed the introduction of the STMMAC_FLAG_HAS_INTEGRATED_PCS
flag with Bartosz Golaszewski, and the conclusion I came to is that
this is to workaround the breakage that I've been going on about
concerning the phylink conversion for the last five or six years.

The problem is that the stmmac PCS code manipulates the netif carrier
state, which confuses phylink.

There is a way of testing this out on the Jetson Xavier NX platform as
the "PCS" code paths can be exercised while in RGMII mode - because
RGMII also has in-band status and the status register is shared with
SGMII. Testing this out confirms my long held theory: the interrupt
handler manipulates the netif carrier state before phylink gets a
look-in, which means that the mac_link_up() and mac_link_down() methods
are never called, resulting in the device being non-functional.

Moreover, on dwmac4 cores, ethtool reports incorrect information -
despite having a full-duplex link, ethtool reports that it is
half-dupex.

Thus, this code is completely broken - anyone using it will not have
a functional platform, and thus it doesn't deserve to live any longer,
especially as it's a thorn in phylink.

Rip all this out, leaving just the bare bones initialisation in place.

However, this is not the last of what's broken. We have this hw->ps
integer which is really not descriptive, and the DT property from
which it comes does little to help understand what's going on.
Putting all the clues together:

- early configuration of the GMAC configuration register for the
  speed.
- setting the SGMII rate adapter layer to take its speed from the
  GMAC configuration register.

Lastly, setting the transmit enable (TE) bit, which is a typo that puts
the nail in the coffin of this code. It should be the transmit
configuration (TC) bit. Given that when the link comes up, phylink
will call mac_link_up() which will overwrite the speed in the GMAC
configuration register, the only part of this that is functional is
changing where the SGMII rate adapter layer gets its speed from,
which is a boolean.

From what I've found so far, everyone who sets the snps,ps-speed
property which configures this mode also configures a fixed link,
so the pre-configuration is unnecessary - the link will come up
anyway.

So, this series rips out the preconfiguration as well, and
replaces hw->ps with a boolean hw->reverse_sgmii_enable flag.

We then move the sole PCS configuration into a phylink_pcs instance,
which configures the PCS control register in the same way as is done
during the probe function.

Thus, we end up with much easier and simpler conversion to phylink PCS
than previous attempts.

Even so, this still results in inband mode always being enabled at
the moment in the new .pcs_config() method to reflect what the probe
function was doing. The next stage will be to change that to allow
phylink to correctly configure the PCS. This needs fixing to allow
platform glue maintainers who are currently blocked to progress.

Please note, however, that this has not been tested with any SGMII
platform.

I've tried to get as many people into the Cc list with get_maintainers,
I hope that's sufficient to get enough eyeballs on this.

Changes since RFC:
- new patch (7) to remove RGMII "pcs" mode
- new patch (8) to move reverse "pcs" mode to stmmac_check_pcs_mode()
- new patch (9) to simplify the code moved in the previous patch
- new patch (10) to rename the confusing hw->ps to something more
  understandable.
- new patch (11) to shut up inappropriate complaints about
  "snps,ps-speed" being invalid.
- new patch (13) to add a MAC .pcs_init method, which will only be
  called when core has PCS present.
- modify patch 14 to use this new pcs_init method.

Despite getting a couple of responses to the RFC series posted in
September, I have had no one test this on hardware. I have tested
this on the Jetson Xavier NX, which included trial runs with enabling
the RGMII "pcs" mode, hence the new patches that rip out this mode. I
have come to the conclusion that the only way to get stmmac changes
tested is to get them merged into net-next, thereby forcing people to
have to run with them... and we'll deal with any fallout later.
====================

Link: https://patch.msgid.link/aPECqg0vZGnBFCbh@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:23 -07:00
Russell King (Oracle)
2c81f33571 net: stmmac: convert to phylink PCS support
Now that stmmac's PCS support is much more simple - just a matter of
configuring the control register - the basic conversion to phylink PCS
support becomes straight forward.

Create the infrastructure to setup a phylink_pcs instance for the
integrated PCS:
- add a struct stmmac_pcs to encapsulate the phylink_pcs structure,
  pointer to stmmac_priv, and the core-specific base address of the PCS
  registers.
- modify stmmac_priv and stmmac_mac_select_pcs() to return the embedded
  phylink_pcs structure when setup and STMMAC_PCS_SGMII is in use, and
  move the comment from stmmac_hw_setup() to here.
- create stmmac_pcs.c, which contains the phylink_pcs_ops structure, a
  dummy .pcs_get_state() method which always reports link-down, and
  .pcs_config() method, moving the call to stmmac_pcs_ctrl_ane() here,
  but without indirecting through the dwmac specific core code. The
  link-down behaviour mentioned above maintains the current behaviour
  when phylink is used with inband but without a PCS.

This will ensure that the PCS control register is configured to the
same settings as before, but will now happen when the netdev is opened
or reusmed rather than only during probe time. However, this will be
before the .fix_mac_speed() method is called, which is critical for the
behaviour in dwmac-qcom-ethqos's ethqos_configure_sgmii() function to
be maintained.
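
As a rough sketch, the struct stmmac_pcs container described above could
look like this (field names are illustrative):

  struct stmmac_pcs {
          struct phylink_pcs pcs;    /* embedded phylink PCS instance */
          struct stmmac_priv *priv;  /* back-pointer to driver state */
          void __iomem *base;        /* core-specific PCS register base */
  };

  #define phylink_pcs_to_stmmac_pcs(p) container_of(p, struct stmmac_pcs, pcs)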

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P72-0000000AomR-3ro4@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:16 -07:00
Russell King (Oracle)
237e54caea net: stmmac: provide PCS initialisation hook
dwmac cores provide a feature bit to indicate when the PCS block is
present, but features are only read after the core's setup() function
has been called, meaning we can't decide whether to initialise the
integrated PCS in the setup function. Provide a new MAC core hook
for PCS initialisation, which will be called after the feature
registers have been read.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P6x-0000000AomL-3OKd@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:16 -07:00
Russell King (Oracle)
045d7e5727 net: stmmac: only call stmmac_pcs_ctrl_ane() for integrated SGMII PCS
The internal PCS registers only exist if the core is synthesized with
SGMII, TBI or RTBI support. They have no relevance for RGMII.

However, priv->hw->pcs contains a STMMAC_PCS_RGMII flag, which is set
if a PCS has been synthesized but we are operating in RGMII mode. As
the register has no effect for RGMII, there is no point calling
stmmac_pcs_ctrl_ane() in this case. Add a comment describing this
and make it conditional on STMMAC_PCS_SGMII.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P6s-0000000AomE-2pAa@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:15 -07:00
Russell King (Oracle)
5c61db08d9 net: stmmac: do not require snps,ps-speed for SGMII
SGMII mode does not require port-speed to be specified; this only
switches SGMII to use the MAC configuration register speed settings
and the actual value is irrelevant when the link comes up.

As it seems the intention was to support "reverse SGMII" with this
setting, but the code didn't actually configure that due to a typo,
the warning and bad DT binding documentation have led people to
specify snps,ps-speed in their DT files inappropriately.

If mac_port_sel_speed is zero, then don't complain that the speed
is invalid, as this means we're using "normal" SGMII.

This does _not_ obsolete snps,ps-speed, nor does it change the
behaviour of that property, with the exception of not making people
mistakenly believe that they need to specify this option to use
normal SGMII. There is no need to modify the binding.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P6n-0000000Aom9-2LuZ@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:15 -07:00
Russell King (Oracle)
5d1e7621f8 net: stmmac: hw->ps becomes hw->reverse_sgmii_enable
After a lot of digging, it seems that the oddly named hw->ps member
is all about setting the core into reverse SGMII speed. When set to
a non-zero value, it:

1. Configures the MAC at initialisation time to operate at a specific
   speed.
2. It _incorrectly_ enables the transmitter (GMAC_CONFIG_TE) which
   makes no sense, rather than enabling the "transmit configuration"
   bit (GMAC_CONFIG_TC).
3. It configures the SGMII rate adapter layer to retrieve its speed
   setting from the MAC configuration register rather than the PHY.

In the previous commit, we removed (1) and (2) as phylink overwrites
the configuration set at that step.

Thus, the only functional aspect is (3), which is a boolean operation.
This means there is no need to store the actual speed, and just have a
boolean flag.

Convert the priv->ps member to a boolean, and rename it to
priv->reverse_sgmii_enable to make it more understandable.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P6i-0000000Aom3-1y2y@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:15 -07:00
Russell King (Oracle)
412d5f32cb net: stmmac: simplify stmmac_check_pcs_mode()
Now that we only support one mode, simplify stmmac_check_pcs_mode().

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P6d-0000000Aolw-1T7d@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:15 -07:00
Russell King (Oracle)
c7b0d7874d net: stmmac: move reverse-"pcs" mode setup to stmmac_check_pcs_mode()
The broken reverse-mode, selected by snps,ps-speed, is configured when
the platform provides a valid port speed and a PCS is being used.

Both these remain constant after the driver has probed, so the software
state doesn't need to be re-initialised each time stmmac_hw_setup() is
called (which is called at open and resume time.)

Move the software setup of reverse-mode to stmmac_check_pcs_mode()
which is called from the driver probe function.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P6Y-0000000Aolr-0vLH@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:15 -07:00
Russell King (Oracle)
70589b05a0 net: stmmac: remove RGMII "pcs" mode
Remove the RGMII "pcs" code in stmmac_check_pcs_mode() due to:

1) This should never have been conditional on a PCS being present, as
   when a core is synthesised using only RGMII, the PCS won't be present
   and priv->dma_cap.pcs will be false. Only multi-interface cores which
   have a PCS present would have detected RGMII.

2) STMMAC_PCS_RGMII has no effect since the broken netif_carrier and
   ethtool code was removed.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P6T-0000000Aoll-0Ify@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:15 -07:00
Russell King (Oracle)
aa1b6775ae net: stmmac: remove hw->ps xxx_core_init() hardware setup
After a lot of digging, it seems that the oddly named hw->ps member is
all about configuring the core for reverse SGMII. This member is set to
one of 0, SPEED_10, SPEED_100 or SPEED_1000 depending on
priv->plat->mac_port_sel_speed. On DT systems, this comes from the
"snps,ps-speed" DT property.

When set to a non-zero value, it:

1. Configures the MAC at initialisation time to operate at a specific
   speed. However, this will be overwritten by mac_link_up() when the
   link comes up (e.g. with the fixed-link parameters.)

   Note that dwxgmac2 wants to also support SPEED_2500 and SPEED_10000,
   but both these values are impossible.

2. It _incorrectly_ enables the transmitter (GMAC_CONFIG_TE) which
   makes no sense, rather than enabling the "transmit configuration"
   bit (GMAC_CONFIG_TC). Likely a typo.

3. It configures the SGMII rate adapter layer to retrieve its speed
   setting from the MAC configuration register rather than the PHY.

There are two ways forward here:

a) fixing (2) so that we set GMAC_CONFIG_TC. However, we have platforms
   that set the "snps,ps-speed" property and that work today. Fixing
   this will cause the RGMII, SGMII or SMII inband configuration to be
   transmitted, which will be a functional change which could cause a
   regression.

b) ripping out (1) and (2) as they are ineffective. This also has the
   possibility of regressions, but the patch author believes this risk
   is much lower than (a).

Therefore, this commit takes the approach in (b).

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P6N-0000000Aolg-3y0a@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:14 -07:00
Russell King (Oracle)
19064a58bd net: stmmac: remove unused PCS loopback support
Nothing calls stmmac_pcs_ctrl_ane() with the "loopback" argument set to
anything except zero, so this serves no useful purpose. Remove the
argument to reduce the code complexity.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P6I-0000000Aola-3Sih@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:14 -07:00
Russell King (Oracle)
ebc5d656b7 net: stmmac: remove PCS "mode" pause handling
Remove the "we always autoneg pause" forcing when the stmmac driver
decides that a "PCS" is present, which blocks passing the ethtool
pause calls to phylink when using SGMII mode.

This prevents the pause results being reported when a PHY is attached
using SGMII mode, or the pause settings being changed in SGMII mode.
There is no reason to prevent this.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P6D-0000000AolU-2zjv@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:14 -07:00
Russell King (Oracle)
2e2c878a31 net: stmmac: remove SGMII/RGMII/SMII interrupt handling
Now that the only use for the interrupt is to clear it and increment a
statistic counter (which is not that relevant anymore) remove all this
code and ensure that the interrupt remains disabled to avoid a stuck
interrupt.

dwmac-sun8i still uses this statistic counter, so it is inappropriate
for this patch to remove it.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P68-0000000AolO-2W5s@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:14 -07:00
Russell King (Oracle)
14f74bc6dc net: stmmac: remove xstats.pcs_* members
As a result of the previous commit, the pcs_link, pcs_duplex and
pcs_speed members are not used outside of the interrupt handling code,
and are only used to print their status using the misleading "Link is"
messages that bear no relation to the actual status of the link.

Remove the printing of these messages, these members, and the code
that decodes them from the hardware.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P63-0000000AolI-23Kf@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:14 -07:00
Russell King (Oracle)
813882ae22 net: stmmac: remove broken PCS code
Changing the netif_carrier_*() state behind phylink's back has always
been prohibited because it messes up phylink's state tracking, and
means that phylink no longer guarantees to call the mac_link_down()
and mac_link_up() methods at the appropriate times.  This was later
documented in the sfp-phylink network driver conversion guide.

stmmac was converted to phylink in 2019, but nothing was done with the
"PCS" code. Since then, apart from the updates as part of phylink
development, nothing has happened with stmmac to improve its use of
phylink, or even to address this point.

A couple of years ago, a has_integrated_pcs boolean was added by Bart,
which later became the STMMAC_FLAG_HAS_INTEGRATED_PCS flag, to avoid
manipulating the netif_carrier_*() state. This flag is mis-named,
because whenever the stmmac is synthesized for its native SGMII, TBI
or RTBI interfaces, it has an "integrated PCS". This boolean/flag
actually means "ignore the status from the integrated PCS".

Discussing with Bart, the reasons for this are lost to the winds of
time (which is why we should always document the reasons in the commit
message.)

RGMII also has in-band status, and the dwmac cores and stmmac code
supports this but with one bug that saves the day.

When dwmac cores are synthesised for RGMII only, they do not contain
an integrated PCS, and so priv->dma_cap.pcs is clear, which prevents
(incorrectly) the "RGMII PCS" being used, meaning we don't read the
in-band status. However, a core synthesised for RGMII and also SGMII,
TBI or RTBI will have this capability bit set, thus making these
code paths reachable.

The Jetson Xavier NX uses RGMII mode to talk to its PHY, and removing
the incorrect check for priv->dma_cap.pcs reveals the theoretical issue
with netif_carrier_*() manipulation is real:

dwc-eth-dwmac 2490000.ethernet eth0: Register MEM_TYPE_PAGE_POOL RxQ-0
dwc-eth-dwmac 2490000.ethernet eth0: PHY [stmmac-0:00] driver [RTL8211F Gigabit Ethernet] (irq=141)
dwc-eth-dwmac 2490000.ethernet eth0: No Safety Features support found
dwc-eth-dwmac 2490000.ethernet eth0: IEEE 1588-2008 Advanced Timestamp supported
dwc-eth-dwmac 2490000.ethernet eth0: registered PTP clock
dwc-eth-dwmac 2490000.ethernet eth0: configuring for phy/rgmii-id link mode
8021q: adding VLAN 0 to HW filter on device eth0
dwc-eth-dwmac 2490000.ethernet eth0: Adding VLAN ID 0 is not supported
Link is Up - 1000/Full
Link is Down
Link is Up - 1000/Full

This looks good until one realises that the phylink "Link" status
messages are missing, even when the RJ45 cable is reconnected. Nothing
one can do results in the interface working. The interrupt handler
(which prints those "Link is" messages) always wins over phylink's
resolve worker, meaning phylink never calls the mac_link_up() nor
mac_link_down() methods.

eth0 also sees no traffic received, and is unable to obtain a DHCP
address:

3: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group defa
ult qlen 1000
    link/ether e6:d3:6a:e6:92:de brd ff:ff:ff:ff:ff:ff
    RX: bytes  packets  errors  dropped overrun mcast
    0          0        0       0       0       0
    TX: bytes  packets  errors  dropped carrier collsns
    27686      149      0       0       0       0

With the STMMAC_FLAG_HAS_INTEGRATED_PCS flag set, which disables the
netif_carrier_*() manipulation then stmmac works normally:

dwc-eth-dwmac 2490000.ethernet eth0: Register MEM_TYPE_PAGE_POOL RxQ-0
dwc-eth-dwmac 2490000.ethernet eth0: PHY [stmmac-0:00] driver [RTL8211F Gigabit Ethernet] (irq=141)
dwc-eth-dwmac 2490000.ethernet eth0: No Safety Features support found
dwc-eth-dwmac 2490000.ethernet eth0: IEEE 1588-2008 Advanced Timestamp supported
dwc-eth-dwmac 2490000.ethernet eth0: registered PTP clock
dwc-eth-dwmac 2490000.ethernet eth0: configuring for phy/rgmii-id link mode
8021q: adding VLAN 0 to HW filter on device eth0
dwc-eth-dwmac 2490000.ethernet eth0: Adding VLAN ID 0 is not supported
Link is Up - 1000/Full
dwc-eth-dwmac 2490000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx

and packets can be transferred.

This clearly shows that when priv->hw->pcs is set, but
STMMAC_FLAG_HAS_INTEGRATED_PCS is clear, the driver reliably fails.

Discovering whether a platform falls into this category is impossible, as
it would require parsing all the dtsi and dts files to find out which use
the stmmac driver and whether any of them use RGMII or SGMII, and it also
depends on whether an external interface is being used. The kernel tree
likely doesn't contain all dts files either.

The only driver that sets this flag uses the qcom,sa8775p-ethqos
compatible, and uses SGMII or 2500BASE-X, but these are saved from this
problem by the incorrect check for priv->dma_cap.pcs.

So, we have to assume that for every other platform that uses SGMII
with stmmac is using an external PCS.

Moreover, ethtool output can be incorrect. With the full-duplex link
negotiated, ethtool reports:

        Speed: 1000Mb/s
        Duplex: Half

because with dwmac4, the full-duplex bit is in bit 16 of the status,
priv->xstats.pcs_duplex becomes BIT(16) for full duplex, but the
ethtool ksettings duplex member is u8 - so it becomes zero. Moreover,
the supported, advertised and link partner modes are all "not
reported".

Finally, ksettings_set() won't be able to set the advertisement on
a PHY if this PCS code is activated, which is incorrect when SGMII
is used with a PHY.

Thus, remove:
1. the incorrect netif_carrier_*() manipulation.
2. the broken ethtool ksettings code.

Given that all uses of STMMAC_FLAG_HAS_INTEGRATED_PCS are now gone,
remove the flag from stmmac.h and dwmac-qcom-ethqos.c.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://patch.msgid.link/E1v9P5y-0000000AolC-1QWH@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:17:14 -07:00
Randy Dunlap
3701572931 nl802154: fix some kernel-doc warnings
Correct multiple kernel-doc warnings in nl802154.h:

- Fix a typo on one enum name to avoid a kernel-doc warning.
- Drop 2 enum descriptions that are no longer needed.
- Mark 2 internal enums as "private:" so that kernel-doc is not needed
  for them.

Warning: nl802154.h:239 Enum value 'NL802154_CAP_ATTR_MAX_MAXBE' not described in enum 'nl802154_wpan_phy_capability_attr'
Warning: nl802154.h:239 Excess enum value '%NL802154_CAP_ATTR_MIN_CCA_ED_LEVEL' description in 'nl802154_wpan_phy_capability_attr'
Warning: nl802154.h:239 Excess enum value '%NL802154_CAP_ATTR_MAX_CCA_ED_LEVEL' description in 'nl802154_wpan_phy_capability_attr'
Warning: nl802154.h:369 Enum value '__NL802154_CCA_OPT_ATTR_AFTER_LAST' not described in enum 'nl802154_cca_opts'
Warning: nl802154.h:369 Enum value 'NL802154_CCA_OPT_ATTR_MAX' not described in enum 'nl802154_cca_opts'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251016035917.1148012-1-rdunlap@infradead.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 17:13:40 -07:00
Eric Dumazet
f8a55d5e71 net: add a fast path in __netif_schedule()
Cpus serving NIC interrupts and specifically TX completions are often
trapped in also restarting a busy qdisc (because qdisc was stopped by BQL
or the driver's own flow control).

When they call netdev_tx_completed_queue() or netif_tx_wake_queue(),
they call __netif_schedule() so that the queue can be run
later from net_tx_action() (involving NET_TX_SOFTIRQ)

Quite often, by the time the cpu reaches net_tx_action(), another cpu
grabbed the qdisc spinlock from __dev_xmit_skb(), and we spend too much
time spinning on this lock.

We can detect in __netif_schedule() if a cpu is already at a specific
point in __dev_xmit_skb() where we have the guarantee the queue will
be run.

This patch gives a 13% throughput increase on an IDPF NIC (200Gbit),
32 TX queues, sending UDP packets of 120 bytes.

This also helps __qdisc_run() to not force a NET_TX_SOFTIRQ
if another thread is waiting in __dev_xmit_skb()

Before:

sar -n DEV 5 5|grep eth1|grep Average
Average:         eth1   1496.44 52191462.56    210.00 13369396.90      0.00      0.00      0.00     54.76

After:

sar -n DEV 5 5|grep eth1|grep Average
Average:         eth1   1457.88 59363099.96    205.08 15206384.35      0.00      0.00      0.00     62.29

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251017145334.3016097-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 16:45:25 -07:00
Alok Tiwari
ba397fde5e net: phy: micrel: fix typos in comments
Fix several spelling and grammatical errors in comments across
micrel PHY drivers. Corrections include:
- "dealy" -> "delay"
- "autonegotation" -> "autonegotiation"
- "recheas" -> "reaches"
- "one" -> "on"
- "improvenent" -> "improvement"
- "intput" -> "input"

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251017193525.1457064-2-alok.a.tiwari@oracle.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 16:43:54 -07:00
Alok Tiwari
3dfdc98d1d net: phy: micrel: simplify return in ksz9477_phy_errata()
The ksz9477_phy_errata() function currently assigns the return value of
genphy_restart_aneg() to a variable and then immediately returns it:

    err = genphy_restart_aneg(phydev);
    if (err)
        return err;

    return err;

This can be simplified by directly returning the function call
result, as the intermediate variable and conditional are redundant.
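
With the intermediate variable gone, the function body reduces to just:

    return genphy_restart_aneg(phydev);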

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251017193525.1457064-1-alok.a.tiwari@oracle.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-20 16:43:54 -07:00
Daniel Borkmann
04a899573f bpf: Do not let BPF test infra emit invalid GSO types to stack
Yinhao et al. reported that their fuzzer tool was able to trigger a
skb_warn_bad_offload() from netif_skb_features() -> gso_features_check().
When a BPF program - triggered via BPF test infra - pushes the packet
to the loopback device via bpf_clone_redirect() then mentioned offload
warning can be seen. GSO-related features are then rightfully disabled.

We get into this situation due to convert___skb_to_skb() setting
gso_segs and gso_size but not gso_type. Technically, it makes sense
that this warning triggers since the GSO properties are malformed due
to the gso_type. Potentially, the gso_type could be marked non-trustworthy
through setting it at least to SKB_GSO_DODGY without any other specific
assumptions, but that also feels wrong given we should not go further
into the GSO engine in the first place.

The checks were added in 121d57af30 ("gso: validate gso_type in GSO
handlers") because there were malicious (syzbot) senders that combine
a protocol with a non-matching gso_type. If we would want to drop such
packets, gso_features_check() currently only returns feature flags via
netif_skb_features(), so one location for potentially dropping such skbs
could be validate_xmit_unreadable_skb(), but then otoh it would be
an additional check in the fast-path for a very corner case. Given
bpf_clone_redirect() is the only place where BPF test infra could emit
such packets, let's reject them right there.
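
A minimal sketch of such a rejection (illustrative, not necessarily the
exact check used by the patch):

  /* BPF test infra never sets gso_type, so a GSO skb without one is
   * malformed; refuse to clone/redirect it.
   */
  if (skb_is_gso(skb) && !skb_shinfo(skb)->gso_type)
          return -EINVAL;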

Fixes: 850a88cc40 ("bpf: Expose __sk_buff wire_len/gso_segs to BPF_PROG_TEST_RUN")
Fixes: cf62089b0e ("bpf: Add gso_size to __sk_buff")
Reported-by: Yinhao Hu <dddddd@hust.edu.cn>
Reported-by: Kaiyan Mei <M202472210@hust.edu.cn>
Reported-by: Dongliang Mu <dzm91@hust.edu.cn>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251020075441.127980-1-daniel@iogearbox.net
2025-10-20 13:16:10 -07:00
Mehdi Ben Hadj Khelifa
e5a82249d8 blk-mq: use struct_size() in kmalloc()
Change struct size calculation to use struct_size()
to align with new recommended practices[1] which quotes:
"Another common case to avoid is calculating the size of a structure with
a trailing array of others structures, as in:

header = kzalloc(sizeof(*header) + count * sizeof(*header->item),
                 GFP_KERNEL);

Instead, use the helper:

header = kzalloc(struct_size(header, item, count), GFP_KERNEL);"

Signed-off-by: Mehdi Ben Hadj Khelifa <mehdi.benhadjkhelifa@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:38:56 -06:00
Pavel Begunkov
5b6d8a032e io_uring: only publish fully handled mem region
io_register_mem_region() can try to remove a region right after
publishing it. This non-atomicity is annoying. Do it in two steps
similar to io_register_mem_region(), create memory first and publish it
once the rest of the handling is done. Remove now unused
io_create_region_mmap_safe(), which was assumed to be a temporary
solution from day one.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:56 -06:00
Pavel Begunkov
dec10a1ad1 io_uring/kbuf: use io_create_region for kbuf creation
kbuf ring is published by io_buffer_add_list(), which correctly protects
with mmap_lock, there is no need to use io_create_region_mmap_safe()
before as the region is not yet exposed to the userspace via mmap.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:56 -06:00
Pavel Begunkov
6e9752977c io_uring: don't free never created regions
io_free_region() tolerates empty regions but there is no reason for that
either. If the first io_create_region() in io_register_resize_rings()
fails, just return the error without attempting to clean it up.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:56 -06:00
Pavel Begunkov
0c89dbbcad io_uring: remove extra args from io_register_free_rings
io_register_free_rings() doesn't use its "struct io_uring_params"
parameter, remove it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:56 -06:00
Pavel Begunkov
4c53e392a1 io_uring: use no mmap safe region helpers on resizing
io_create_region_mmap_safe() is only needed when the created region is
exposed to userspace code via mmap. io_register_resize_rings() creates
them locally on stack, so the no mmap_safe version of the helper is
enough.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:56 -06:00
Pavel Begunkov
284306f6e6 io_uring: sanity check sizes before attempting allocation
It's a good practice to validate parameters before doing any heavy stuff
like queue allocations. Do that for io_allocate_scq_urings().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:56 -06:00
Pavel Begunkov
12aced0a55 io_uring: deduplicate array_size in io_allocate_scq_urings
A minor cleanup precomputing the sq size first instead of branching
array_size() in io_allocate_scq_urings().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:56 -06:00
Jens Axboe
ab673c1bca io_uring/waitid: use io_waitid_remove_wq() consistently
Use it everywhere that the wait_queue_entry is removed from the head,
and be a bit more cautious in zeroing out iw->head whenever the entry is
removed from the list.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:48 -06:00
Jens Axboe
a48c0cbf28 io_uring/waitid: have io_waitid_complete() remove wait queue entry
Both callers of this need the entry potentially removed, so shift the
removal into the completion side and kill it from the two callers.

While at it, add a helper for removing the wait_queue_entry based
on the passed in io_kiocb.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:48 -06:00
Jens Axboe
7be20254a7 io_uring: unify task_work cancelation checks
Rather than do per-tw checking, which needs to dip into the task_struct
for checking flags, do it upfront before running task_work. This places
a 'cancel' member in io_tw_token_t, which is assigned before running
task_work for that given ctx.

This is both more efficient in doing it upfront rather than for every
task_work, and it means that io_should_terminate_tw() can be made
private in io_uring.c rather than need to be called by various
callbacks of task_work.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:48 -06:00
Bart Van Assche
d60055cf52 block/mq-deadline: Switch back to a single dispatch list
Commit c807ab520f ("block/mq-deadline: Add I/O priority support")
modified the behavior of request flag BLK_MQ_INSERT_AT_HEAD from
dispatching a request before other requests into dispatching a request
before other requests with the same I/O priority. This is not correct since
BLK_MQ_INSERT_AT_HEAD is used when requeuing requests and also when a flush
request is inserted.  Both types of requests should be dispatched as soon
as possible. Hence, make the mq-deadline I/O scheduler again ignore the I/O
priority for BLK_MQ_INSERT_AT_HEAD requests.

Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Yu Kuai <yukuai@kernel.org>
Reported-by: chengkaitao <chengkaitao@kylinos.cn>
Closes: https://lore.kernel.org/linux-block/20251009155253.14611-1-pilgrimtao@gmail.com/
Fixes: c807ab520f ("block/mq-deadline: Add I/O priority support")
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:42 -06:00
Bart Van Assche
93a358af59 block/mq-deadline: Introduce dd_start_request()
Prepare for adding a second caller of this function. No functionality
has been changed.

Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Yu Kuai <yukuai@kernel.org>
Cc: chengkaitao <chengkaitao@kylinos.cn>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-10-20 10:37:42 -06:00
Puranjay Mohan
7361c86485 selftests/bpf: Fix list_del() in arena list
The __list_del function doesn't set the previous node's next pointer to
the next node of the node to be deleted. It just updates the local variable
and not the actual pointer in the previous node.

The test was passing up till now because the bpf code is doing bpf_free()
after list_del and therefore reading head->first from userspace will
read all zeroes. But after arena_list_del() is finished, head->first should
point to NULL.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20251017141727.51355-1-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-18 19:27:26 -07:00
Chu Guangqing
b74938a3bd samples/bpf: Fix spelling typos in samples/bpf
do_hbm_test.sh:
The comment incorrectly used "upcomming" instead of "upcoming".

hbm.c
The comment incorrectly used "Managment" instead of "Management".
The comment incorrectly used "Currrently" instead of "Currently".

tcp_cong_kern.c
The comment incorrectly used "deteremined" instead of "determined".

tracex1.bpf.c
The comment incorrectly used "loobpack" instead of "loopback".

Signed-off-by: Chu Guangqing <chuguangqing@inspur.com>
Link: https://lore.kernel.org/r/20251015015024.2212-2-chuguangqing@inspur.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-18 19:26:23 -07:00
Yonghong Song
4f8543b5f2 selftests/bpf: Fix selftest verif_scale_strobemeta failure with llvm22
With the latest llvm22, I hit the verif_scale_strobemeta selftest failure
below:
  $ ./test_progs -n 618
  libbpf: prog 'on_event': BPF program load failed: -E2BIG
  libbpf: prog 'on_event': -- BEGIN PROG LOAD LOG --
  BPF program is too large. Processed 1000001 insn
  verification time 7019091 usec
  stack depth 488
  processed 1000001 insns (limit 1000000) max_states_per_insn 28 total_states 33927 peak_states 12813 mark_read 0
  -- END PROG LOAD LOG --
  libbpf: prog 'on_event': failed to load: -E2BIG
  libbpf: failed to load object 'strobemeta.bpf.o'
  scale_test:FAIL:expect_success unexpected error: -7 (errno 7)
  #618     verif_scale_strobemeta:FAIL

But if I increase the verification insn limit from 1M to 10M, the above
test_progs run will actually succeed. Below is the result from veristat:
  $ ./veristat strobemeta.bpf.o
  Processing 'strobemeta.bpf.o'...
  File              Program   Verdict  Duration (us)    Insns  States  Program size  Jited size
  ----------------  --------  -------  -------------  -------  ------  ------------  ----------
  strobemeta.bpf.o  on_event  success       90250893  9777685  358230         15954       80794
  ----------------  --------  -------  -------------  -------  ------  ------------  ----------
  Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs.

Further debugging shows the llvm commit [1] is responsible for the verification
failure, as it tries to convert certain switch statements to if-conditions. Such
a change may cause a different transformation compared to the original switch statement.

In the case of the strobemeta.c bpf program, the initial llvm ir for the read_int_var() function is
  define internal void @read_int_var(ptr noundef %0, i64 noundef %1, ptr noundef %2,
      ptr noundef %3, ptr noundef %4) #2 !dbg !535 {
    %6 = alloca ptr, align 8
    %7 = alloca i64, align 8
    %8 = alloca ptr, align 8
    %9 = alloca ptr, align 8
    %10 = alloca ptr, align 8
    %11 = alloca ptr, align 8
    %12 = alloca i32, align 4
    ...
    %20 = icmp ne ptr %19, null, !dbg !561
    br i1 %20, label %22, label %21, !dbg !562

  21:                                               ; preds = %5
    store i32 1, ptr %12, align 4
    br label %48, !dbg !563

  22:
    %23 = load ptr, ptr %9, align 8, !dbg !564
    ...

  47:                                               ; preds = %38, %22
    store i32 0, ptr %12, align 4, !dbg !588
    br label %48, !dbg !588

  48:                                               ; preds = %47, %21
    call void @llvm.lifetime.end.p0(ptr %11) #4, !dbg !588
    %49 = load i32, ptr %12, align 4
    switch i32 %49, label %51 [
      i32 0, label %50
      i32 1, label %50
    ]

  50:                                               ; preds = %48, %48
    ret void, !dbg !589

  51:                                               ; preds = %48
    unreachable
  }

Note that the above 'switch' statement is added by the clang frontend.
Without [1], the switch statement will survive until SelectionDAG,
so the switch statement acts like a 'barrier' and prevents some
transformations involving code both 'before' and 'after' the switch statement.

But with [1], the switch statement will be removed during middle-end
optimization, and later middle-end passes (esp. after inlining) have more
freedom to reorder the code.

The following is the related source code:

  static void *calc_location(struct strobe_value_loc *loc, void *tls_base):
        bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv);
        /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
        return tls_ptr && tls_ptr != (void *)-1
                ? tls_ptr + tls_index.offset
                : NULL;

  In read_int_var() func, we have:
        void *location = calc_location(&cfg->int_locs[idx], tls_base);
        if (!location)
                return;

        bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
        ...

The static func calc_location() is called inside read_int_var(). The asm code
without [1]:
     77: .123....89 (85) call bpf_probe_read_user#112
     78: ........89 (79) r1 = *(u64 *)(r10 -368)
     79: .1......89 (79) r2 = *(u64 *)(r10 -8)
     80: .12.....89 (bf) r3 = r2
     81: .123....89 (0f) r3 += r1
     82: ..23....89 (07) r2 += 1
     83: ..23....89 (79) r4 = *(u64 *)(r10 -464)
     84: ..234...89 (a5) if r2 < 0x2 goto pc+13
     85: ...34...89 (15) if r3 == 0x0 goto pc+12
     86: ...3....89 (bf) r1 = r10
     87: .1.3....89 (07) r1 += -400
     88: .1.3....89 (b4) w2 = 16
In this case, 'r2 < 0x2' and 'r3 == 0x0' go to the null 'location' place,
so the verifier actually prefers to do verification first at 'r1 = r10' etc.

The asm code with [1]:
    119: .123....89 (85) call bpf_probe_read_user#112
    120: ........89 (79) r1 = *(u64 *)(r10 -368)
    121: .1......89 (79) r2 = *(u64 *)(r10 -8)
    122: .12.....89 (bf) r3 = r2
    123: .123....89 (0f) r3 += r1
    124: ..23....89 (07) r2 += -1
    125: ..23....89 (a5) if r2 < 0xfffffffe goto pc+6
    126: ........89 (05) goto pc+17
    ...
    144: ........89 (b4) w1 = 0
    145: .1......89 (6b) *(u16 *)(r8 +80) = r1
In this case, if 'r2 < 0xfffffffe' is true, the control will go to the
non-null 'location' branch, so 'goto pc+17' will actually go to the
null 'location' branch. This seems to cause a tremendous amount of
verification state.

To fix the issue, rewrite the following code
  return tls_ptr && tls_ptr != (void *)-1
                ? tls_ptr + tls_index.offset
                : NULL;
to an if/then statement, in the hope that these explicit if/then statements
remain sticky during middle-end optimizations.
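
A minimal sketch of such a rewrite (the exact change in the selftest may
differ in detail), using the same variables as the snippet above:

  if (!tls_ptr || tls_ptr == (void *)-1)
          return NULL;
  return tls_ptr + tls_index.offset;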

Tested with llvm20 and llvm21 as well, and all strobemeta-related selftests
pass.

  [1] https://github.com/llvm/llvm-project/pull/161000

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20251014051639.1996331-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-18 19:25:03 -07:00
Alexei Starovoitov
7a9f475d52 Merge branch 'bpf-mm-related-minor-changes'
Yafang Shao  says:
====================
These two minor patches were developed during the implementation of
BPF-THP:
  https://lwn.net/Articles/1042138/

As suggested by Andrii, they are being submitted separately.
====================

Link: https://patch.msgid.link/20251016063929.13830-1-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-18 19:23:46 -07:00
Yafang Shao
7484e7cd8a bpf: mark vma->{vm_mm,vm_file} as __safe_trusted_or_null
The vma->vm_mm might be NULL, and it can be accessed outside of RCU. Thus,
we can mark it as trusted_or_null. With this change, BPF helpers can safely
access vma->vm_mm to retrieve the associated mm_struct from the VMA.
Then we can make policy decisions based on the VMA.

The "trusted" annotation enables direct access to vma->vm_mm within kfuncs
marked with KF_TRUSTED_ARGS or KF_RCU, such as bpf_task_get_cgroup1() and
bpf_task_under_cgroup(). Conversely, "null" enforcement requires all
callsites using vma->vm_mm to perform NULL checks.

The lsm selftest must be modified because it directly accesses vma->vm_mm
without a NULL pointer check; otherwise it will break due to this
change.

For the VMA based THP policy, the use case is as follows,

  @mm = @vma->vm_mm; // vm_area_struct::vm_mm is trusted or null
  if (!@mm)
      return;
  bpf_rcu_read_lock(); // rcu lock must be held to dereference the owner
  @owner = @mm->owner; // mm_struct::owner is rcu trusted or null
  if (!@owner)
    goto out;
  @cgroup1 = bpf_task_get_cgroup1(@owner, MEMCG_HIERARCHY_ID);

  /* make the decision based on the @cgroup1 attribute */

  bpf_cgroup_release(@cgroup1); // release the associated cgroup
out:
  bpf_rcu_read_unlock();

PSI memory information can be obtained from the associated cgroup to inform
policy decisions. Since upstream PSI support is currently limited to cgroup
v2, the following example demonstrates cgroup v2 implementation:

  @owner = @mm->owner;
  if (@owner) {
      // @ancestor_cgid is user-configured
      @ancestor = bpf_cgroup_from_id(@ancestor_cgid);
      if (bpf_task_under_cgroup(@owner, @ancestor)) {
          @psi_group = @ancestor->psi;

          /* Extract PSI metrics from @psi_group and
           * implement policy logic based on the values
           */

      }
  }

The vma::vm_file can also be marked with __safe_trusted_or_null.

No additional selftests are required since vma->vm_file and vma->vm_mm are
already validated in the existing selftest suite.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Link: https://lore.kernel.org/r/20251016063929.13830-3-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-18 19:23:08 -07:00
Yafang Shao
ec8e3e27a1 bpf: mark mm->owner as __safe_rcu_or_null
When CONFIG_MEMCG is enabled, we can access mm->owner under RCU. The
owner can be NULL. With this change, BPF helpers can safely access
mm->owner to retrieve the associated task from the mm. We can then make
policy decisions based on the task attribute.

The typical use case is as follows,

  bpf_rcu_read_lock(); // rcu lock must be held for rcu trusted field
  @owner = @mm->owner; // mm_struct::owner is rcu trusted or null
  if (!@owner)
      goto out;

  /* Do something based on the task attribute */

out:
  bpf_rcu_read_unlock();

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Link: https://lore.kernel.org/r/20251016063929.13830-2-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-18 19:23:08 -07:00
Tiezhu Yang
c67f4ae737 selftests/bpf: Silence unused-but-set build warnings
There are some set-but-not-used build errors when compiling bpf selftests
with the latest upstream mainline GCC. The initial approach was to add the
__maybe_unused attribute to the variables, but it is better to just add the
option -Wno-unused-but-set-variable to CFLAGS in the Makefile to disable the
errors instead of hacking the tests.

  tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c:229:36:
  error: variable ‘n_matches_after_delete’ set but not used [-Werror=unused-but-set-variable=]

  tools/testing/selftests/bpf/map_tests/lpm_trie_map_basic_ops.c:229:25:
  error: variable ‘n_matches’ set but not used [-Werror=unused-but-set-variable=]

  tools/testing/selftests/bpf/prog_tests/bpf_cookie.c:426:22:
  error: variable ‘j’ set but not used [-Werror=unused-but-set-variable=]

  tools/testing/selftests/bpf/prog_tests/find_vma.c:52:22:
  error: variable ‘j’ set but not used [-Werror=unused-but-set-variable=]

  tools/testing/selftests/bpf/prog_tests/perf_branches.c:67:22:
  error: variable ‘j’ set but not used [-Werror=unused-but-set-variable=]

  tools/testing/selftests/bpf/prog_tests/perf_link.c:15:22:
  error: variable ‘j’ set but not used [-Werror=unused-but-set-variable=]
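
For reference, the warning fires on any local variable that is assigned but
never read afterwards; a hypothetical reduced example (not taken from the
selftests) is:

  static int count_items(int n)
  {
          int i, j = 0;

          for (i = 0; i < n; i++)
                  j = i;          /* 'j' is assigned but never read: GCC warns */

          return n;
  }

With -Werror this becomes a hard build error; -Wno-unused-but-set-variable in
CFLAGS silences it for the whole selftests build.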

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Link: https://lore.kernel.org/r/20251018082815.20622-1-yangtiezhu@loongson.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-18 19:21:29 -07:00
Alexei Starovoitov
50de48a4dd Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf at 6.18-rc2
Cross-merge BPF and other fixes after downstream PR.

No conflicts.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-18 18:20:57 -07:00
Takashi Iwai
38e3a94084 wifi: ath12k: Add MODULE_FIRMWARE() entries
Some systems, such as live-image or installer environments, require the
firmware information for each module declared by MODULE_FIRMWARE(), which is
currently missing in the ath12k driver.

To address this, this patch adds the MODULE_FIRMWARE() entries.
As in the ath11k driver, we can just add the currently used firmware
entries for QCN9274 and WCN7850 with wildcards.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251003082955.11436-1-tiwai@suse.de
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-18 08:36:14 -07:00
Jakub Kicinski
88224095b4 Merge branch 'net-dsa-lantiq_gswip-clean-up-and-improve-vlan-handling'
Daniel Golle says:

====================
net: dsa: lantiq_gswip: clean up and improve VLAN handling

This series was developed by Vladimir Oltean to improve and clean up the
VLAN handling logic in the Lantiq GSWIP DSA driver.

As Vladimir currently doesn't have the availability to take care of the
submission process, we agreed that I would send the patches on his
behalf.

The series focuses on consolidating the VLAN management paths for both
VLAN-unaware and VLAN-aware bridges, simplifying internal logic, and
removing legacy or redundant code. It also fixes a number of subtle
inconsistencies regarding VLAN ID 0 handling, bridge FDB entries, and
brings the driver into shape to permit dynamic changes to the VLAN
filtering state.

Notable changes include:

 - Support for bridge FDB entries on the CPU port

 - Consolidation of gswip_vlan_add_unaware() and gswip_vlan_add_aware()
   into a unified implementation

 - Removal of legacy VLAN configuration options and redundant
   assignments

 - Improved handling of VLAN ID 0 and PVID behavior

 - Better validation and error reporting in VLAN removal paths

 - Support for dynamic VLAN filtering configuration changes

Overall, this refactor improves readability and maintainability of the
Lantiq GSWIP DSA driver. It also results in all local-termination.sh
tests now passing, and slightly improves the results of
bridge-vlan-{un,}aware.sh.

All patches have been authored by Vladimir Oltean; a small unintended
functional change in patch "net: dsa: lantiq_gswip: merge
gswip_vlan_add_unaware() and gswip_vlan_add_aware()" has been ironed out
and some of the commit descriptions were improved by me. Apart from that,
I'm only handling the submission and will help with follow-up
discussions or review feedback as needed.

Despite the fact that some changes here do actually fix things (in the
sense that selftests which would previously FAIL now PASS) we decided
that it would be best for this series of patches to go via net-next.
If requested some of it can still be ported to stable kernels later on.
====================

Link: https://patch.msgid.link/cover.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:55:03 -07:00
Vladimir Oltean
1f89ed0ebf net: dsa: lantiq_gswip: treat VID 0 like the PVID
Documentation/networking/switchdev.rst says that VLAN-aware bridges must
treat packets tagged with VID 0 the same as untagged. It appears from
the documentation that setting the GSWIP_PCE_VCTRL_VID0 flag (which this
driver already had defined) might achieve this.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/b220ac149922839a261b754202c05df5bb253c98.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:54:59 -07:00
Vladimir Oltean
3bb500caf6 net: dsa: lantiq_gswip: drop untagged on VLAN-aware bridge ports with no PVID
Implement the required functionality, as written in
Documentation/networking/switchdev.rst section "Bridge VLAN filtering",
by using the "VLAN Ingress Tag Rule" feature of the switch.

The bit field definitions for this were found while browsing the Intel
dual BSD/GPLv2 licensed drivers for this switch IP.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/787aa807d00b726d75db2a40add215c8b8ba7466.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:54:59 -07:00
Vladimir Oltean
a576276266 net: dsa: lantiq_gswip: put a more descriptive error print in gswip_vlan_remove()
Improve the error message printed in case of a port VLAN entry not being
found upon removal.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/abd4ec58e0f0f53eb3d7027097a20af0bd7b1d6d.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:54:59 -07:00
Vladimir Oltean
7ed1965f10 net: dsa: lantiq_gswip: remove vlan_aware and pvid arguments from gswip_vlan_remove()
"bool pvid" is unused since commit "net: dsa: lantiq_gswip: remove
legacy configure_vlan_while_not_filtering option".

"bool vlan_aware" shouldn't have a role in finding the bridge VLAN.
It should be identified by VID regardless of VLAN-aware or VLAN-unaware.
The driver sets up VID 0 for the VLAN-unaware PVID.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/c63f89ca19269ef6c8bf00a62cacc739164b4441.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:54:59 -07:00
Vladimir Oltean
96a91e6eeb net: dsa: lantiq_gswip: disallow changes to privately set up VID 0
User space can force the altering of VID 0, even though it was privately set
up by this driver.

For example, when the port joins a VLAN-aware bridge,
dsa_user_manage_vlan_filtering() will set NETIF_F_HW_VLAN_CTAG_FILTER.
If the port is subsequently brought up and CONFIG_VLAN_8021Q is enabled,
the vlan_vid0_add() function will want to make sure we are capable of
accepting packets tagged with VID 0.

Generally, DSA/switchdev drivers want to suppress that bit of help from
the 8021q layer, and handle VID 0 filters themselves. The 8021q layer
might actually even be detrimental, because VLANs added through
vlan_vid_add() pass through dsa_user_vlan_rx_add_vid(), which is
documented as this:

	/* This API only allows programming tagged, non-PVID VIDs */
	.flags = 0,

so it will force VID 0 to be reconfigured as egress-tagged, non-PVID.
Whereas the driver configures it as PVID and egress-untagged, the exact
opposite.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/9f68340c34b5312c3b8c6c7ecf3cfce574a3f65d.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:54:59 -07:00
Vladimir Oltean
ab3ce58559 net: dsa: lantiq_gswip: permit dynamic changes to VLAN filtering state
The driver should now tolerate these changes, now that the PVID is
automatically recalculated on a VLAN awareness state change.

The VLAN-unaware PVID must be installed to hardware even if the
joined bridge is currently VLAN-aware. Otherwise, when the bridge VLAN
filtering state dynamically changes to VLAN-unaware later, this PVID
will be missing.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/c58759074fb699581336dc2c2c6bf106257b134e.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:54:59 -07:00
Vladimir Oltean
21c3237c60 net: dsa: lantiq_gswip: remove legacy configure_vlan_while_not_filtering option
This driver doesn't support dynamic VLAN filtering changes, for simplicity.
It expects that on a port, either gswip_vlan_add_unaware() or
gswip_vlan_add_aware() is called, but not both.

When !br_vlan_enabled(), the configure_vlan_while_not_filtering = false
option is exactly what will prevent calls to gswip_port_vlan_add() from
being issued by DSA.

In fact, at the time these features were submitted:
https://patchwork.ozlabs.org/project/netdev/patch/20190501204506.21579-3-hauke@hauke-m.de/
"configure_vlan_while_not_filtering = false" did not even have a name,
it was implicit behaviour. It only became legacy in commit 54a0ed0df4
("net: dsa: provide an option for drivers to always receive bridge
VLANs").

Section "Bridge VLAN filtering" of Documentation/networking/switchdev.rst
describes the exact set of rules. Notably, the PVID of the port must
follow the VLAN awareness state of the bridge port. A VLAN-unaware
bridge port should not respond to the addition of a bridge VLAN with the
PVID flag. In fact, the pvid_change() test in
tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh tests
exactly this.

The lantiq_gswip driver indeed does not respond to the addition of PVID
VLANs while VLAN-unaware in the way described above, but only because of
configure_vlan_while_not_filtering. Our purpose here is to get rid of
configure_vlan_while_not_filtering, so we must add more complex logic
which follows the VLAN awareness state and walks through the Active VLAN
table entries, to find the index of the PVID register that should be
committed to hardware on each port.

As a side-effect of now having a proper implementation to assign the
PVID, all the "VLAN upper: ..." tests of the local_termination.sh
selftests, which would previously all FAIL, now all PASS (or XFAIL, but
that's ok).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Tested-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/47dab8a8b69ebb92624b9795b723114475d3fe4e.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:54:58 -07:00
Vladimir Oltean
b92068755e net: dsa: lantiq_gswip: merge gswip_vlan_add_unaware() and gswip_vlan_add_aware()
The two functions largely duplicate functionality. The differences
consist in:

- the "fid" passed to gswip_vlan_active_create(). The unaware variant
  always passes -1, the aware variant passes fid = priv->vlans[i].fid,
  where i is an index into priv->vlans[] for which priv->vlans[i].bridge
  is equal to the given bridge.

- the "vid" is not passed to gswip_vlan_add_unaware(). It is implicitly
  GSWIP_VLAN_UNAWARE_PVID (zero).

- The "untagged" is not passed to gswip_vlan_add_unaware(). It is
  implicitly true. Also, the CPU port must not be a tag member of the
  PVID used for VLAN-unaware bridging.

- The "pvid" is not passed to gswip_vlan_add_unaware(). It is implicitly
  true.

- The GSWIP_PCE_DEFPVID(port) register is written by the aware variant
  with an "idx", but with a hardcoded 0 by the unaware variant.

Merge the two functions into a single unified function without any
functional changes.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/2be190701d4c17038ce4b8047f9fb0bdf8abdf6e.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:54:58 -07:00
Vladimir Oltean
8f5c71e444 net: dsa: lantiq_gswip: remove duplicate assignment to vlan_mapping.val[0]
When idx == -1 in gswip_vlan_add(), we set vlan_mapping.val[0] = vid,
even though we do the exact same thing again outside the if/else block.

Remove the duplicate assignment.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/039ecb48e038cea856a9a6230ad1543db2bc382d.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:54:58 -07:00
Vladimir Oltean
92790e6c11 net: dsa: lantiq_gswip: define VLAN ID 0 constant
This patch adds an explicit definition for VID 0 to the Lantiq GSWIP DSA
driver, clarifying its special meaning.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/e8862239d0bb727723cf60947d2262473b46c96d.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:54:58 -07:00
Vladimir Oltean
e29bbd73ad net: dsa: lantiq_gswip: support bridge FDB entries on the CPU port
Currently, the driver takes the bridge from dsa_port_bridge_dev_get(),
which only works for user ports. This is why it has to ignore FDB
entries installed on the CPU port.

Commit c26933639b ("net: dsa: request drivers to perform FDB
isolation") introduced the possibility of getting the originating bridge
from the passed dsa_db argument, so let's do that instead. This way, we
can act on the local FDB entries coming from the bridge.

Note that we do not expect FDB events for the DSA_DB_PORT database,
because this driver doesn't fulfill the dsa_switch_supports_uc_filtering()
requirements. So we can just return -EOPNOTSUPP and expect it will never
be triggered.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://patch.msgid.link/ed9d847c0356f0fec81422bdad9ebdcc6a59da79.1760566491.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:54:58 -07:00
Jakub Kicinski
e90576829c Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Martin KaFai Lau says:

====================
pull-request: bpf-next 2025-10-16

We've added 6 non-merge commits during the last 1 day(s) which contain
a total of 18 files changed, 577 insertions(+), 38 deletions(-).

The main changes are:

1) Bypass the global per-protocol memory accounting either by setting
   a netns sysctl or using bpf_setsockopt in a bpf program,
   from Kuniyuki Iwashima.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
  selftests/bpf: Add test for sk->sk_bypass_prot_mem.
  bpf: Introduce SK_BPF_BYPASS_PROT_MEM.
  bpf: Support bpf_setsockopt() for BPF_CGROUP_INET_SOCK_CREATE.
  net: Introduce net.core.bypass_prot_mem sysctl.
  net: Allow opt-out from global protocol memory accounting.
  tcp: Save lock_sock() for memcg in inet_csk_accept().
====================

Link: https://patch.msgid.link/20251016204539.773707-1-martin.lau@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:20:42 -07:00
Eric Biggers
37a183d3b7 tcp: Convert tcp-md5 to use MD5 library instead of crypto_ahash
Make tcp-md5 use the MD5 library API (added in 6.18) instead of the
crypto_ahash API.  This is much simpler and also more efficient:

- The library API just operates on struct md5_ctx.  Just allocate this
  struct on the stack instead of using a pool of pre-allocated
  crypto_ahash and ahash_request objects.

- The library API accepts standard pointers and doesn't require
  scatterlists.  So, for hashing the headers just use an on-stack buffer
  instead of a pool of pre-allocated kmalloc'ed scratch buffers.

- The library API never fails.  Therefore, checking for MD5 hashing
  errors is no longer necessary.  Update tcp_v4_md5_hash_skb(),
  tcp_v6_md5_hash_skb(), tcp_v4_md5_hash_hdr(), tcp_v6_md5_hash_hdr(),
  tcp_md5_hash_key(), tcp_sock_af_ops::calc_md5_hash, and
  tcp_request_sock_ops::calc_md5_hash to return void instead of int.

- The library API provides direct access to the MD5 code, eliminating
  unnecessary overhead such as indirect function calls and scatterlist
  management.  Microbenchmarks of tcp_v4_md5_hash_skb() on x86_64 show a
  speedup from 7518 to 7041 cycles (6% fewer) with skb->len == 1440, or
  from 1020 to 678 cycles (33% fewer) with skb->len == 140.
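
As a rough illustration of the pattern described above (assuming the library
exposes the usual md5_init()/md5_update()/md5_final() calls on a
stack-allocated struct md5_ctx, as the other lib/crypto hash APIs do; the
helper name below is illustrative), hashing a key followed by a header buffer
looks roughly like:

  #include <crypto/md5.h>

  static void hash_key_and_hdr(const u8 *key, size_t keylen,
                               const u8 *hdr, size_t hdrlen,
                               u8 out[MD5_DIGEST_SIZE])
  {
          struct md5_ctx ctx;             /* on the stack, no pre-allocated pool */

          md5_init(&ctx);
          md5_update(&ctx, key, keylen);  /* plain pointers, no scatterlists */
          md5_update(&ctx, hdr, hdrlen);
          md5_final(&ctx, out);           /* cannot fail, so no error path */
  }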

Since tcp_sigpool_hash_skb_data() can no longer be used, add a function
tcp_md5_hash_skb_data() which is specialized to MD5.  Of course, to the
extent that this duplicates any code, it's well worth it.

To preserve the existing behavior of TCP-MD5 support being disabled when
the kernel is booted with "fips=1", make tcp_md5_do_add() check
fips_enabled itself.  Previously it relied on the error from
crypto_alloc_ahash("md5") being bubbled up.  I don't know for sure that
this is actually needed, but this preserves the existing behavior.

Tested with bidirectional TCP-MD5, both IPv4 and IPv6, between a kernel
that includes this commit and a kernel that doesn't include this commit.

(Side note: please don't use TCP-MD5!  It's cryptographically weak.  But
as long as Linux supports it, it might as well be implemented properly.)

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Link: https://patch.msgid.link/20251014215836.115616-1-ebiggers@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 17:14:54 -07:00
Carlos Llamas
f578ff4c53 selftests/net: io_uring: fix unknown errnum values
The io_uring functions return negative error values, but error() expects
these to be positive to properly match them to an errno string. Fix this
to make sure the correct error descriptions are displayed upon failure.
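
A hypothetical sketch of the kind of fix described (the call site shown here
is illustrative, not necessarily the one in the selftest):

  ret = io_uring_submit(&ring);
  if (ret < 0)
          error(1, -ret, "io_uring_submit");  /* negate so error() gets a positive errno */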

Signed-off-by: Carlos Llamas <cmllamas@google.com>
Link: https://patch.msgid.link/20251016182538.3790567-1-cmllamas@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 16:57:53 -07:00
Heiner Kallweit
3dc2a17efc r8169: reconfigure rx unconditionally before chip reset when resuming
There's a good chance that more chip versions suffer from the same
hw issue. So let's reconfigure rx unconditionally before the chip reset
when resuming. This shouldn't have any side effect on unaffected chip
versions.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/a5c2e2d2-226f-4896-b8f6-45e2d91f0e24@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 16:57:19 -07:00
Florian Westphal
2af8ff1e47 net: Kconfig: discourage drop_monitor enablement
Quoting Eric Dumazet:
"I do not understand the fascination with net/core/drop_monitor.c [..]
misses all the features, flexibility, scalability  that 'perf',
eBPF tracing, bpftrace, .... have today."

Reword the DROP_MONITOR kconfig help text to clearly state that it's not
related to perf-based drop monitoring and that it's safe to disable
this unless support for the older netlink-based tools is needed.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251016115147.18503-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 16:29:26 -07:00
Jakub Kicinski
38f3cd3703 Merge branch 'net-avoid-ehash-lookup-races'
Xuanqiang Luo says:

====================
net: Avoid ehash lookup races

After replacing R/W locks with RCU in commit 3ab5aee7fe ("net: Convert
TCP & DCCP hash tables to use RCU / hlist_nulls"), a race window emerged
during the switch from reqsk/sk to sk/tw.

Now that both timewait sock (tw) and full sock (sk) reside on the same
ehash chain, it is appropriate to introduce hlist_nulls replace
operations, to eliminate the race conditions caused by this window.

I previously sent another version of this patch, attempting to avoid the
issue using a lock mechanism. However, that approach turned out to have some
problems, so I've switched to the "replace" method in the current patches to
resolve the issue.
For details, refer to:
https://lore.kernel.org/netdev/20250903024406.2418362-1-xuanqiang.luo@linux.dev/

Before I encountered this type of issue recently, I found there had been
several historical discussions about it. Therefore, I'm adding this
background information for those interested to reference:
1. https://lore.kernel.org/lkml/20230118015941.1313-1-kerneljasonxing@gmail.com/
2. https://lore.kernel.org/netdev/20230606064306.9192-1-duanmuquan@baidu.com/
====================

Link: https://patch.msgid.link/20251015020236.431822-1-xuanqiang.luo@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 16:08:46 -07:00
Xuanqiang Luo
b8ec80b130 inet: Avoid ehash lookup race in inet_twsk_hashdance_schedule()
Since ehash lookups are lockless, if another CPU is concurrently converting
sk to tw, fetching the newly inserted tw with tw->tw_refcnt == 0 causes a
lookup failure.

The call trace map is drawn as follows:
   CPU 0                                CPU 1
   -----                                -----
				     inet_twsk_hashdance_schedule()
				     spin_lock()
				     inet_twsk_add_node_rcu(tw, ...)
__inet_lookup_established()
(find tw, failure due to tw_refcnt = 0)
				     __sk_nulls_del_node_init_rcu(sk)
				     refcount_set(&tw->tw_refcnt, 3)
				     spin_unlock()

By replacing sk with tw atomically via hlist_nulls_replace_init_rcu() after
setting tw_refcnt, we ensure that tw is either fully initialized or not
visible to other CPUs, eliminating the race.

It's worth noting that we held lock_sock() before the replacement, so
there's no need to check if sk is hashed. Thanks to Kuniyuki Iwashima!

Fixes: 3ab5aee7fe ("net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls")
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Signed-off-by: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251015020236.431822-4-xuanqiang.luo@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 16:08:43 -07:00
Xuanqiang Luo
1532ed0d07 inet: Avoid ehash lookup race in inet_ehash_insert()
Since ehash lookups are lockless, if one CPU performs a lookup while
another concurrently deletes and inserts (removing reqsk and inserting sk),
the lookup may fail to find the socket and an RST may be sent.

The call trace map is drawn as follows:
   CPU 0                           CPU 1
   -----                           -----
				inet_ehash_insert()
                                spin_lock()
                                sk_nulls_del_node_init_rcu(osk)
__inet_lookup_established()
	(lookup failed)
                                __sk_nulls_add_node_rcu(sk, list)
                                spin_unlock()

As both deletion and insertion operate on the same ehash chain, this patch
introduces a new sk_nulls_replace_node_init_rcu() helper function to
implement atomic replacement.

Fixes: 5e0724d027 ("tcp/dccp: fix hashdance race for passive sessions")
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Signed-off-by: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251015020236.431822-3-xuanqiang.luo@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 16:08:43 -07:00
Xuanqiang Luo
9c4609225e rculist: Add hlist_nulls_replace_rcu() and hlist_nulls_replace_init_rcu()
Add two functions to atomically replace RCU-protected hlist_nulls entries.

Keep using WRITE_ONCE() to assign values to ->next and ->pprev, as
mentioned in the patch below:
commit efd04f8a8b ("rcu: Use WRITE_ONCE() for assignments to ->next for
rculist_nulls")
commit 860c8802ac ("rcu: Use WRITE_ONCE() for assignments to ->pprev for
hlist_nulls")
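
A hedged sketch of what such a replace operation could look like (modeled on
hlist_replace_rcu(); this is not necessarily the exact code added here), with
the writer holding the bucket lock as in the ehash paths:

  static inline void
  hlist_nulls_replace_init_rcu_sketch(struct hlist_nulls_node *old,
                                      struct hlist_nulls_node *new)
  {
          struct hlist_nulls_node *next = old->next;

          WRITE_ONCE(new->next, next);
          WRITE_ONCE(new->pprev, old->pprev);
          rcu_assign_pointer(*(struct hlist_nulls_node __rcu **)new->pprev, new);
          if (!is_a_nulls(next))
                  WRITE_ONCE(next->pprev, &new->next);
          WRITE_ONCE(old->pprev, NULL);   /* mark the replaced entry as unhashed */
  }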

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Link: https://patch.msgid.link/20251015020236.431822-2-xuanqiang.luo@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 16:08:42 -07:00
Kuniyuki Iwashima
1c17f4373d ipv6: Move ipv6_fl_list from ipv6_pinfo to inet_sock.
In {tcp6,udp6,raw6}_sock, struct ipv6_pinfo is always placed at
the beginning of a new cache line because

  1. __alignof__(struct tcp_sock) is 64 due to ____cacheline_aligned
     of __cacheline_group_begin(tcp_sock_write_tx)

  2. __alignof__(struct udp_sock) is 64 due to ____cacheline_aligned
     of struct numa_drop_counters

  3. in raw6_sock, struct numa_drop_counters is placed before
     struct ipv6_pinfo

struct ipv6_pinfo is 136 bytes, but the last cache line is
only used by ipv6_fl_list:

  $ pahole -C ipv6_pinfo vmlinux
  struct ipv6_pinfo {
  ...
  	/* --- cacheline 2 boundary (128 bytes) --- */
  	struct ipv6_fl_socklist *  ipv6_fl_list;         /*   128     8 */

  	/* size: 136, cachelines: 3, members: 23 */

Let's move ipv6_fl_list from struct ipv6_pinfo to struct inet_sock
to save a full cache line for {tcp6,udp6,raw6}_sock.

Now, struct ipv6_pinfo is 128 bytes, and {tcp6,udp6,raw6}_sock have
64 bytes less, while {tcp,udp,raw}_sock retain the same size.

Before:

  # grep -E "^(RAW|UDP[^L\-]|TCP)" /proc/slabinfo | awk '{print $1, "\t", $4}'
  RAWv6 	 1408
  UDPv6 	 1472
  TCPv6 	 2560
  RAW 		 1152
  UDP	 	 1280
  TCP 		 2368

After:

  # grep -E "^(RAW|UDP[^L\-]|TCP)" /proc/slabinfo | awk '{print $1, "\t", $4}'
  RAWv6 	 1344
  UDPv6 	 1408
  TCPv6 	 2496
  RAW 		 1152
  UDP	 	 1280
  TCP 		 2368

Also, ipv6_fl_list and inet_flags (SNDFLOW bit) are placed in the
same cache line.

  $ pahole -C inet_sock vmlinux
  ...
  	/* --- cacheline 11 boundary (704 bytes) was 56 bytes ago --- */
  	struct ipv6_pinfo *        pinet6;               /*   760     8 */
  	/* --- cacheline 12 boundary (768 bytes) --- */
  	struct ipv6_fl_socklist *  ipv6_fl_list;         /*   768     8 */
  	unsigned long              inet_flags;           /*   776     8 */

Doc churn is due to the insufficient Type column (only 1 space short).

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251014224210.2964778-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 16:06:52 -07:00
Jijie Shao
0746da0176 net: hibmcge: support pci_driver.shutdown()
Support pci_driver.shutdown() for the hibmcge driver.
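
A hypothetical sketch of the usual shape of such a hook (the actual hibmcge
callback name and body are not part of this log):

  static void hbg_pci_shutdown(struct pci_dev *pdev)
  {
          struct net_device *netdev = pci_get_drvdata(pdev);

          rtnl_lock();
          if (netif_running(netdev))
                  dev_close(netdev);
          rtnl_unlock();
  }

  static struct pci_driver hbg_pci_driver = {
          ...
          .shutdown = hbg_pci_shutdown,
  };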

Signed-off-by: Jijie Shao <shaojijie@huawei.com>
Link: https://patch.msgid.link/20251014134018.1178385-1-shaojijie@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-17 15:42:57 -07:00
Sidharth Seela
9948dcb2f7 ntfs3: Fix uninit buffer allocated by __getname()
Fix uninit errors in the buffer allocated for 'de' by initializing the
buffer with zeroes. The issue was found using KMSAN.

Reported-by: syzbot+332bd4e9d148f11a87dc@syzkaller.appspotmail.com
Fixes: 78ab59fee0 ("fs/ntfs3: Rework file operations")
Signed-off-by: Sidharth Seela <sidharthseela@gmail.com>
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-10-17 16:45:38 +02:00
Raphael Pinsonneault-Thibeault
73e6b9dacf ntfs3: fix uninit memory after failed mi_read in mi_format_new
Fix a KMSAN un-init bug found by syzkaller.

ntfs_get_bh() expects a buffer from sb_getblk(), but that buffer may not be
uptodate, and we do not bring the buffer uptodate before marking it as such.
If the buffer is not uptodate, a buffer with uninitialized data may be added
to the mi record, and attempting to load that record will trigger KMSAN.

Avoid this by making the buffer uptodate, if it is not already, by
overwriting it.

Reported-by: syzbot+7a2ba6b7b66340cff225@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=7a2ba6b7b66340cff225
Tested-by: syzbot+7a2ba6b7b66340cff225@syzkaller.appspotmail.com
Fixes: 4342306f0f ("fs/ntfs3: Add file operations and implementation")
Signed-off-by: Raphael Pinsonneault-Thibeault <rpthibeault@gmail.com>
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-10-17 16:45:37 +02:00
YangWen
02f312754c ntfs3: fix use-after-free of sbi->options in cmp_fnames
The root cause is that sbi->options points directly to fc->fs_private.
If fc->fs_private is freed while sbi still exists, sbi->options becomes
a dangling pointer.

This patch ensures that sbi->options is a separate copy of fc->fs_private
and duplicates nls_name if present. On superblock release or error,
sbi->options->nls_name and sbi->options are freed and sbi->options
is set to NULL to avoid any dangling pointer.

Reported-by: syzbot+d77c546c60db651a389c@syzkaller.appspotmail.com
Signed-off-by: YangWen <anmuxixixi@gmail.com>
[almaz.alexandrovich@paragon-software.com: remove syzbot logs from description]
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-10-17 16:45:37 +02:00
Matthew Wilcox (Oracle)
68f6bd128e ntfs: Do not overwrite uptodate pages
When reading a compressed file, we may read several pages in addition to
the one requested.  The current code will overwrite pages in the page
cache with the data from disc which can definitely result in changes
that have been made being lost.

For example, if we have four consecutive pages ABCD in the file compressed
into a single extent, on first access, we'll bring in ABCD.  Then we
write to page B.  Memory pressure results in the eviction of ACD.
When we attempt to write to page C, we will overwrite the data in page
B with the data currently on disk.

I haven't investigated the decompression code to check whether it's
OK to overwrite a clean page or whether it might be possible to see
corrupt data.  Out of an abundance of caution, decline to overwrite
uptodate pages, not just dirty pages.

Fixes: 4342306f0f (fs/ntfs3: Add file operations and implementation)
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-10-17 16:45:36 +02:00
Matthew Wilcox (Oracle)
953b79a7a1 ntfs: Do not kmap page cache pages for compression
These pages are accessed through vmap; they are not accessed
by calling page_address(), so they do not need to be kmapped.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-10-17 16:45:35 +02:00
Matthew Wilcox (Oracle)
14656154d2 ntfs: Do not kmap pages used for reading from disk
These pages are accessed through DMA and vmap; they are not accessed
by calling page_address(), so they do not need to be kmapped.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-10-17 16:45:34 +02:00
Sven Eckelmann
ed5730f3f7 batman-adv: use skb_crc32c() instead of skb_seq_read()
Make batadv_bla_check_duplist() just use the new function skb_crc32c(),
instead of calling skb_seq_read() with crc32c(). This is faster and
simpler.

Suggested-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
2025-10-17 16:30:43 +02:00
Simon Wunderlich
e5ae07b2ef batman-adv: Start new development cycle
This version will contain all the (major or even only minor) changes for
Linux 6.19.

The version number isn't a semantic version number with major and minor
information. It is just encoding the year of the expected publishing as
Linux -rc1 and the number of published versions this year (starting at 0).

Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
2025-10-17 16:30:43 +02:00
Marc Kleine-Budde
e41287a079 Merge patch series "can: m_can: various cleanups"
Marc Kleine-Budde <mkl@pengutronix.de> says:

While working on the m_can driver, I created several cleanup commits: make
m_can_init_ram() static, rename the hrtimer function, convert debugging and
error output to netdev_*(), replace an open-coded register write with
m_can_write(), remove unneeded error messages and sanity checks, and don't
wake up the controller during m_can_get_berr_counter() if the interface is
down.

Link: https://patch.msgid.link/20251008-m_can-cleanups-v1-0-1784a18eaa84@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 15:18:25 +02:00
Marc Kleine-Budde
91a55c72a8 can: m_can: m_can_get_berr_counter(): don't wake up controller if interface is down
If the interface is down, the CAN controller might be powered down, the
clock disabled, and/or its external reset asserted.

Don't wake up the controller to read the CAN bus error counters, if the
interface is down.

Reviewed-by: Markus Schneider-Pargmann <msp@baylibre.com>
Link: https://patch.msgid.link/20251008-m_can-cleanups-v1-7-1784a18eaa84@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 15:17:19 +02:00
Marc Kleine-Budde
b24b43522e can: m_can: m_can_tx_submit(): remove unneeded sanity checks
m_can_tx_submit() is only called for peripheral devices. So remove the
sanity check.

Link: https://patch.msgid.link/20251008-m_can-cleanups-v1-6-1784a18eaa84@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 15:17:13 +02:00
Marc Kleine-Budde
6218391758 can: m_can: m_can_class_register(): remove error message in case devm_kzalloc() fails
If devm_kzalloc() fails, it already outputs an error message. Remove the
error message from m_can_class_register() accordingly.

Link: https://patch.msgid.link/20251008-m_can-cleanups-v1-5-1784a18eaa84@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 15:17:08 +02:00
Marc Kleine-Budde
c6cbd24f65 can: m_can: m_can_interrupt_enable(): use m_can_write() instead of open coding it
As everywhere else in the driver, use m_can_write() instead of open coding
it.

Link: https://patch.msgid.link/20251008-m_can-cleanups-v1-4-1784a18eaa84@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 15:17:03 +02:00
Marc Kleine-Budde
293735053e net: m_can: convert dev_{dbg,info,err} -> netdev_{dbg,info,err}
To ease debugging, use the netdev_{dbg,info,err}() functions instead of
dev_{dbg,info,err}().

Link: https://patch.msgid.link/20251008-m_can-cleanups-v1-3-1784a18eaa84@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 15:16:59 +02:00
Marc Kleine-Budde
60af9dbb63 can: m_can: hrtimer_callback(): rename to m_can_polling_timer()
The original use of struct m_can_classdev::hrtimer was to support polling
for devices without IRQ, with the timer function called hrtimer_callback().

Commit 07f25091ca ("can: m_can: Implement receive coalescing") uses the
hrtimer for software-supported IRQ coalescence, with the timer function
called m_can_coalescing_timer().

To improve the readability of the driver, rename hrtimer_callback() to
m_can_polling_timer(), which better describes the functionality.

Link: https://patch.msgid.link/20251008-m_can-cleanups-v1-2-1784a18eaa84@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 15:16:51 +02:00
Marc Kleine-Budde
c6dcc2b321 can: m_can: m_can_init_ram(): make static
Since commit eaacfeaca7 ("can: m_can: Call the RAM init directly from
m_can_chip_config") m_can_init_ram() is not used outside of m_can.c.

Mark as static and remove the EXPORT_SYMBOL_GPL().

Link: https://patch.msgid.link/20251008-m_can-cleanups-v1-1-1784a18eaa84@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 15:16:50 +02:00
Konstantin Komarov
801f614ba2 fs/ntfs3: fix mount failure for sparse runs in run_unpack()
Some NTFS volumes failed to mount because sparse data runs were not
handled correctly during runlist unpacking. The code performed arithmetic
on the special SPARSE_LCN64 marker, leading to invalid LCN values and
mount errors.

Add an explicit check for the case described above, marking the run as
sparse without applying arithmetic.

Fixes: 736fc7bf5f ("fs: ntfs3: Fix integer overflow in run_unpack()")
Cc: stable@vger.kernel.org
Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-10-17 12:08:43 +02:00
Konstantin Komarov
a846cd0d0a fs/ntfs3: Reformat code and update terminology
Reformatted the driver code according to the current .clang-format rules
and updated description of used terminology. No functional changes
intended.

Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-10-17 12:08:42 +02:00
Konstantin Komarov
5180138604 fs/ntfs3: Support timestamps prior to epoch
Previously an unsigned 64-bit type was used, which prevented proper handling
of timestamps earlier than 1970-01-01. Switch to a signed 64-bit type to
support pre-epoch timestamps. The issue was caught by xfstests.

Signed-off-by: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
2025-10-17 12:08:24 +02:00
Marc Kleine-Budde
578dbbb952 Merge patch series "can: m_can: Add am62 wakeup support"
Markus Schneider-Pargmann (TI.com) <msp@baylibre.com> says:

This series adds support for wakeup capabilities to the m_can driver,
which is necessary for enabling Partial-IO functionality on am62, am62a,
and am62p SoCs. It implements the wake-on-lan interface for m_can
devices and handles the pinctrl states needed for wakeup functionality.

am62, am62a and am62p support Partial-IO, a low power system state in
which nearly everything is turned off except the pins of the CANUART
group. This group contains mcu_mcan0, mcu_mcan1, wkup_uart0 and
mcu_uart0 devices.

To support mcu_mcan0 and mcu_mcan1 wakeup for the mentioned SoCs, the
series introduces a notion of wake-on-lan for m_can. If the user decides
to enable wake-on-lan for a m_can device, the device is set to wakeup
enabled. A 'wakeup' pinctrl state is selected to enable wakeup flags for
the relevant pins. If wake-on-lan is disabled the default pinctrl is
selected.

Partial-IO Overview
------------------
Partial-IO is a low power system state in which nearly everything is
turned off except the pins of the CANUART group (mcu_mcan0, mcu_mcan1,
wkup_uart0 and mcu_uart0). These devices can trigger a wakeup of the
system on pin activity. Note that this does not resume the system as the
DDR is off as well. So this state can be considered a power-off state
with wakeup capabilities.

Documentation can also be found in section 6.2.4 of the TRM:
  https://www.ti.com/lit/pdf/spruiv7

Implementation Details
----------------------
The complete Partial-IO feature requires three coordinated series, each
handling a different aspect of the implementation:

1. This series (m_can driver): Implements device-specific wakeup
   functionality for m_can devices, allowing them to be set as wakeup
   sources.

2. Devicetree series: Defines system states and wakeup sources in the
   devicetree for am62, am62a and am62p.
   https://gitlab.baylibre.com/msp8/linux/-/tree/topic/am62-dt-partialio/v6.17?ref_type=heads

3. TI-SCI firmware series: Implements the firmware interface to enter
   Partial-IO mode when appropriate wakeup sources are enabled.
   https://gitlab.baylibre.com/msp8/linux/-/tree/topic/tisci-partialio/v6.17?ref_type=heads

Devicetree Bindings
-------------------
The wakeup-source property is used with references to
system-idle-states. This depends on the dt-schema pull request that adds
bindings for system-idle-states and updates the binding for
wakeup-source:
  https://github.com/devicetree-org/dt-schema/pull/150

This is merged now and upstream in dt-schema.

Testing
-------
A test branch is available here that includes all patches required to
test Partial-IO:

https://gitlab.baylibre.com/msp8/linux/-/tree/integration/am62-partialio/v6.17?ref_type=heads

After enabling Wake-on-LAN the system can be powered off and will enter
the Partial-IO state in which it can be woken up by activity on the
specific pins:
    ethtool -s can0 wol p
    ethtool -s can1 wol p
    poweroff

I tested these patches on am62-lp-sk.

Previous versions:
 v1: https://lore.kernel.org/lkml/20240523075347.1282395-1-msp@baylibre.com/
 v2: https://lore.kernel.org/lkml/20240729074135.3850634-1-msp@baylibre.com/
 v3: https://lore.kernel.org/lkml/20241011-topic-mcan-wakeup-source-v6-12-v3-0-9752c714ad12@baylibre.com
 v4: https://lore.kernel.org/r/20241015-topic-mcan-wakeup-source-v6-12-v4-0-fdac1d1e7aa6@baylibre.com
 v5: https://lore.kernel.org/r/20241028-topic-mcan-wakeup-source-v6-12-v5-0-33edc0aba629@baylibre.com
 v6: https://lore.kernel.org/r/20241219-topic-mcan-wakeup-source-v6-12-v6-0-1356c7f7cfda@baylibre.com
 v7: https://lore.kernel.org/r/20250421-topic-mcan-wakeup-source-v6-12-v7-0-1b7b916c9832@baylibre.com
 v8: https://lore.kernel.org/r/20250812-topic-mcan-wakeup-source-v6-12-v8-0-6972a810d63b@baylibre.com
 v9: https://lore.kernel.org/r/20250820-topic-mcan-wakeup-source-v6-12-v9-0-0ac13f2ddd67@baylibre.com

Changes in v10:
 - Change dt-binding to be able to set pinctrl-names = "default", "wakeup";
 - Fix wording in the dt-binding
 - Fix mcan commit message to have correct naming of the SoC
 - Change function name from m_can_class_setup_optional_pinctrl() to
   m_can_class_parse_pinctrl()

Changes in v9:
 - Update the binding to accept the sleep pinctrl state which is
   already in use by other devicetrees
 - Modify suspend/resume to not set the sleep state if wakeup is enabled
   and a wakeup pinctrl state is present. If wakeup pinctrl is active
   this should be kept enabled even after suspend
 - Modify m_can_set_wol() to use pinctrl_pm_select_default_state() to
   get rid of the manually managed default pinctrl.

Changes in v8:
 - Rebase to v6.17-rc1

Changes in v7:
 - Separate this series from "firmware: ti_sci: Partial-IO support"
   again as was requested internally
 - All DT changes are now in their own series to avoid conflicts
 - wakeup-source definition in the m_can binding is now only an
   extension to the dt-schema binding and a pull request was created

Changes in v6:
 - Rebased to v6.13-rc1
 - After feedback of the other Partial-IO series, I updated this series
   and removed all use of regulator-related patches.
 - wakeup-source is now not only a boolean property but can also be a
   list of power states in which the device is wakeup capable.

Changes in v5:
 - Make the check of wol options nicer to read

Changes in v4:
 - Remove leftover testing code that always returned -EIO in a specific case
 - Redesign the pin control setup to be easier to understand and less nested
 - Fix missing parentheses around the wol_enable expression
 - Remove | from binding description

Changes in v3:
 - Rebase to v6.12-rc1
 - Change 'wakeup-source' to only 'true'
 - Simplify m_can_set_wol by returning early on error
 - Add vio-supply binding and handling of this optional property.
   vio-supply is used to reflect the SoC architecture and which power
   line powers the m_can unit. This is important as some units are
   powered in special low power modes.

Changes in v2:
 - Rebase to v6.11-rc1
 - Squash these two patches for the binding into one:
   dt-bindings: can: m_can: Add wakeup-source property
   dt-bindings: can: m_can: Add wakeup pinctrl state
 - Add error handling to multiple patches of the m_can driver
 - Add error handling in m_can_class_allocate_dev(). This also required
   adding a new patch to return error pointers from
   m_can_class_allocate_dev().

Link: https://patch.msgid.link/20251001-topic-mcan-wakeup-source-v6-12-v10-0-4ab508ac5d1e@baylibre.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 11:02:57 +02:00
Markus Schneider-Pargmann (TI.com)
a77a297753 can: m_can: Support pinctrl wakeup state
TI AM62x SoC requires a wakeup flag being set in pinctrl when mcan pins
act as a wakeup source. Add support to select the wakeup state if WOL
is enabled.

Signed-off-by: Markus Schneider-Pargmann (TI.com) <msp@baylibre.com>
Link: https://patch.msgid.link/20251001-topic-mcan-wakeup-source-v6-12-v10-4-4ab508ac5d1e@baylibre.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 11:02:28 +02:00
Markus Schneider-Pargmann (TI.com)
148e125d4e can: m_can: Return ERR_PTR on error in allocation
We have more detailed error values available; return them from the core
driver and the calling drivers to report proper errors to callers.

Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Reviewed-by: Dhruva Gole <d-gole@ti.com>
Reviewed-by: Kendall Willis <k-willis@ti.com>
Signed-off-by: Markus Schneider-Pargmann (TI.com) <msp@baylibre.com>
Link: https://patch.msgid.link/20251001-topic-mcan-wakeup-source-v6-12-v10-3-4ab508ac5d1e@baylibre.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 11:02:28 +02:00
Markus Schneider-Pargmann (TI.com)
04d5826b07 can: m_can: Map WoL to device_set_wakeup_enable
In some devices the pins of the m_can module can act as a wakeup source.
This patch helps do that by connecting the PHY_WAKE WoL option to
device_set_wakeup_enable. By marking this device as being wakeup
enabled, this setting can be used by platform code to decide which
sleep or poweroff mode to use.

Also this prepares the driver for the next patch in which the pinctrl
settings are changed depending on the desired wakeup source.

Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Reviewed-by: Kendall Willis <k-willis@ti.com>
Signed-off-by: Markus Schneider-Pargmann (TI.com) <msp@baylibre.com>
Link: https://patch.msgid.link/20251001-topic-mcan-wakeup-source-v6-12-v10-2-4ab508ac5d1e@baylibre.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 11:02:28 +02:00
Markus Schneider-Pargmann (TI.com)
73cc2882b6 dt-bindings: can: m_can: Add wakeup properties
The pins associated with m_can have to have a special configuration to
be able to wake up the SoC from some system states. This configuration is
described in the wakeup pinctrl state, while the default state describes
the default configuration. Also add the sleep state, which is already in
use by some devicetrees.

Also, m_can can be a wakeup-source if it is capable of wakeup.

Signed-off-by: Markus Schneider-Pargmann (TI.com) <msp@baylibre.com>
Reviewed-by: Dhruva Gole <d-gole@ti.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://patch.msgid.link/20251001-topic-mcan-wakeup-source-v6-12-v10-1-4ab508ac5d1e@baylibre.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 11:02:27 +02:00
Vincent Mailhol
f968a24cad can: treewide: remove can_change_mtu()
can_change_mtu() was made obsolete by commit 2304993860 ("can: populate the
minimum and maximum MTU values"). Now that net_device->min_mtu and
net_device->max_mtu are populated, all the checks are already done by
dev_validate_mtu() in net/core/dev.c.
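
For reference, the MTU range for a Classical CAN device is populated roughly
like this (a sketch based on the referenced commit, not the exact code):

  /* in the CAN device allocation path */
  dev->mtu     = CAN_MTU;
  dev->min_mtu = CAN_MTU;
  dev->max_mtu = CAN_MTU;

so dev_validate_mtu() rejects any other value without needing a per-driver
ndo_change_mtu() callback.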

Remove the net_device_ops->ndo_change_mtu() callback of all the physical
interfaces, then remove can_change_mtu(). Only keep the vcan_change_mtu()
and vxcan_change_mtu() because the virtual interfaces use their own
different MTU logic.

The only functional change this patch introduces is that now the user will
be able to change the MTU even if the interface is up. This does not matter
for Classical CAN and CAN FD because their MTU range is composed of only
one value, respectively CAN_MTU and CANFD_MTU. For the upcoming CAN XL, the
MTU will be configurable within the CANXL_MIN_MTU to CANXL_MAX_MTU range at
any time, even if the interface is up. This is consistent with the other
net protocols and does not contradict ISO 11898-1:2024 as having a
modifiable MTU is a kernel extension.

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Link: https://patch.msgid.link/20251003-remove-can_change_mtu-v1-1-337f8bc21181@kernel.org
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 09:57:13 +02:00
Marc Kleine-Budde
9271d0ea07 can: m_can: add support for optional reset
This patch has been split from the original series [1].

In some SoCs (observed on the STM32MP15) the M_CAN IP core keeps the CAN
state and CAN error counters over an internal reset cycle. The STM32MP15
SoC provides an external reset, which is shared between both M_CAN cores.

Add support for an optional external reset. Take care of shared resets,
de-assert reset during the probe phase in m_can_class_register() and while
the interface is up, assert the reset otherwise.
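
A hedged sketch of the typical pattern for an optional, possibly shared reset
(the 'rst' field name is illustrative):

  cdev->rst = devm_reset_control_get_optional_shared(cdev->dev, NULL);
  if (IS_ERR(cdev->rst))
          return dev_err_probe(cdev->dev, PTR_ERR(cdev->rst), "failed to get reset\n");

  ret = reset_control_deassert(cdev->rst);  /* during probe / while the interface is up */
  ...
  reset_control_assert(cdev->rst);          /* when the interface goes down */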

[1] https://lore.kernel.org/all/20250923-m_can-fix-state-handling-v3-0-06d8baccadbf@pengutronix.de

Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Reviewed-by: Markus Schneider-Pargmann <msp@baylibre.com>
Link: https://patch.msgid.link/20251008-m_can-add-reset-v1-1-49f0bbf820c4@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
2025-10-17 09:57:13 +02:00
Jakub Kicinski
7e0d4c1113 Merge branch 'net-macb-various-cleanups'
Théo Lebrun says:

====================
net: macb: various cleanups

Fix many oddities inside the MACB driver. They accumulated in my
work-in-progress branch while working on MACB/GEM EyeQ5 support.

Part of this series has been seen on the lkml in March then June.
See below for a semblance of a changelog.

The initial goal was to post them alongside EyeQ5 support, but that
makes for too big of a series. It'll come afterwards, with new
features (interrupt coalescing, ethtool .set_channels() and XDP mostly).

[0]: https://lore.kernel.org/lkml/20250627-macb-v2-0-ff8207d0bb77@bootlin.com/
====================

Link: https://patch.msgid.link/20251014-macb-cleanup-v1-0-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:32 -07:00
Théo Lebrun
8ebeef3d01 net: macb: sort #includes
Sort #include preprocessor directives.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Sean Anderson <sean.anderson@linux.dev>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-15-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:28 -07:00
Théo Lebrun
1ce9662e31 net: macb: apply reverse christmas tree in macb_tx_map()
The arguments grew over time; follow conventions and apply reverse
christmas tree (RCT).

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-14-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:28 -07:00
Théo Lebrun
b5fe4f3e59 net: macb: drop count local variable in macb_tx_map()
Local variable `count` is useless: it counts the number of DMA descriptors
used and returns it. But the return value is only checked for error.
Drop counting the number of DMA descriptors and return a usual
negative-if-error integer.

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-13-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:28 -07:00
Théo Lebrun
027202adf0 net: macb: drop entry local variable in macb_tx_map()
The pattern:
   entry = macb_tx_ring_wrap(bp, i);
   tx_skb = &queue->tx_skb[entry];
is the exact definition of:
   macb_tx_skb(queue, i);

The pattern:
   entry = macb_tx_ring_wrap(bp, i);
   desc = macb_tx_desc(queue, entry);
is redundant because macb_tx_desc() calls macb_tx_ring_wrap().

One explicit call to macb_tx_ring_wrap() is still required for checking
if it is the last buffer (TX_WRAP case).

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-12-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:27 -07:00
Théo Lebrun
f26c6438a2 net: macb: replace min() with umin() calls
Whenever min(a, b) is used with a and b unsigned variables or literals,
`make W=2` complains. Change four min() calls into umin().

stderr extract (GCC 11.2.0, MIPS Codescape):

./include/linux/minmax.h:68:57: warning: comparison is always true due
                          to limited range of data type [-Wtype-limits]
   68 | #define __is_nonneg(ux) statically_true((long long)(ux) >= 0)
      |                                                         ^~
drivers/net/ethernet/cadence/macb_main.c:2299:26: note: in expansion of
                                                            macro ‘min’
 2299 |              hdrlen = min(skb_headlen(skb), bp->max_tx_length);
      |                       ^~~
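
A minimal sketch of the change at the call site quoted in the warning above
(the surrounding code is assumed; only the min() -> umin() substitution is
the point here):

	/* both operands are unsigned, so umin() avoids the -Wtype-limits warning */
	hdrlen = umin(skb_headlen(skb), bp->max_tx_length);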

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-11-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:27 -07:00
Théo Lebrun
39a913db6a net: macb: remove bp->queue_mask
The low 16 bits of GEM_DCFG6 tell us which queues are enabled in HW. In
theory, there could be holes in the bitfield. In practice, the macb
driver would fail if there were holes as most loops iterate upon
bp->num_queues. Only macb_init() iterated correctly.

 - Drop bp->queue_mask field.
 - Error out at probe if a hole is in the queue mask.
 - Rely upon bp->num_queues for iteration.
 - As we drop the queue_mask probe local variable, fix RCT.
 - Compute queue_mask on the fly for TAPRIO using bp->num_queues.
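
A sketch of the last point, computing the mask on the fly from the queue
count (illustrative only, not the exact driver code):

	/* queues 0..num_queues-1 are guaranteed hole-free after probe */
	u32 queue_mask = GENMASK(bp->num_queues - 1, 0);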

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-10-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:27 -07:00
Théo Lebrun
02d11c6105 net: macb: introduce DMA descriptor helpers (is 64bit? is PTP?)
Introduce macb_dma64() and macb_dma_ptp() helper functions.
Many codepaths are made simpler by dropping conditional compilation.

This implies two additional changes:
 - Always compile related structure definitions inside <macb.h>.
 - MACB_EXT_DESC can be dropped as it is useless now.

The common case:

	#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
		struct macb_dma_desc_64 *desc_64;
		if (bp->hw_dma_cap & HW_DMA_CAP_64B) {
			desc_64 = macb_64b_desc(bp, desc);
			// ...
		}
	#endif

Is replaced by:

	if (macb_dma64(bp)) {
		struct macb_dma_desc_64 *desc_64 = macb_64b_desc(bp, desc);
		// ...
	}

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-9-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:26 -07:00
Théo Lebrun
731e991afb net: macb: move bp->hw_dma_cap flags to bp->caps
Drop bp->hw_dma_cap field and put its two flags into bp->caps.

On my specific config (eyeq5_defconfig), bloat-o-meter indicates:
 - macb_main.o: Before=56251, After=56359, chg +0.19%
 - macb_ptp.o:  Before= 3976, After= 3952, chg -0.60%

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-8-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:26 -07:00
Théo Lebrun
62e6c17463 net: macb: simplify macb_adj_dma_desc_idx()
The function body uses a switch statement on bp->hw_dma_cap and handles
its four possible values: 0, is_64b, is_ptp, is_64b && is_ptp.

Instead, refactor by noticing that the return value is:
   desc_size * MULT
with MULT = 3 if is_64b && is_ptp,
            2 if is_64b || is_ptp,
            1 otherwise.

MULT can be expressed as:
   1 + is_64b + is_ptp
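
Expressed as code, a sketch of the arithmetic above (the helper name here is
illustrative; the driver's actual function differs):

	static unsigned int macb_desc_mult(struct macb *bp)
	{
		/* MULT = 1 + is_64b + is_ptp */
		return 1 + macb_dma64(bp) + macb_dma_ptp(bp);
	}

so the return value is simply desc_size * macb_desc_mult(bp).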

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-7-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:26 -07:00
Théo Lebrun
94a164598d net: macb: simplify macb_dma_desc_get_size()
macb_dma_desc_get_size() does a switch on bp->hw_dma_cap and covers all
four cases: 0, 64B, PTP, 64B+PTP. It also covers the #ifndef
MACB_EXT_DESC separately, making it four codepaths.

Instead, notice the descriptor size grows with enabled features and use
plain if-statements on 64B and PTP flags.
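
A rough sketch of the if-statement form described above (struct and helper
names follow the rest of this series; the real function may differ in
detail):

	static unsigned int macb_dma_desc_get_size(struct macb *bp)
	{
		unsigned int size = sizeof(struct macb_dma_desc);

		if (macb_dma64(bp))
			size += sizeof(struct macb_dma_desc_64);
		if (macb_dma_ptp(bp))
			size += sizeof(struct macb_dma_desc_ptp);

		return size;
	}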

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-6-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:25 -07:00
Théo Lebrun
d7a4a20abe net: macb: drop macb_config NULL checking
Remove NULL checks on macb_config as it is always valid:
 - either it is its default value &default_gem_config,
 - or it got overridden using match data.

Reviewed-by: Sean Anderson <sean.anderson@linux.dev>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-5-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:25 -07:00
Théo Lebrun
80cf78c59a net: macb: Remove local variables clk_init and init in macb_probe()
Remove local variables clk_init and init. Those function pointers are
always equivalent to macb_config->clk_init and macb_config->init.

Reviewed-by: Sean Anderson <sean.anderson@linux.dev>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-4-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:25 -07:00
Théo Lebrun
bd0b35ec83 net: macb: remove gap in MACB_CAPS_* flags
MACB_CAPS_* are bit constants that get used in bp->caps. They occupy
bits 0..12 + 24..31. Remove 11..23 gap by moving bits 24..31 to 13..20.

Occupation bitfields:

   31  29  27  25  23  21  19  17  15  13  11  09  07  05  03  01
     30  28  26  24  22  20  18  16  14  12  10  08  06  04  02  00
   -- Before ------------------------------------------------------
    1 1 1 1 1 1 1 1                       1 1 1 1 1 1 1 1 1 1 1 1 1
                    0 0 0 0 0 0 0 0 0 0 0
   -- After -------------------------------------------------------
                          1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    0 0 0 0 0 0 0 0 0 0 0

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-3-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:24 -07:00
Théo Lebrun
a23b0b79e9 net: macb: use BIT() macro for capability definitions
Replace all capability values with calls to the BIT() macro.
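
For illustration, a couple of capability flags written this way (the names
and bit positions here are examples only, not the full list from macb.h):

	#define MACB_CAPS_ISR_CLEAR_ON_WRITE	BIT(0)
	#define MACB_CAPS_USRIO_HAS_CLKEN	BIT(1)
	#define MACB_CAPS_FIFO_MODE		BIT(4)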

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Sean Anderson <sean.anderson@linux.dev>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-2-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:24 -07:00
Théo Lebrun
f1150b7795 dt-bindings: net: cdns,macb: sort compatibles
Compatibles inside this enum are sorted-ish. Make it sorted.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
Link: https://patch.msgid.link/20251014-macb-cleanup-v1-1-31cd266e22cd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:59:24 -07:00
Jakub Kicinski
2df75cc5bd Merge branch 'net-optimize-tx-throughput-and-efficiency'
Eric Dumazet says:

====================
net: optimize TX throughput and efficiency

In this series, I replace the busylock spinlock we have in
__dev_queue_xmit() and use lockless list (llist) to reduce
spinlock contention to the minimum.

Idea is that only one cpu might spin on the qdisc spinlock,
while others simply add their skb in the llist.

After this series, we get a 300 % (4x) improvement on heavy TX workloads,
sending twice the number of packets per second, for half the cpu cycles.
====================

Link: https://patch.msgid.link/20251014171907.3554413-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:25:16 -07:00
Eric Dumazet
100dfa74ca net: dev_queue_xmit() llist adoption
Remove busylock spinlock and use a lockless list (llist)
to reduce spinlock contention to the minimum.

Idea is that only one cpu might spin on the qdisc spinlock,
while others simply add their skb in the llist.

After this patch, we get a 300 % improvement on heavy TX workloads.
- Sending twice the number of packets per second.
- While consuming 50 % fewer cycles.

Note that this also allows in the future to submit batches
to various qdisc->enqueue() methods.
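
A hypothetical sketch of the pattern (field and helper names here are
illustrative, not the actual net/core/dev.c code): every cpu pushes its skb
onto a lockless list, and only the cpu that added to an empty list takes the
qdisc spinlock and drains the list on behalf of everyone else.

	if (llist_add(&skb->ll_node, &q->defer_list)) {
		/* list was empty: this cpu is the one that spins on the lock */
		struct llist_node *node;
		struct sk_buff *pos, *next;

		spin_lock(&q->lock);
		node = llist_del_all(&q->defer_list);
		llist_for_each_entry_safe(pos, next, node, ll_node)
			__qdisc_enqueue(q, pos);	/* illustrative helper */
		spin_unlock(&q->lock);
	}
	/* otherwise the skb sits on the llist and the draining cpu handles it */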

Tested:

- Dual Intel(R) Xeon(R) 6985P-C  (480 hyper threads).
- 100Gbit NIC, 30 TX queues with FQ packet scheduler.
- echo 64 >/sys/kernel/slab/skbuff_small_head/cpu_partial (avoid contention in mm)
- 240 concurrent "netperf -t UDP_STREAM -- -m 120 -n"

Before:

16 Mpps (41 Mpps if each thread is pinned to a different cpu)

vmstat 2 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
243  0      0 2368988672  51036 1100852    0    0   146     1  242   60  0  9 91  0  0
244  0      0 2368988672  51036 1100852    0    0   536    10 487745 14718  0 52 48  0  0
244  0      0 2368988672  51036 1100852    0    0   512     0 503067 46033  0 52 48  0  0
244  0      0 2368988672  51036 1100852    0    0   512     0 494807 12107  0 52 48  0  0
244  0      0 2368988672  51036 1100852    0    0   702    26 492845 10110  0 52 48  0  0

Lock contention (1 second sample taken on 8 cores)
perf lock record -C0-7 sleep 1; perf lock contention
 contended   total wait     max wait     avg wait         type   caller

    442111      6.79 s     162.47 ms     15.35 us     spinlock   dev_hard_start_xmit+0xcd
      5961      9.57 ms      8.12 us      1.60 us     spinlock   __dev_queue_xmit+0x3a0
       244    560.63 us      7.63 us      2.30 us     spinlock   do_softirq+0x5b
        13     25.09 us      3.21 us      1.93 us     spinlock   net_tx_action+0xf8

If netperf threads are pinned, spinlock stress is very high.
perf lock record -C0-7 sleep 1; perf lock contention
 contended   total wait     max wait     avg wait         type   caller

    964508      7.10 s     147.25 ms      7.36 us     spinlock   dev_hard_start_xmit+0xcd
       201    268.05 us      4.65 us      1.33 us     spinlock   __dev_queue_xmit+0x3a0
        12     26.05 us      3.84 us      2.17 us     spinlock   do_softirq+0x5b

@__dev_queue_xmit_ns:
[256, 512)            21 |                                                    |
[512, 1K)            631 |                                                    |
[1K, 2K)           27328 |@                                                   |
[2K, 4K)          265392 |@@@@@@@@@@@@@@@@                                    |
[4K, 8K)          417543 |@@@@@@@@@@@@@@@@@@@@@@@@@@                          |
[8K, 16K)         826292 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[16K, 32K)        733822 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
[32K, 64K)         19055 |@                                                   |
[64K, 128K)        17240 |@                                                   |
[128K, 256K)       25633 |@                                                   |
[256K, 512K)           4 |                                                    |

After:

29 Mpps (57 Mpps if each thread is pinned to a different cpu)

vmstat 2 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
78  0      0 2369573632  32896 1350988    0    0    22     0  331  254  0  8 92  0  0
75  0      0 2369573632  32896 1350988    0    0    22    50 425713 280199  0 23 76  0  0
104  0      0 2369573632  32896 1350988    0    0   290     0 430238 298247  0 23 76  0  0
86  0      0 2369573632  32896 1350988    0    0   132     0 428019 291865  0 24 76  0  0
90  0      0 2369573632  32896 1350988    0    0   502     0 422498 278672  0 23 76  0  0

perf lock record -C0-7 sleep 1; perf lock contention
 contended   total wait     max wait     avg wait         type   caller

      2524    116.15 ms    486.61 us     46.02 us     spinlock   __dev_queue_xmit+0x55b
      5821    107.18 ms    371.67 us     18.41 us     spinlock   dev_hard_start_xmit+0xcd
      2377      9.73 ms     35.86 us      4.09 us     spinlock   ___slab_alloc+0x4e0
       923      5.74 ms     20.91 us      6.22 us     spinlock   ___slab_alloc+0x5c9
       121      3.42 ms    193.05 us     28.24 us     spinlock   net_tx_action+0xf8
         6    564.33 us    167.60 us     94.05 us     spinlock   do_softirq+0x5b

If netperf threads are pinned (~54 Mpps)
perf lock record -C0-7 sleep 1; perf lock contention
     32907    316.98 ms    195.98 us      9.63 us     spinlock   dev_hard_start_xmit+0xcd
      4507     61.83 ms    212.73 us     13.72 us     spinlock   __dev_queue_xmit+0x554
      2781     23.53 ms     40.03 us      8.46 us     spinlock   ___slab_alloc+0x5c9
      3554     18.94 ms     34.69 us      5.33 us     spinlock   ___slab_alloc+0x4e0
       233      9.09 ms    215.70 us     38.99 us     spinlock   do_softirq+0x5b
       153    930.66 us     48.67 us      6.08 us     spinlock   net_tx_action+0xfd
        84    331.10 us     14.22 us      3.94 us     spinlock   ___slab_alloc+0x5c9
       140    323.71 us      9.94 us      2.31 us     spinlock   ___slab_alloc+0x4e0

@__dev_queue_xmit_ns:
[128, 256)       1539830 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                  |
[256, 512)       2299558 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[512, 1K)         483936 |@@@@@@@@@@                                          |
[1K, 2K)          265345 |@@@@@@                                              |
[2K, 4K)          145463 |@@@                                                 |
[4K, 8K)           54571 |@                                                   |
[8K, 16K)          10270 |                                                    |
[16K, 32K)          9385 |                                                    |
[32K, 64K)          7749 |                                                    |
[64K, 128K)        26799 |                                                    |
[128K, 256K)        2665 |                                                    |
[256K, 512K)         665 |                                                    |

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-7-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:25:10 -07:00
Eric Dumazet
526f5fb112 net: sched: claim one cache line in Qdisc
Replace state2 field with a boolean.

Move it to a hole between qstats and state so that
we shrink Qdisc by a full cache line.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-6-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:25:10 -07:00
Eric Dumazet
178ca30889 Revert "net/sched: Fix mirred deadlock on device recursion"
This reverts commits 0f022d32c3
and 44180feacc.

A prior patch in this series implemented loop detection
in act_mirred, so we can remove q->owner to save some cycles
in the fast path.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Victor Nogueira <victor@mojatatu.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:25:10 -07:00
Eric Dumazet
fe946a751d net/sched: act_mirred: add loop detection
Commit 0f022d32c3 ("net/sched: Fix mirred deadlock on device recursion")
added code in the fast path, even when act_mirred is not used.

Prepare its revert by implementing loop detection in act_mirred.

Add an array of device pointers to struct netdev_xmit.

tcf_mirred_is_act_redirect() can detect if the array
already contains the target device.
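
Illustrative sketch only, not the actual act_mirred code (the helper name,
array, and its size are assumptions): redirected-to devices are remembered
in a small per-cpu array, and seeing the same device again is treated as a
loop.

	static bool mirred_dev_seen(struct net_device **devs, int cnt,
				    const struct net_device *dev)
	{
		int i;

		for (i = 0; i < cnt; i++)
			if (devs[i] == dev)
				return true;	/* loop detected: drop instead of redirecting */
		return false;
	}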

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:25:10 -07:00
Eric Dumazet
5b2b7dec05 net: add indirect call wrapper in skb_release_head_state()
While stress testing UDP senders on a host with expensive indirect
calls, I found that cpus processing TX completions were showing
a very high cost (20%) in sock_wfree() due to
CONFIG_MITIGATION_RETPOLINE=y.

Take care of TCP and UDP TX destructors and use INDIRECT_CALL_3() macro.
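
A sketch of what the call site in skb_release_head_state() looks like with
the wrapper (the exact list of candidate destructors is an assumption based
on the commit text):

	if (skb->destructor)
		INDIRECT_CALL_3(skb->destructor, tcp_wfree, __sock_wfree,
				sock_wfree, skb);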

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:25:09 -07:00
Eric Dumazet
56cef47c28 selftests/net: packetdrill: unflake tcp_user_timeout_user-timeout-probe.pkt
This test fails the first time I run it after a fresh virtme-ng boot.

tcp_user_timeout_user-timeout-probe.pkt:33: runtime error in write call: Expected result -1 but got 24 with errno 2 (No such file or directory)

Tweak the timings a bit to reduce flakiness.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Soham Chakradeo <sohamch@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://patch.msgid.link/20251014171907.3554413-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:25:09 -07:00
Sagar Cheluvegowda
01b6aca22b dt-bindings: net: qcom: ethernet: Add interconnect properties
Add documentation for the interconnect and interconnect-names
properties required when voting for AHB and AXI buses.

Suggested-by: Andrew Halaney <ahalaney@redhat.com>
Signed-off-by: Sagar Cheluvegowda <quic_scheluve@quicinc.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Link: https://patch.msgid.link/20251015-topic-qc_stmmac_icc_bindings-v5-1-da39126cff28@oss.qualcomm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:11:54 -07:00
Jakub Kicinski
0c3cd7f044 Merge branch 'add-driver-support-for-eswin-eic7700-soc-ethernet-controller'
Shangjuan Wei says:

====================
Add driver support for Eswin eic7700 SoC ethernet controller
====================

Link: https://patch.msgid.link/20251015113751.1114-1-weishangjuan@eswincomputing.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:07:25 -07:00
Shangjuan Wei
ea77dbbdbc net: stmmac: add Eswin EIC7700 glue driver
Add Ethernet controller support for Eswin's eic7700 SoC. The driver
implements hardware initialization, clock configuration, delay
adjustment functions based on DWC Ethernet controller, and supports
device tree configuration and platform driver integration.

Signed-off-by: Zhi Li <lizhi2@eswincomputing.com>
Signed-off-by: Shangjuan Wei <weishangjuan@eswincomputing.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251015114101.1218-1-weishangjuan@eswincomputing.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:07:23 -07:00
Shangjuan Wei
888bd0eca9 dt-bindings: ethernet: eswin: Document for EIC7700 SoC
Add ESWIN EIC7700 Ethernet controller, supporting clock
configuration, delay adjustment and speed adaptive functions.

Signed-off-by: Zhi Li <lizhi2@eswincomputing.com>
Signed-off-by: Shangjuan Wei <weishangjuan@eswincomputing.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://patch.msgid.link/20251015114041.1166-1-weishangjuan@eswincomputing.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 16:07:23 -07:00
Jakub Kicinski
7dbef65e54 Merge branch 'net-stmmac-more-cleanups'
Russell King says:

====================
net: stmmac: more cleanups

The subject for the cover message is wearing thin as I've used it a
number of times, but the scope for cleaning up the driver continues,
and continue it will do, because this is just a small fraction of the
queue.

1. make a better job of one of my previous commits, moving the holding
   of the lock into stmmac_mdio.c

2. move the mac_finish() method to be in-order with the layout of
   struct phylink_mac_ops - this order was chosen because it reflects
   the order that the methods are called, thus making the flow more
   obvious when reading code.

3. continuing on the "removal of stuff that doesn't need to happen",
   patch 3 removes the phylink_speed_(up|down) calls from the path that
   is used for MTU changes - we really don't need to fiddle with the
   PHY advertisement when changing the MTU!

4. clean up tc_init()'s initialisation of flow_entries_max - this is
   the sole place that this is written, and we might as well make the
   code more easy to follow.

5. stmmac_phy_setup() really confuses me when I read the code; it's
   not really about PHY setup, but about phylink setup. So, rename it
   so its name reflects its functionality.
====================

Link: https://patch.msgid.link/aO_HIwT_YvxkDS8D@shell.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 15:58:25 -07:00
Russell King (Oracle)
4a4094ba7a net: stmmac: rename stmmac_phy_setup() to include phylink
stmmac_phy_setup() does not set up any PHY, but it does set up phylink.
Rename this function to stmmac_phylink_setup() to better reflect what
it is doing.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Gatien Chevallier <gatien.chevallier@foss.st.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/E1v945d-0000000Ameh-3Bs7@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 15:58:22 -07:00
Russell King (Oracle)
07d91ec99a net: stmmac: rearrange tc_init()
To make future changes easier, rearrange the use of dma_cap->l3l4fnum
vs priv->flow_entries_max.

Always initialise priv->flow_entries_max from dma_cap->l3l4fnum, then
use priv->flow_entries_max to determine whether we allocate
priv->flow_entries and set it up.

This change is safe because tc_init() is only called once from
stmmac_dvr_probe().

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/E1v945Y-0000000Ameb-2gDI@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 15:58:22 -07:00
Russell King (Oracle)
e82c64be9b net: stmmac: avoid PHY speed change when configuring MTU
There is no need to do the speed-down, speed-up dance when changing
the MTU as there is little power saving that can be gained from such
a brief interval between these, and the autonegotiation they cause
takes much longer.

Move the calls to phylink_speed_up() and phylink_speed_down() into
stmmac_open() and stmmac_release() respectively, reducing the work
done in the __-variants of these functions.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Gatien Chevallier <gatien.chevallier@foss.st.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/E1v945T-0000000AmeV-2BvU@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 15:58:22 -07:00
Russell King (Oracle)
0bc832a54d net: stmmac: place .mac_finish() method more appropriately
Place the .mac_finish() initialiser and implementation after the
.mac_config() initialiser and method which reflects the order that
they appear in struct phylink_mac_ops, and the order in which they
are called. This keeps logically similar code together.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/E1v945O-0000000AmeP-1k0t@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 15:58:22 -07:00
Russell King (Oracle)
12a7b7bc14 net: stmmac: dwc-qos-eth: move MDIO bus locking into stmmac_mdio
Rather than dwc-qos-eth manipulating the MDIO bus lock directly, add
helpers to the stmmac MDIO layer and use them in dwc-qos-eth. This
improves my commit 87f43e6f06 ("net: stmmac: dwc-qos: calibrate tegra
with mdio bus idle").

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/E1v945J-0000000AmeJ-1GOb@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 15:58:21 -07:00
Jan Vaclav
f18c231fb1 net/hsr: add interlink to fill_info output
Currently, it is possible to configure the interlink
port, but there is no way to read it back from userspace.

Add it to the output of hsr_fill_info(), so it can be
read from userspace, for example:

$ ip -d link show hsr0
12: hsr0: <BROADCAST,MULTICAST> mtu ...
...
hsr slave1 veth0 slave2 veth1 interlink veth2 ...

Signed-off-by: Jan Vaclav <jvaclav@redhat.com>
Reviewed-by: Fernando Fernandez Mancera <fmancera@suse.de>
Link: https://patch.msgid.link/20251015101001.25670-2-jvaclav@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 15:52:10 -07:00
Harshit Mogalapalli
e104852075 Octeontx2-af: Fix pci_alloc_irq_vectors() return value check
In cgx_probe(), when pci_alloc_irq_vectors() fails, the error value will
be negative, and that check alone is sufficient.

	err = pci_alloc_irq_vectors(pdev, nvec, nvec, PCI_IRQ_MSIX);
        if (err < 0 || err != nvec) {
        	...
	}

When pci_alloc_irq_vectors() fails to allocate nvec vectors, it returns
-ENOSPC, so it is safe to remove the check that compares err with nvec.

Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251015090117.1557870-1-harshit.m.mogalapalli@oracle.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 15:47:28 -07:00
Hangbin Liu
38c31c2620 netdevsim: add ipsec hw_features
Currently, netdevsim only sets dev->features, which makes the ESP features
fixed. For example:

  # ethtool -k eni0np1 | grep esp
  tx-esp-segmentation: on [fixed]
  esp-hw-offload: on [fixed]
  esp-tx-csum-hw-offload: on [fixed]

This patch adds the ESP features to hw_features, allowing them to be
changed manually. For example:

  # ethtool -k eni0np1 | grep esp
  tx-esp-segmentation: on
  esp-hw-offload: on
  esp-tx-csum-hw-offload: on

Suggested-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Link: https://patch.msgid.link/20251015083649.54744-1-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 15:44:32 -07:00
Alok Tiwari
bd853a59a8 net: amd-xgbe: use EOPNOTSUPP instead of ENOTSUPP in xgbe_phy_mii_read_c45
The MDIO read callback xgbe_phy_mii_read_c45() can propagate its return
value up through phylink_mii_ioctl() to user space via netdev ioctls such
as SIOCGMIIREG. Returning ENOTSUPP results in user space seeing
"Unknown error", since ENOTSUPP is not a standard errno value.

Replace ENOTSUPP with EOPNOTSUPP to align with the MDIO core’s
usage and ensure user space receives a proper "Operation not supported"
error instead of an unknown code.

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Acked-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Link: https://patch.msgid.link/20251015025751.1532149-1-alok.a.tiwari@oracle.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 15:42:10 -07:00
Martin KaFai Lau
03de843bd0 Merge branch 'bpf-allow-opt-out-from-sk-sk_prot-memory_allocated'
Kuniyuki Iwashima says:

====================
bpf: Allow opt-out from sk->sk_prot->memory_allocated.

This series allows opting out of the global per-protocol memory
accounting if socket is configured as such by sysctl or BPF prog.

This series is the successor of the series below [0], but the changes
now fall in net and bpf subsystems only.

I discussed with Roman Gushchin offlist, and he suggested not mixing
two independent subsystems and it would be cleaner not to depend on
memcg.

So, sk->sk_memcg and memcg code are no longer touched, and instead we
use another hole near sk->sk_prot to store a flag for the pure net
opt-out feature.

Overview of the series:

  patch 1 is misc cleanup
  patch 2 allows opt-out from sk->sk_prot->memory_allocated
  patch 3 introduces net.core.bypass_prot_mem
  patch 4 & 5 supports flagging sk->sk_bypass_prot_mem via bpf_setsockopt()
  patch 6 is selftest

Thank you very much for all your help, Shakeel, Roman, Martin, and Eric!

[0]: https://lore.kernel.org/bpf/20250920000751.2091731-1-kuniyu@google.com/

Changes:
  v2:
    * Patch 2:
      * Fill kdoc for skc_bypass_prot_mem
    * Patch 6
      * Fix server fd leak in tcp_create_sockets()
      * Avoid close(0) in check_bypass()

  v1: https://lore.kernel.org/bpf/20251007001120.2661442-1-kuniyu@google.com/
====================

Link: https://patch.msgid.link/20251014235604.3057003-1-kuniyu@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2025-10-16 12:15:10 -07:00
Kuniyuki Iwashima
5f941dd87b selftests/bpf: Add test for sk->sk_bypass_prot_mem.
The test does the following for IPv4/IPv6 x TCP/UDP sockets
with/without sk->sk_bypass_prot_mem, which can be turned on by
net.core.bypass_prot_mem or bpf_setsockopt(SK_BPF_BYPASS_PROT_MEM).

  1. Create socket pairs
  2. Send NR_PAGES (32) of data (TCP consumes around 35 pages,
     and UDP consumes 66 pages due to skb overhead)
  3. Read memory_allocated from sk->sk_prot->memory_allocated and
     sk->sk_prot->memory_per_cpu_fw_alloc
  4. Check if unread data is charged to memory_allocated

If sk->sk_bypass_prot_mem is set, memory_allocated should not be
changed, but we allow a small error (up to 10 pages) in case
other processes on the host use some amounts of TCP/UDP memory.

The number of allocated pages is buffered in the per-cpu variable
{tcp,udp}_memory_per_cpu_fw_alloc, up to +/- net.core.mem_pcpu_rsv,
before being reported to {tcp,udp}_memory_allocated.

At step 3, memory_allocated is calculated from the two variables at
the fentry of the socket create function.

We drain the receive queue only for UDP before close() because the UDP
recv queue is destroyed after an RCU grace period.  When I printed
memory_allocated, the UDP bypass cases sometimes saw the no-bypass
case's leftover, but it is still within the small error range (<10 pages).

  bpf_trace_printk: memory_allocated: 0   <-- TCP no-bypass
  bpf_trace_printk: memory_allocated: 35
  bpf_trace_printk: memory_allocated: 0   <-- TCP w/ sysctl
  bpf_trace_printk: memory_allocated: 0
  bpf_trace_printk: memory_allocated: 0   <-- TCP w/ bpf
  bpf_trace_printk: memory_allocated: 0
  bpf_trace_printk: memory_allocated: 0   <-- UDP no-bypass
  bpf_trace_printk: memory_allocated: 66
  bpf_trace_printk: memory_allocated: 2   <-- UDP w/ sysctl (2 pages leftover)
  bpf_trace_printk: memory_allocated: 2
  bpf_trace_printk: memory_allocated: 2   <-- UDP w/ bpf (2 pages leftover)
  bpf_trace_printk: memory_allocated: 2

We prefer finishing the tests faster rather than oversleeping to wait
for call_rcu() + sk_destruct().

The test completes within 2s on QEMU (64 CPUs) w/ KVM.

  # time ./test_progs -t sk_bypass
  #371/1   sk_bypass_prot_mem/TCP  :OK
  #371/2   sk_bypass_prot_mem/UDP  :OK
  #371/3   sk_bypass_prot_mem/TCPv6:OK
  #371/4   sk_bypass_prot_mem/UDPv6:OK
  #371     sk_bypass_prot_mem:OK
  Summary: 1/4 PASSED, 0 SKIPPED, 0 FAILED

  real	0m1.481s
  user	0m0.181s
  sys	0m0.441s

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-7-kuniyu@google.com
2025-10-16 12:04:47 -07:00
Kuniyuki Iwashima
38163af068 bpf: Introduce SK_BPF_BYPASS_PROT_MEM.
If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out
of the global protocol memory accounting.

This is easily controlled by net.core.bypass_prot_mem sysctl, but it
lacks flexibility.

Let's support flagging (and clearing) sk->sk_bypass_prot_mem via
bpf_setsockopt() at the BPF_CGROUP_INET_SOCK_CREATE hook.

  int val = 1;

  bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM,
                 &val, sizeof(val));

As with net.core.bypass_prot_mem, this is inherited by child sockets,
and BPF always takes precedence over sysctl at socket(2) and accept(2).

SK_BPF_BYPASS_PROT_MEM is only supported at BPF_CGROUP_INET_SOCK_CREATE
and not on other hooks, for the following reasons:

  1. UDP charges memory under sk->sk_receive_queue.lock instead
     of lock_sock()

  2. Modifying the flag after skb is charged to sk requires such
     adjustment during bpf_setsockopt() and complicates the logic
     unnecessarily

We can support other hooks later if a real use case justifies that.

Most changes are inline and hard to trace, but a microbenchmark on
__sk_mem_raise_allocated() during neper/tcp_stream showed that more
samples completed faster with sk->sk_bypass_prot_mem == 1.  This will
be more visible under tcp_mem pressure (but it's not a fair comparison).

  # bpftrace -e 'kprobe:__sk_mem_raise_allocated { @start[tid] = nsecs; }
    kretprobe:__sk_mem_raise_allocated /@start[tid]/
    { @end[tid] = nsecs - @start[tid]; @times = hist(@end[tid]); delete(@start[tid]); }'
  # tcp_stream -6 -F 1000 -N -T 256

Without bpf prog:

  [128, 256)          3846 |                                                    |
  [256, 512)       1505326 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
  [512, 1K)        1371006 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@     |
  [1K, 2K)          198207 |@@@@@@                                              |
  [2K, 4K)           31199 |@                                                   |

With bpf prog in the next patch:
  (must be attached before tcp_stream)
  # bpftool prog load sk_bypass_prot_mem.bpf.o /sys/fs/bpf/test type cgroup/sock_create
  # bpftool cgroup attach /sys/fs/cgroup/test cgroup_inet_sock_create pinned /sys/fs/bpf/test

  [128, 256)          6413 |                                                    |
  [256, 512)       1868425 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
  [512, 1K)        1101697 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                      |
  [1K, 2K)          117031 |@@@@                                                |
  [2K, 4K)           11773 |                                                    |

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-6-kuniyu@google.com
2025-10-16 12:04:47 -07:00
Kuniyuki Iwashima
5ed17896a0 bpf: Support bpf_setsockopt() for BPF_CGROUP_INET_SOCK_CREATE.
We will support flagging sk->sk_bypass_prot_mem via bpf_setsockopt()
at the BPF_CGROUP_INET_SOCK_CREATE hook.

BPF_CGROUP_INET_SOCK_CREATE is invoked by __cgroup_bpf_run_filter_sk()
that passes a pointer to struct sock to the bpf prog as void *ctx.

But there is no bpf_func_proto for bpf_setsockopt() that receives
the ctx as a pointer to struct sock.

Also, bpf_getsockopt() will be necessary for a cgroup with multiple
bpf progs running.

Let's add new bpf_setsockopt() and bpf_getsockopt() variants for
BPF_CGROUP_INET_SOCK_CREATE.

Note that inet_create() is not under lock_sock() and has the same
semantics as bpf_lsm_unlocked_sockopt_hooks.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-5-kuniyu@google.com
2025-10-16 12:04:47 -07:00
Kuniyuki Iwashima
b46ab63181 net: Introduce net.core.bypass_prot_mem sysctl.
If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out
of the global protocol memory accounting.

Let's control the flag by a new sysctl knob.

The flag is written once during socket(2) and is inherited by child
sockets.

Tested with a script that creates local socket pairs and send()s a
bunch of data without recv()ing.

Setup:

  # mkdir /sys/fs/cgroup/test
  # echo $$ >> /sys/fs/cgroup/test/cgroup.procs
  # sysctl -q net.ipv4.tcp_mem="1000 1000 1000"
  # ulimit -n 524288

Without net.core.bypass_prot_mem, charged to tcp_mem & memcg

  # python3 pressure.py &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 22642688 <-------------------------------------- charged to memcg
  # cat /proc/net/sockstat| grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 5376 <-- charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q Local Address:Port  Peer Address:Port
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53188
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:49972
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53868
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53554
  # nstat | grep Pressure || echo no pressure
  TcpExtTCPMemoryPressures        1                  0.0

With net.core.bypass_prot_mem=1, charged to memcg only:

  # sysctl -q net.core.bypass_prot_mem=1
  # python3 pressure.py &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 2757468160 <------------------------------------ charged to memcg
  # cat /proc/net/sockstat | grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 0 <- NOT charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q  Local Address:Port  Peer Address:Port
  ESTAB 111000 0           127.0.0.1:36019    127.0.0.1:49026
  ESTAB 110000 0           127.0.0.1:36019    127.0.0.1:45630
  ESTAB 110000 0           127.0.0.1:36019    127.0.0.1:44870
  ESTAB 111000 0           127.0.0.1:36019    127.0.0.1:45274
  # nstat | grep Pressure || echo no pressure
  no pressure

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-4-kuniyu@google.com
2025-10-16 12:04:47 -07:00
Kuniyuki Iwashima
7c268eaeec net: Allow opt-out from global protocol memory accounting.
Some protocols (e.g., TCP, UDP) implement memory accounting for socket
buffers and charge memory to per-protocol global counters pointed to by
sk->sk_proto->memory_allocated.

Sometimes, system processes do not want that limitation.  For a similar
purpose, there is SO_RESERVE_MEM for sockets under memcg.

Also, by opting out of the per-protocol accounting, sockets under memcg
can avoid paying costs for two orthogonal memory accounting mechanisms.
A microbenchmark result is in the subsequent bpf patch.

Let's allow opt-out from the per-protocol memory accounting if
sk->sk_bypass_prot_mem is true.

sk->sk_bypass_prot_mem and sk->sk_prot are placed in the same cache
line, and sk_has_account() always fetches sk->sk_prot before accessing
sk->sk_bypass_prot_mem, so there is no extra cache miss for this patch.

The following patches will set sk->sk_bypass_prot_mem to true, and
then, the per-protocol memory accounting will be skipped.

Note that this does NOT disable memcg, but rather the per-protocol one.

Another option that avoids using the hole in struct sock_common is to
create sk_prot variants like tcp_prot_bypass, but this would complicate
the SOCKMAP logic, tcp_bpf_prots, etc.
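
A minimal sketch of the resulting check in the charging path, assuming a
helper along these lines (the helper name is hypothetical and the real code
may structure this differently):

	static inline bool sk_charge_prot_mem(const struct sock *sk)
	{
		/* skip the global per-protocol counters when the flag is set */
		return sk_has_account(sk) && !sk->sk_bypass_prot_mem;
	}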

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-3-kuniyu@google.com
2025-10-16 12:04:47 -07:00
Kuniyuki Iwashima
4a997d49d9 tcp: Save lock_sock() for memcg in inet_csk_accept().
If memcg is enabled, accept() acquires lock_sock() twice for each new
TCP/MPTCP socket in inet_csk_accept() and __inet_accept().

Let's move memcg operations from inet_csk_accept() to __inet_accept().

Note that SCTP somehow allocates a new socket by sk_alloc() in
sk->sk_prot->accept() and clones fields manually, instead of using
sk_clone_lock().

mem_cgroup_sk_alloc() is called for SCTP before __inet_accept(),
so I added the protocol check in __inet_accept(), but this can be
removed once SCTP uses sk_clone_lock().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-2-kuniyu@google.com
2025-10-16 12:04:47 -07:00
Jakub Kicinski
55db64ddd6 Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.18-rc2).

No conflicts or adjacent changes.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-16 11:06:28 -07:00
Paolo Abeni
ef672e4665 Merge branch 'txgbe-feat-new-aml-firmware'
Jiawen Wu says:

====================
TXGBE feat new AML firmware

The firmware of AML devices are redesigned to adapt to more PHY
interfaces. Optimize the driver to be compatible with the new firmware.

v1: https://lore.kernel.org/all/20250928093923.30456-1-jiawenwu@trustnetic.com/
====================

Link: https://patch.msgid.link/20251014061726.36660-1-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-16 15:41:38 +02:00
Jiawen Wu
a058de9262 net: txgbe: rename txgbe_get_phy_link()
The function txgbe_get_phy_link() is more appropriately named
txgbe_get_mac_link(), since it reads the link status from the MAC
register.

Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
Link: https://patch.msgid.link/20251014061726.36660-4-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-16 15:41:34 +02:00
Jiawen Wu
1f863ce5c7 net: txgbe: optimize the flow to setup PHY for AML devices
To adapt to new firmware for AML devices, the driver should send the
"SET_LINK_CMD" to the firmware only once when switching PHY interface
mode, and no longer needs to re-trigger PHY configuration based on the
RX signal interrupt (TXGBE_GPIOBIT_3).

In previous firmware versions, the PHY was configured only after receiving
"SET_LINK_CMD", and might remain incomplete if the RX signal was lost.
To handle this case, the driver used TXGBE_GPIOBIT_3 interrupt to resend
the command. This workaround is no longer necessary with the new firmware.

And the unknown link speed is permitted in the mailbox buffer.

Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
Link: https://patch.msgid.link/20251014061726.36660-3-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-16 15:41:34 +02:00
Jiawen Wu
af3fce9f1b net: txgbe: expand SW-FW mailbox buffer size to identify QSFP module
Recent firmware updates introduce additional fields in the mailbox message
to provide more information for identifying 40G and 100G QSFP modules.
To accommodate these new fields, expand the mailbox buffer size by 4 bytes.

Without this change, drivers built against the updated firmware cannot
properly identify modules due to mismatched mailbox message lengths.

The old firmware version that used the smaller mailbox buffer has never
been publicly released, so there are no backward-compatibility concerns.

Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251014061726.36660-2-jiawenwu@trustnetic.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-16 15:41:34 +02:00
Rob Herring (Arm)
00922eeaca dt-bindings: net: Convert amd,xgbe-seattle-v1a to DT schema
Convert amd,xgbe-seattle-v1a binding to DT schema format. It's a
straight-forward conversion.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://patch.msgid.link/20251013213049.686797-2-robh@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-16 11:55:28 +02:00
Paolo Abeni
6608b952ae Merge branch 'add-aarch64-support-for-fbnic'
Dimitri Daskalakis says:

====================
Add aarch64 support for FBNIC

We need to support aarch64 with 64K PAGE_SIZE, and I uncovered an issue
during testing.
====================

Link: https://patch.msgid.link/20251013211449.1377054-1-dimitri.daskalakis1@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-16 11:36:32 +02:00
Dimitri Daskalakis
75b350839b net: fbnic: Allow builds for all 64 bit architectures
This enables aarch64 testing, but there's no reason we cannot support other
architectures.

Signed-off-by: Dimitri Daskalakis <dimitri.daskalakis1@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251013211449.1377054-3-dimitri.daskalakis1@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-16 11:36:29 +02:00
Dimitri Daskalakis
4bd451f4c2 net: fbnic: Fix page chunking logic when PAGE_SIZE > 4K
The HW always works on a 4K page size. When the OS supports larger
pages, we fragment them across multiple BDQ descriptors.
We were not properly incrementing the descriptor, which resulted in us
specifying the last chunk's id/addr and then 15 zero descriptors. This
would cause packet loss and driver crashes. This is not tagged as a fix
since the Kconfig currently prevents use outside of x86.
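
Illustrative sketch of the intended chunking (names and descriptor layout
are assumptions, not the actual fbnic code): each 4K hardware chunk of a
larger OS page must land in its own BDQ descriptor, so the descriptor index
has to advance on every iteration.

	for (offset = 0; offset < PAGE_SIZE; offset += 4096) {
		desc = &bdq->desc[idx];
		desc->addr = dma_addr + offset;
		desc->id = id;
		idx = (idx + 1) & bdq->mask;	/* the previously missing increment */
	}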

Signed-off-by: Dimitri Daskalakis <dimitri.daskalakis1@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251013211449.1377054-2-dimitri.daskalakis1@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-16 11:36:29 +02:00
Jakub Kicinski
cb85ca4c0a Merge branch 'net-airoha-npu-introduce-support-for-airoha-7583-npu'
Lorenzo Bianconi says:

====================
net: airoha: npu: Introduce support for Airoha 7583 NPU

Introduce support for Airoha 7583 SoC NPU.
====================

Link: https://patch.msgid.link/20251013-airoha-npu-7583-v3-0-00f748b5a0c7@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 17:52:59 -07:00
Lorenzo Bianconi
4478596f71 net: airoha: npu: Add 7583 SoC support
Introduce support for the Airoha 7583 SoC NPU by selecting the proper firmware images.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251013-airoha-npu-7583-v3-3-00f748b5a0c7@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 17:52:55 -07:00
Lorenzo Bianconi
0850ae496d net: airoha: npu: Add airoha_npu_soc_data struct
Introduce airoha_npu_soc_data structure in order to generalize per-SoC
NPU firmware info. Introduce airoha_npu_load_firmware utility routine.
This is a preliminary patch in order to introduce AN7583 NPU support.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251013-airoha-npu-7583-v3-2-00f748b5a0c7@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 17:52:55 -07:00
Lorenzo Bianconi
9fbafbfa5b dt-bindings: net: airoha: npu: Add AN7583 support
Introduce AN7583 NPU support to Airoha EN7581 NPU device-tree bindings.

Acked-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://patch.msgid.link/20251013-airoha-npu-7583-v3-1-00f748b5a0c7@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 17:52:54 -07:00
Jakub Kicinski
1ecd749556 Merge branch 'preserve-pse-pd692x0-configuration-across-reboots'
Kory Maincent says:

====================
Preserve PSE PD692x0 configuration across reboots

Previously, the driver would always reconfigure the PSE hardware on
probe, causing a port matrix reflash that resulted in temporary power
loss to all connected devices. This change maintains power continuity
by preserving existing configuration when the PSE has been previously
initialized.
====================

Link: https://patch.msgid.link/20251013-feature_pd692x0_reboot_keep_conf-v2-0-68ab082a93dd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 17:50:31 -07:00
Kory Maincent (Dent Project)
8f3d044b34 net: pse-pd: pd692x0: Preserve PSE configuration across reboots
Detect when PSE hardware is already configured (user byte == 42) and
skip hardware initialization to prevent power interruption to connected
devices during system reboots.

Previously, the driver would always reconfigure the PSE hardware on
probe, causing a port matrix reflash that resulted in temporary power
loss to all connected devices. This change maintains power continuity
by preserving existing configuration when the PSE has been previously
initialized.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251013-feature_pd692x0_reboot_keep_conf-v2-3-68ab082a93dd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 17:50:29 -07:00
Kory Maincent (Dent Project)
6fa1f8b64a net: pse-pd: pd692x0: Separate configuration parsing from hardware setup
Cache the port matrix configuration in driver private data to enable
PSE controller reconfiguration. This refactoring separates device tree
parsing from hardware configuration application, allowing settings to be
reapplied without reparsing the device tree.

This refactoring is a prerequisite for preserving PSE configuration
across reboots to prevent power disruption to connected devices.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251013-feature_pd692x0_reboot_keep_conf-v2-2-68ab082a93dd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 17:50:28 -07:00
Kory Maincent (Dent Project)
f197902cd2 net: pse-pd: pd692x0: Replace __free macro with explicit kfree calls
Replace __free(kfree) with explicit kfree() calls to follow the net
subsystem policy of avoiding automatic cleanup macros as described in
the documentation.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251013-feature_pd692x0_reboot_keep_conf-v2-1-68ab082a93dd@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 17:50:28 -07:00
Florian Fainelli
e1f5bb196f net: bcmasp: Add support for PHY-based Wake-on-LAN
If available, interrogate the PHY to find out whether we can use it for
Wake-on-LAN. This can be a more power efficient way of implementing that
feature, especially when the MAC is powered off in low power states.

Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251013172306.2250223-1-florian.fainelli@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 17:45:52 -07:00
Eric Dumazet
e5b670e543 net: remove obsolete WARN_ON(refcount_read(&sk->sk_refcnt) == 1)
sk->sk_refcnt was converted to refcount_t in 2017.

Since __sock_put(sk) is refcount_dec(&sk->sk_refcnt), it will complain
loudly, in a non-racy way, if the current refcount is 1 (or less).

We can remove four WARN_ON() calls in favor of the generic refcount_dec()
check.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Xuanqiang Luo <luoxuanqiang@kylinos.cn>
Link: https://patch.msgid.link/20251014140605.2982703-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 17:18:38 -07:00
Pradeep Kumar Chitrapu
9c5f229b13 wifi: ath12k: fix TX and RX MCS rate configurations in HE mode
Currently, the TX and RX MCS rate configurations per peer are
reversed when sent to the firmware. As a result, RX MCS rates
are configured for TX, and vice versa. This commit rectifies
the configuration to match what the firmware expects.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1

Fixes: d889913205 ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices")
Signed-off-by: Pradeep Kumar Chitrapu <quic_pradeepc@quicinc.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251009211656.2386085-3-quic_pradeepc@quicinc.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-15 16:37:54 -07:00
Baochen Qiang
8c21b32c2c wifi: ath12k: fix VHT MCS assignment
While associating, firmware needs the peer's receive capability
to calculate its own VHT transmit MCS. Currently, the host
sends this information via mcs->rx_mcs_set field, but firmware
actually reads it from mcs->tx_mcs_set field. This mismatch is
incorrect.

This issue has not caused failures so far because most peers
advertise identical TX and RX capabilities. Fix this by
assigning the value to tx_mcs_set as expected.

Additionally, the rate control mask is intended to limit our
transmit MCS, so it should also apply to the peer's receive
capability. Update the logic accordingly.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1

Fixes: d889913205 ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices")
Signed-off-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Signed-off-by: Pradeep Kumar Chitrapu <quic_pradeepc@quicinc.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251009211656.2386085-2-quic_pradeepc@quicinc.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-15 16:37:54 -07:00
Thiraviyam Mariyappan
b94f523cc5 wifi: ath12k: Fix NSS value update in ext_rx_stats
Currently, in ext_rx_stats, the NSS value is taken directly from
the firmware, which results in incorrect mapping: 4x4, 3x3, 2x2 and
1x1 SS are incorrectly updated as 3x3, 2x2, 1x1 and 0x0 SS respectively.

Fix the issue by incrementing the NSS value by 1 while updating
the PPDU info to ensure accurate spatial stream statistics.

Remove the redundant +1 increment in the radiotap header when monitor
mode is enabled to prevent double counting.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.5-01651-QCAHKSWPL_SILICONZ-1

Signed-off-by: Thiraviyam Mariyappan <thiraviyam.mariyappan@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251007133332.1092178-1-thiraviyam.mariyappan@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-15 16:37:54 -07:00
Aditya Kumar Singh
6917e268c4 wifi: ath12k: Defer vdev bring-up until CSA finalize to avoid stale beacon
Mac80211 schedules CSA finalize work twice during a channel switch: first
during the reserved switch phase and again during the finalize phase.
The beacon content is updated only during the second schedule, which occurs
after the reserved switch completes. However, the ath12k driver attempts to
bring up the VDEV during the channel switch callback
(ath12k_mac_update_vif_chan()), which leads to premature installation of
stale beacon templates before the updated content is available.

This premature VDEV bring-up causes outdated beacon information to be
broadcast, which can result in stale channel parameters being advertised
during the transition. In MBSSID scenarios, this behavior is particularly
problematic
because the transmitting interface's beacon must be updated before
non-transmitting interfaces are brought up. Failing to do so can lead to
beacon mismatches across interfaces.

Introduce an is_csa_in_progress flag to defer VDEV_UP until CSA finalize is
complete. Set this flag during the channel switch callback when CSA is
active. In bss_info_changed(), check this flag and issue VDEV_UP only
after the beacon template has been updated.

Ensure that in MBSSID cases, the transmitting interface is brought up
first, followed by all non-transmitting interfaces. This ordering ensures
correct beacon propagation and avoids stale beacon installation
during CSA transitions.

Additionally, move the call to ath12k_mac_update_peer_puncturing_width()
before VDEV bring-up during CSA handling. This ensures that the puncturing
bitmap and bandwidth settings are applied before the VDEV is brought up.
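
A simplified sketch of the deferral flow described above (everything except
the is_csa_in_progress flag name is illustrative, not the actual ath12k code):

/* channel switch callback, e.g. ath12k_mac_update_vif_chan(), sketch only */
if (csa_active)
        arvif->is_csa_in_progress = true;   /* defer VDEV_UP */
else
        vdev_up(arvif);

/* bss_info_changed(), once the beacon template has been updated */
if (arvif->is_csa_in_progress) {
        vdev_up(arvif);                     /* tx interface first for MBSSID */
        arvif->is_csa_in_progress = false;
}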

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1

Fixes: 8c6faa56bf ("wifi: ath12k: add MBSSID beacon support")
Signed-off-by: Aditya Kumar Singh <aditya.kumar.singh@oss.qualcomm.com>
Signed-off-by: Maharaja Kennadyrajan <maharaja.kennadyrajan@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20250924134336.888-1-maharaja.kennadyrajan@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-15 16:37:54 -07:00
Sarika Sharma
43ba986e7a wifi: ath12k: track dropped MSDU buffer type packets in REO exception ring
Add a counter "reo_excep_msdu_buf_type" in
ath12k_debugfs_dump_device_dp_stats() to account for packets dropped
due to unexpected MSDU buffer types in the RX error path. These
packets are discarded to prevent incorrect parsing and potential
kernel crashes. This helps in debugging and monitoring RX error
handling behavior.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1

Signed-off-by: Sarika Sharma <sarika.sharma@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20250930091551.3305312-3-sarika.sharma@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-15 16:37:53 -07:00
Sarika Sharma
36f9edbb9d wifi: ath12k: Fix MSDU buffer types handling in RX error path
Currently, packets received on the REO exception ring from
unassociated peers are of MSDU buffer type, while the driver expects
link descriptor type packets. These packets are not parsed further due
to a return check on packet type in ath12k_hal_desc_reo_parse_err(),
but the associated skb is not freed. This may lead to kernel
crashes and buffer leaks.

Hence, to fix this, update the RX error handler to explicitly drop
MSDU buffer type packets received on the REO exception ring.
This prevents further processing of invalid packets and ensures
stability in the RX error handling path.

Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1

Fixes: d889913205 ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices")
Signed-off-by: Sarika Sharma <sarika.sharma@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20250930091551.3305312-2-sarika.sharma@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-15 16:37:53 -07:00
Baochen Qiang
596b911644 wifi: ath11k: restore register window after global reset
The hardware target implements an address space larger than the PCI BAR can
map. In order to be able to access the whole target address space, the BAR
space is split into 4 segments, of which the last 3, called windows, can
be dynamically mapped to the desired area. This is achieved by updating the
window register with the appropriate window value. Currently, each time a
register beyond ATH11K_PCI_WINDOW_START is accessed, the host calculates
the window value and caches it after updating the window. That way, the
next time a register falling in the same window is accessed, the host knows
that the window is already correct and no additional update is needed.

However, this mechanism breaks after a global reset is triggered in
ath11k_pci_soc_global_reset(), because the global reset also resets the
window register, so the window is no longer mapped correctly. The host
currently does nothing about this; as a result, a subsequent register
access may not work as expected if it falls in the same window as before.

Although no obvious issue has been seen so far, it is better to fix this
to avoid future problems. Do so by restoring the window register after the
global reset.
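
A rough sketch of the fix, with illustrative names (select_window() and the
cached register_window field stand in for the actual ath11k helpers):

/* at the end of ath11k_pci_soc_global_reset(), sketch only */
/* the global reset cleared the hardware window register, so re-program it
 * from the cached value to keep the cache and the hardware in sync */
select_window(ab_pci, ab_pci->register_window);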

Tested-on: WCN6855 hw2.0 PCI WLAN.HSP.1.1-03125-QCAHSPSWPL_V1_V2_SILICONZ_LITE-3.6510.30

Fixes: d5c65159f2 ("ath11k: driver for Qualcomm IEEE 802.11ax devices")
Signed-off-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Reviewed-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Link: https://patch.msgid.link/20251014-ath11k-reset-window-cache-v1-1-b85271b111dd@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-15 16:37:53 -07:00
Jeff Johnson
d34a368be2 wifi: ath12k: Remove struct wmi_bcn_send_from_host_cmd
struct wmi_bcn_send_from_host_cmd is unused, so remove it.

Compile tested only.

Link: https://patch.msgid.link/20251010-ath12k-nuke-wmi_bcn_send_from_host_cmd-v1-1-6f1172b77848@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-15 16:37:53 -07:00
Jeff Johnson
960fc268a9 wifi: ath11k: Remove struct wmi_bcn_send_from_host_cmd
struct wmi_bcn_send_from_host_cmd is unused, so remove it.

Compile tested only.

Link: https://patch.msgid.link/20251010-ath11k-nuke-wmi_bcn_send_from_host_cmd-v1-1-bfb5118d9018@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-15 16:37:52 -07:00
Kang Yang
f35a07a484 wifi: ath10k: move recovery check logic into a new work
Currently, ath10k has recovery check logic that waits for the
last recovery to finish via wait_for_completion_timeout().

But in SDIO scenarios, the recovery function may be invoked from
interrupt context, where long blocking waits are undesirable and can
lead to system instability.

Additionally, Linux’s ordered workqueue processes one task at a time.
If a previous recovery is still queued or executing, new triggers are
ignored. This prevents accurate tracking of consecutive failures and
delays transition to the WEDGED state.

To address this, move the recovery check logic into a different
workqueue.

Tested-on: QCA6174 hw3.2 PCI WLAN.RM.4.4.1-00288-QCARMSWPZ-1
Tested-on: QCA6174 hw3.2 SDIO WLAN.RMH.4.4.1-00189

Fixes: c256a94d1b ("wifi: ath10k: shutdown driver when hardware is unreliable")
Signed-off-by: Kang Yang <kang.yang@oss.qualcomm.com>
Reviewed-by: Baochen Qiang <baochen.qiang@oss.qualcomm.com>
Link: https://patch.msgid.link/20251014110757.155-1-kang.yang@oss.qualcomm.com
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-15 16:37:52 -07:00
Dr. David Alan Gilbert
4077d7fb27 wifi: wcn36xx: Remove unused wcn36xx_smd_update_scan_params
wcn36xx_smd_update_scan_params() last use was removed in 2020 by
commit 5973a29474 ("wcn36xx: Fix software-driven scan")

Remove it.

This leaves the wcn36xx_hal_update_scan_params_req_ex and
wcn36xx_hal_update_scan_params_resp structs unused.

Remove them, together with the unused
wcn36xx_hal_update_scan_params_req.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Acked-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
Link: https://patch.msgid.link/20251011001038.352393-1-linux@treblig.org
Signed-off-by: Jeff Johnson <jeff.johnson@oss.qualcomm.com>
2025-10-15 16:37:52 -07:00
Heiner Kallweit
378e6523eb net: bcmgenet: remove unused platform code
This effectively reverts b0ba512e25 ("net: bcmgenet: enable driver to
work without a device tree"). There has never been an in-tree user of
struct bcmgenet_platform_data; all devices use OF or ACPI.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/108b4e64-55d4-4b4e-9a11-3c810c319d66@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 09:09:41 -07:00
Abhishek Rawal
a8e846b8d9 r8152: Advertise software timestamp information.
The driver calls skb_tx_timestamp(skb) in rtl8152_start_xmit(), but does not
advertise the capability in ethtool. Advertise software timestamping
capabilities via struct ethtool_ops.
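
One common way to do this, shown as a hedged sketch (whether the patch uses
the generic helper or an open-coded get_ts_info callback is an assumption):

static const struct ethtool_ops rtl8152_ethtool_ops_sketch = {
        /* ... existing callbacks ... */
        /* reports SOF_TIMESTAMPING_TX_SOFTWARE/RX_SOFTWARE/SOFTWARE */
        .get_ts_info = ethtool_op_get_ts_info,
};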

Signed-off-by: Abhishek Rawal <rawal.abhishek92@gmail.com>
Reviewed-by: Jamie Bainbridge <jamie.bainbridge@gmail.com>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20251014055234.46527-1-rawal.abhishek92@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 09:08:56 -07:00
Jakub Kicinski
4ca05145d4 Merge branch 'net-deal-with-sticky-tx-queues'
Eric Dumazet says:

====================
net: deal with sticky tx queues

Back in 2010, Tom Herbert added skb->ooo_okay to TCP flows.

Extend the feature to connected flows for other protocols like UDP.

skb->ooo_okay might never be set for bulk flows that always
have at least one skb in a qdisc queue or NIC queue,
especially if TX completion is delayed because of a stressed cpu
or aggressive interrupt mitigation.

The so-called "strange attractors" has caused many performance
issues, we need to do better now that TCP reacts better to
potential reorders.

Add new net.core.txq_reselection_ms sysctl to let
flows follow XPS and select a more efficient queue.

After this series, we no longer have to make sure threads
are pinned to cpus; they can migrate without adding
too much [spinlock, qdisc, TX completion] pressure.
====================

Link: https://patch.msgid.link/20251013152234.842065-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 09:04:24 -07:00
Eric Dumazet
4a7708443d net: allow busy connected flows to switch tx queues
This is a followup of commit 726e9e8b94 ("tcp: refine
skb->ooo_okay setting") and of prior commit in this series
("net: control skb->ooo_okay from skb_set_owner_w()")

skb->ooo_okay might never be set for bulk flows that always
have at least one skb in a qdisc queue or NIC queue,
especially if TX completion is delayed because of a stressed cpu.

The so-called "strange attractors" has caused many performance
issues (see for instance 9b462d02d6 ("tcp: TCP Small Queues
and strange attractors")), we need to do better.

We have tried very hard to avoid reorders because TCP was
not dealing with them nicely a decade ago.

Use the new net.core.txq_reselection_ms sysctl to let
flows follow XPS and select a more efficient queue.

After this patch, we no longer have to make sure threads
are pinned to cpus; they can now be migrated without
adding too much spinlock/qdisc/TX completion pressure.

TX completion part was problematic, because it added false sharing
on various socket fields, but also added false sharing and spinlock
contention in mm layers. Calling skb_orphan() from ndo_start_xmit()
is not an option unfortunately.

Note for later:

1) move sk->sk_tx_queue_mapping closer
to sk_tx_queue_mapping_jiffies for better cache locality.

2) Study if 9b462d02d6 ("tcp: TCP Small Queues
and strange attractors") could be revised.

Tested:

Used a host with 32 TX queues, shared by groups of 8 cores.
XPS setup :

echo ff >/sys/class/net/eth1/queue/tx-0/xps_cpus
echo ff00 >/sys/class/net/eth1/queue/tx-1/xps_cpus
echo ff0000 >/sys/class/net/eth1/queue/tx-2/xps_cpus
echo ff000000 >/sys/class/net/eth1/queue/tx-3/xps_cpus
echo ff,00000000 >/sys/class/net/eth1/queue/tx-4/xps_cpus
echo ff00,00000000 >/sys/class/net/eth1/queue/tx-5/xps_cpus
echo ff0000,00000000 >/sys/class/net/eth1/queue/tx-6/xps_cpus
echo ff000000,00000000 >/sys/class/net/eth1/queue/tx-7/xps_cpus
...

Launched a tcp_stream with 15 threads and 1000 flows, initially affined to core 0-15

taskset -c 0-15 tcp_stream -T15 -F1000 -l1000 -c -H target_host

Checked that only queues 0 and 1 are used as instructed by XPS :
tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
 backlog 123489410b 1890p
 backlog 69809026b 1064p
 backlog 52401054b 805p

Then force each thread to run on cpu 1,9,17,25,33,41,49,57,65,73,81,89,97,105,113,121

C=1;PID=`pidof tcp_stream`;for P in `ls /proc/$PID/task`; do taskset -pc $C $P; C=$(($C + 8));done

Set txq_reselection_ms to 1000
echo 1000 > /proc/sys/net/core/txq_reselection_ms

Check that the flows have migrated nicely:

tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
 backlog 130508314b 1916p
 backlog 8584380b 126p
 backlog 8584380b 126p
 backlog 8379990b 123p
 backlog 8584380b 126p
 backlog 8487484b 125p
 backlog 8584380b 126p
 backlog 8448120b 124p
 backlog 8584380b 126p
 backlog 8720640b 128p
 backlog 8856900b 130p
 backlog 8584380b 126p
 backlog 8652510b 127p
 backlog 8448120b 124p
 backlog 8516250b 125p
 backlog 7834950b 115p

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251013152234.842065-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 09:04:22 -07:00
Eric Dumazet
2ddef3462b net: add /proc/sys/net/core/txq_reselection_ms control
Add a new sysctl to control how often a queue reselection
can happen even if a flow has a persistent queue of skbs
in a Qdisc or NIC queue.

A value of zero means the feature is disabled.

Default is 1000 (1 second).

This sysctl is used in the following patch.
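
For reference, a small self-contained userspace sketch that reads and updates
the new sysctl (needs root; only the sysctl path comes from this patch, the
rest is illustrative):

#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/net/core/txq_reselection_ms";
        FILE *f = fopen(path, "r+");
        unsigned int ms;

        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%u", &ms) == 1)
                printf("txq_reselection_ms is currently %u\n", ms);
        rewind(f);
        fprintf(f, "0\n");      /* example: 0 disables queue reselection */
        fclose(f);
        return 0;
}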

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251013152234.842065-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 09:04:21 -07:00
Eric Dumazet
d365c9bca3 net: control skb->ooo_okay from skb_set_owner_w()
15 years after Tom Herbert added skb->ooo_okay, only TCP transport
benefits from it.

We can support other transports directly from skb_set_owner_w().

If no other TX packet for this socket is in a host queue (qdisc, NIC queue),
there is no risk of self-inflicted reordering, so we can set skb->ooo_okay.

This allows netdev_pick_tx() to choose a TX queue based on XPS settings,
instead of reusing the queue chosen at the time the first packet was sent
for connected sockets.
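
A hedged sketch of that condition (the exact placement and accounting inside
skb_set_owner_w() are simplified; SK_WMEM_ALLOC_BIAS is the initial value of
sk_wmem_alloc introduced earlier in this series):

/* before charging this skb to the socket: ooo_okay can be set when no
 * other TX packet owned by the socket is still in a qdisc or NIC queue */
skb->ooo_okay = refcount_read(&sk->sk_wmem_alloc) == SK_WMEM_ALLOC_BIAS;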

Tested:
  500 concurrent UDP_RR connected UDP flows, host with 32 TX queues,
  512 cpus, XPS setup.

  super_netperf 500 -t UDP_RR -H <host> -l 1000 -- -r 100,100 -Nn &

This patch saves between 10% and 20% of cycles, depending on how
process scheduler migrates threads among cpus.

Using the following bpftrace script, we can see the effect of Qdisc/NIC tx
queues being better used (fewer cache line misses).

bpftrace -e '
k:__dev_queue_xmit { @start[cpu] = nsecs; }
kr:__dev_queue_xmit {
 if (@start[cpu]) {
    $delay = nsecs - @start[cpu];
    delete(@start[cpu]);
    @__dev_queue_xmit_ns = hist($delay);
 }
}
END { clear(@start); }'

Before:
@__dev_queue_xmit_ns:
[128, 256)             6 |                                                    |
[256, 512)        116283 |                                                    |
[512, 1K)        1888205 |@@@@@@@@@@@                                         |
[1K, 2K)         8106167 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    |
[2K, 4K)         8699293 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4K, 8K)         2600676 |@@@@@@@@@@@@@@@                                     |
[8K, 16K)         721688 |@@@@                                                |
[16K, 32K)        122995 |                                                    |
[32K, 64K)         10639 |                                                    |
[64K, 128K)          119 |                                                    |
[128K, 256K)           1 |                                                    |

After:
@__dev_queue_xmit_ns:
[128, 256)             3 |                                                    |
[256, 512)        651112 |@@                                                  |
[512, 1K)        8109938 |@@@@@@@@@@@@@@@@@@@@@@@@@@                          |
[1K, 2K)        16081031 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[2K, 4K)         2411692 |@@@@@@@                                             |
[4K, 8K)           98994 |                                                    |
[8K, 16K)           1536 |                                                    |
[16K, 32K)           587 |                                                    |
[32K, 64K)             2 |                                                    |

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251013152234.842065-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 09:04:21 -07:00
Eric Dumazet
6ddb811a57 net: add SK_WMEM_ALLOC_BIAS constant
sk->sk_wmem_alloc is initialized to 1, and sk_wmem_alloc_get()
takes care of this initial value.

Add a SK_WMEM_ALLOC_BIAS define to avoid spreading this magic value.
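
A minimal sketch of what the define and its use in sk_wmem_alloc_get() could
look like (illustrative; the actual header and surrounding code may differ):

#define SK_WMEM_ALLOC_BIAS      1

/* sk_wmem_alloc starts at SK_WMEM_ALLOC_BIAS, so the amount of queued
 * TX memory is the refcount minus that bias. */
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
        return refcount_read(&sk->sk_wmem_alloc) - SK_WMEM_ALLOC_BIAS;
}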

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251013152234.842065-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 09:04:21 -07:00
Eric Dumazet
1c51450f1a tcp: better handle TCP_TX_DELAY on established flows
Some applications use the TCP_TX_DELAY socket option after the TCP flow
is established.

Some metrics need to be updated; otherwise, TCP might take time to
adapt to the new (emulated) RTT.

This patch adjusts tp->srtt_us, tp->rtt_min, icsk_rto
and sk->sk_pacing_rate.

This is best effort, and for instance icsk_rto is reset
without taking backoff into account.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251013145926.833198-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 08:56:30 -07:00
Frank Li
6378e25ee1 dt-bindings: net: dsa: nxp,sja1105: Add optional clock
Add optional clock for OSC_IN and fix the below CHECK_DTBS warnings:
  arch/arm/boot/dts/nxp/imx/imx6qp-prtwd3.dtb: switch@0 (nxp,sja1105q): Unevaluated properties are not allowed ('clocks' was unexpected)

Signed-off-by: Frank Li <Frank.Li@nxp.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251010183418.2179063-1-Frank.Li@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-15 08:55:40 -07:00
Andrii Nakryiko
48a97ffc6c bpf: Consistently use bpf_rcu_lock_held() everywhere
We have many places that open-code what is now the bpf_rcu_lock_held()
macro, so replace all those places with a clean and short macro invocation.
For that, move the bpf_rcu_lock_held() macro into include/linux/bpf.h.
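
For context, the open-coded check being consolidated is roughly the following
(a sketch; the exact macro body in include/linux/bpf.h may differ):

#define bpf_rcu_lock_held()                                             \
        (rcu_read_lock_held() || rcu_read_lock_trace_held() ||         \
         rcu_read_lock_bh_held())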

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20251014201403.4104511-1-andrii@kernel.org
2025-10-15 12:26:12 +02:00
Jakub Kicinski
6033d2a246 Merge branch 'net-airoha-add-some-new-ethtool-bits'
Lorenzo Bianconi says:

====================
net: airoha: Add some new ethtool bits

- add missing stats to ethtool ethtool_eth_mac_stats struct
- set get_link ethtool callback to ethtool_op_get_link routine
====================

Link: https://patch.msgid.link/20251013-airoha-ethtool-improvements-v1-0-fdd1c6fc9be1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-14 19:35:51 -07:00
Lorenzo Bianconi
fc4fed9054 net: airoha: Add get_link ethtool callback
Set get_link ethtool callback to ethtool_op_get_link routine.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251013-airoha-ethtool-improvements-v1-2-fdd1c6fc9be1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-14 19:35:45 -07:00
Lorenzo Bianconi
331f8a8bea net: airoha: Add missing stats to ethtool_eth_mac_stats
Add the following stats to the ethtool ethtool_eth_mac_stats structure:
- FramesTransmittedOK
- OctetsTransmittedOK
- FramesReceivedOK
- OctetsReceivedOK
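
A hedged sketch of a get_eth_mac_stats callback filling those counters (the
hardware counter accessors are hypothetical placeholders, not the airoha API):

static void airoha_get_eth_mac_stats_sketch(struct net_device *dev,
                                            struct ethtool_eth_mac_stats *s)
{
        s->FramesTransmittedOK = hw_read_tx_frames(dev);  /* hypothetical */
        s->OctetsTransmittedOK = hw_read_tx_octets(dev);  /* hypothetical */
        s->FramesReceivedOK    = hw_read_rx_frames(dev);  /* hypothetical */
        s->OctetsReceivedOK    = hw_read_rx_octets(dev);  /* hypothetical */
}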

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251013-airoha-ethtool-improvements-v1-1-fdd1c6fc9be1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-14 19:35:45 -07:00
Denis Benato
c3527eeb65 eth: fealnx: fix typo in comments
There are a few typos in comments:
 - replace "avilable" with "available"
 - replace "mutlicast" with "multicast"

Signed-off-by: Denis Benato <benato.denis96@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251013183632.1226627-1-benato.denis96@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-14 19:18:45 -07:00
Heiner Kallweit
10c4b4f60f net: mdio: use macro module_driver to avoid boilerplate code
Use macro module_driver to avoid boilerplate code.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/e5c37417-4984-4b57-8154-264deef61e0d@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-14 19:16:13 -07:00
Jonas Gorski
bdec4271e8 net: dsa: b53: implement port isolation support
Implement port isolation support via the Protected Ports register.

Protected ports can only communicate with unprotected ports, but not
with each other, matching the expected behaviour of isolated ports.

Tested on BCM963268BU.

Signed-off-by: Jonas Gorski <jonas.gorski@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://patch.msgid.link/20251013152834.100169-1-jonas.gorski@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-14 13:33:36 -07:00
Alok Tiwari
e0aa115271 eth: fbnic: fix various typos in comments and strings
Fix several minor typos and grammatical errors in comments and log
messages (in the fbnic firmware, PCI, and time modules).

Changes include:
 - "cordeump" -> "coredump"
 - "of" -> "off" in RPC config comment
 - "healty" -> "healthy" in firmware heartbeat comment
 - "Firmware crashed detected!" -> "Firmware crash detected!"
 - "The could be caused" -> "This could be caused"
 - "lockng" -> "locking" in fbnic_time.c

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20251013160507.768820-1-alok.a.tiwari@oracle.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-14 12:16:17 -07:00
Alok Tiwari
0513a3f97b net: bridge: correct debug message function name in br_fill_ifinfo
The debug message in br_fill_ifinfo() incorrectly refers to br_fill_info
instead of the actual function name. Update it for clarity in debugging
output.

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20251013100121.755899-1-alok.a.tiwari@oracle.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-10-14 12:13:36 -07:00
Jan Vaclav
16a2206354 net/hsr: add protocol version to fill_info output
Currently, it is possible to configure IFLA_HSR_VERSION, but
there is no way to check in userspace what the currently
configured HSR protocol version is.

Add it to the output of hsr_fill_info(), when the interface
is using the HSR protocol. Let's not expose it when using
the PRP protocol, since it only has one version and it's
not possible to set it from userspace.

This info could then be used by e.g. ip(8), like so:
$ ip -d link show hsr0
12: hsr0: <BROADCAST,MULTICAST> mtu ...
    ...
    hsr slave1 veth0 slave2 veth1 ... proto 0 version 1
Reviewed-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Jan Vaclav <jvaclav@redhat.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20251009210903.1055187-6-jvaclav@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-14 15:09:01 +02:00
Byungchul Park
53615ad26e netmem: replace __netmem_clear_lsb() with netmem_to_nmdesc()
Now that we have struct netmem_desc, it'd be better to access the pp fields
via struct netmem_desc rather than struct net_iov.

Introduce netmem_to_nmdesc() for safely converting a netmem_ref to a
netmem_desc regardless of the underlying type, i.e. netmem_desc or net_iov.

While at it, remove __netmem_clear_lsb() and use netmem_to_nmdesc()
instead.

Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Byungchul Park <byungchul@sk.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Link: https://patch.msgid.link/20251013044133.69472-1-byungchul@sk.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-14 13:37:26 +02:00
Kriish Sharma
3dacc900c0 hdlc_ppp: fix potential null pointer in ppp_cp_event logging
drivers/net/wan/hdlc_ppp.c: In function ‘ppp_cp_event’:
drivers/net/wan/hdlc_ppp.c:353:17: warning: ‘%s’ directive argument is null [-Wformat-overflow=]
  353 |                 netdev_info(dev, "%s down\n", proto_name(pid));
      |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/wan/hdlc_ppp.c:342:17: warning: ‘%s’ directive argument is null [-Wformat-overflow=]
  342 |                 netdev_info(dev, "%s up\n", proto_name(pid));
      |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Update proto_name() to return "LCP" by default instead of NULL.
This change silences the compiler without changing existing behavior
and removes the need for the local 'pname' variable in ppp_cp_event.

Suggested-by: Krzysztof Hałasa <khalasa@piap.pl>
Signed-off-by: Kriish Sharma <kriish.sharma2006@gmail.com>
Acked-by: Krzysztof Hałasa <khalasa@piap.pl>
Link: https://patch.msgid.link/20251013014319.1608706-1-kriish.sharma2006@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-10-14 12:40:49 +02:00
Alexei Starovoitov
39e9d5f630 Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf before 6.18-rc1
Cross-merge BPF and other fixes after downstream PR.

No conflicts.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-11 18:27:47 -07:00
Mykyta Yatsenko
4c97c4b149 bpf: Extract internal structs validation logic into helpers
The arraymap and hashtab duplicate the logic that checks for and frees
internal structs (timer, workqueue, task_work) based on
BTF record flags. Centralize this by introducing two helpers:

  * bpf_map_has_internal_structs(map)
Returns true if the map value contains any of the internal structs:
    BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK.

  * bpf_map_free_internal_structs(map, obj)
    Frees the internal structs for a single value object.

Convert arraymap and both the prealloc/malloc hashtab paths to use the
new generic functions. This keeps the functionality for when/how to free
these special fields in one place and makes it easier to add support for
new internal structs in the future without touching every map
implementation.
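
A rough sketch of what the two helpers could look like (assumed shapes; the
real signatures and the record accessors used may differ):

static inline bool bpf_map_has_internal_structs(struct bpf_map *map)
{
        /* true if the map value embeds a bpf_timer, bpf_wq or bpf_task_work */
        return btf_record_has_field(map->record,
                                    BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK);
}

static void bpf_map_free_internal_structs(struct bpf_map *map, void *obj)
{
        if (btf_record_has_field(map->record, BPF_TIMER))
                bpf_obj_free_timer(map->record, obj);
        /* ... likewise for BPF_WORKQUEUE and BPF_TASK_WORK ... */
}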

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251010164606.147298-3-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-10 11:13:28 -07:00
Mykyta Yatsenko
bca2b74ea9 selftests/bpf: Add more bpf_wq tests
Add bpf_wq selftests to verify:
 * BPF program using non-constant offset of struct bpf_wq is rejected
 * BPF program using map with no BTF for storing struct bpf_wq is rejected

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Tested-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251010164606.147298-2-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-10 11:12:51 -07:00
Mykyta Yatsenko
5f8d411729 bpf: Fix handling maps with no BTF and non-constant offsets for the bpf_wq
Fix handling maps with no BTF and non-constant offsets for the bpf_wq.

This de-duplicates logic with other internal structs (task_work, timer),
keeps error reporting consistent, and makes future changes to the layout
handling centralized.

Fixes: d940c9b94d ("bpf: add support for KF_ARG_PTR_TO_WORKQUEUE")
Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251010164606.147298-1-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-10 11:08:10 -07:00
Martin KaFai Lau
7dc484fe48 Merge branch 'support-non-linear-skbs-for-bpf_prog_test_run'
Paul Chaignon says:

====================
Support non-linear skbs for BPF_PROG_TEST_RUN

This patchset adds support for non-linear skbs when running tc programs
with BPF_PROG_TEST_RUN.

We've had multiple bugs in the past few years in Cilium caused by
missing calls to bpf_skb_pull_data(). Daniel suggested to support
non linear skb in BPF_PROG_TEST_RUN to uncover these bugs in
our BPF tests.

Changes in v8:
  - Fix uninitialized data pointer spotted by Martin.
  - Error out in test_loader if __linear_size tag is used on unsupported
    program types.
Changes in v7:
  - Refactor use of 'size' variable as suggested by Martin.
  - Support copying back the non-linear area to data_out.
  - Minor code changes for readability, suggested by Martin.
Changes in v6:
  - Disallow non-linear skb in prog_run_skb only for LWT programs
    instead of all non-L2 program types, on suggestion from Martin.
  - Reject non-null ctx->data and ctx->data_meta, as suggested by Amery.
  - Bound linear_size to 'PAGE_SIZE - headroom - tailroom' to be
    consistent with prog_run_xdp, as suggested by Martin.
  - Allocate exactly linear_size bytes in bpf_test_init, spotted by
    Martin.
  - Fix wrong conflict resolution on double-free fix, spotted by Amery.
  - Rebased.
Changes in v5:
  - Fix double free on data in first patch.
Changes in v4:
  - Per Martin's suggestion, follow the XDP code pattern and use
    bpf_test_init only to initialize the linear area. That way data is
    directly copied to the right areas and we avoid the call to
    __pskb_pull_tail.
  - Fixed outdated commit descriptions.
  - Rebased.
Changes in v3:
  - Dropped BPF_F_TEST_SKB_NON_LINEAR and used the ctx->data_end to
    determine if the user wants non-linear skb, as suggested by Amery.
  - Introduced a second commit with a bit of refactoring to allow for
    the above requested change.
  - Fix bug found by syzkaller on third commit.
  - Rebased.
Changes in v2:
  - Made the linear size configurable via ctx->data_end, as suggested
    by Amery.
  - Reworked the selftests to allow testing the configurable linear
    size.
  - Fix warnings reported by kernel test robot on first commit.
  - Rebased.
====================

Link: https://patch.msgid.link/cover.1760037899.git.paul.chaignon@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2025-10-10 10:43:04 -07:00
Paul Chaignon
bc3eeb4259 selftests/bpf: Test direct packet access on non-linear skbs
This patch adds new selftests in the direct packet access suite, to
cover the non-linear case. The first six tests cover the behavior of
the bounds check with a non-linear skb. The last test adds a call to
bpf_skb_pull_data() to be able to access the packet.

Note that the size of the linear area includes the L2 header, but for
some program types like cgroup_skb, ctx->data points to the L3 header.
Therefore, a linear area of 22 bytes will have only 8 bytes accessible
to the BPF program (22 - ETH_HLEN). For that reason, the cgroup_skb test
cases access the packet at an offset of 8 bytes.
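
As a reference for the pattern these tests exercise, a minimal tc program
that pulls data before direct packet access might look like this (a sketch,
not one of the actual selftests):

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int pull_then_parse(struct __sk_buff *skb)
{
        void *data, *data_end;
        struct ethhdr *eth;

        /* Make sure the bytes we want are in the linear area; with a
         * non-linear skb, data_end - data can otherwise be smaller than
         * expected and the bounds check below would always fail. */
        if (bpf_skb_pull_data(skb, sizeof(*eth)))
                return TC_ACT_SHOT;

        data = (void *)(long)skb->data;
        data_end = (void *)(long)skb->data_end;
        eth = data;
        if ((void *)(eth + 1) > data_end)
                return TC_ACT_SHOT;

        return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";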

Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/ceedbfd719e58f0d49dcceb8592f5e6bd38ce5fe.1760037899.git.paul.chaignon@gmail.com
2025-10-10 10:43:04 -07:00
Paul Chaignon
8d45d0398d selftests/bpf: Support non-linear flag in test loader
This patch adds support for a new tag __linear_size in the test loader,
to specify the size of the linear area in case of non-linear skbs. If
the tag is absent or null, a linear skb is crafted.

Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/7ad928ec7591daef4f1b84032aeb86c918b3e5a7.1760037899.git.paul.chaignon@gmail.com
2025-10-10 10:43:04 -07:00
Paul Chaignon
838baa351c bpf: Craft non-linear skbs in BPF_PROG_TEST_RUN
This patch adds support for crafting non-linear skbs in BPF test runs
for tc programs. The size of the linear area is given by ctx->data_end,
with a minimum of ETH_HLEN always pulled in the linear area. If ctx or
ctx->data_end is null, a linear skb is used.

This is particularly useful to test support for non-linear skbs in large
codebases such as Cilium. We've had multiple bugs in the past few years
where we were missing calls to bpf_skb_pull_data(). This support in
BPF_PROG_TEST_RUN would allow us to automatically cover this case in our
BPF tests.

LWT program types are currently excluded in this patch. Allowing
non-linear skbs for these programs would require a bit more care because
they are able to call helpers (e.g., bpf_clone_redirect, bpf_redirect)
that themselves call eth_type_trans(). eth_type_trans() assumes there
are at least ETH_HLEN bytes in the linear area. That may not be true
for LWT programs as we already pulled the L2 header via the
eth_type_trans() call in bpf_prog_test_run_skb().

In addition to the selftests introduced later in the series, this patch
was tested by enabling non-linear skbs for all tc selftest programs
and checking that the test failures were expected.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Tested-by: syzbot@syzkaller.appspotmail.com
Link: https://patch.msgid.link/5694d4d1af31bddf974afcb1bbb1e28b8998dcd0.1760037899.git.paul.chaignon@gmail.com
2025-10-10 10:43:03 -07:00
Paul Chaignon
57bb2f6717 bpf: Reorder bpf_prog_test_run_skb initialization
This patch reorders the initialization of bpf_prog_test_run_skb to
simplify the subsequent patch. Program types are checked first, followed
by the ctx init, and finally the data init. With the subsequent patch,
program types and the ctx init provide information that is used in the
data init. Thus, we need the data init to happen last.

Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/063475176f15828a882c07846017394baf72f682.1760037899.git.paul.chaignon@gmail.com
2025-10-10 10:43:03 -07:00
Paul Chaignon
d8accf661f bpf: Refactor cleanup of bpf_prog_test_run_skb
This bit of refactoring aims to simplify how we free memory in
bpf_prog_test_run_skb to avoid code duplication.

Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Tested-by: syzbot@syzkaller.appspotmail.com
Link: https://patch.msgid.link/8971e01ae87b84f5af6b8b40defd3c310faf1c0f.1760037899.git.paul.chaignon@gmail.com
2025-10-10 10:43:03 -07:00
Alexei Starovoitov
2e36338df4 Merge branch 'add-kfuncs-bpf_strcasestr-and-bpf_strncasestr'
Rong Tao says:

====================
Add kfuncs bpf_strcasestr and bpf_strncasestr

From: Rong Tao <rongtao@cestc.cn>

Add kfuncs bpf_strcasestr and bpf_strncasestr, which are extensions of
bpf_strstr and bpf_strnstr, suitable for more scenarios.

v4: Fix wrong comment.
v3: keep __bpf_strnstr() static and compress some tests.
    https://lore.kernel.org/lkml/tencent_6E59062E4249590597452A06AFCDA3098808@qq.com/
v2: remove extra __bpf_kfunc and fix comment of bpf_strncasestr().
    https://lore.kernel.org/all/tencent_6D228941AB904DD6E1E58C8ACDEBEC280C06@qq.com/
v1: https://lore.kernel.org/all/tencent_8AF4D15B4475031E2185ACDE4B1495995707@qq.com/
====================

Link: https://patch.msgid.link/tencent_98ABC9680EA2A20198904316DAE5A84AD906@qq.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-10 10:05:58 -07:00
Rong Tao
eca0b643ef selftests/bpf: Test bpf_strcasestr,bpf_strncasestr kfuncs
Add tests for new kfuncs bpf_strcasestr() and bpf_strncasestr().

Signed-off-by: Rong Tao <rongtao@cestc.cn>
Link: https://lore.kernel.org/r/tencent_4F1A340A8966155C52AA9CBDB68FD221FE0A@qq.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-10 10:05:32 -07:00
Rong Tao
b5b693f735 bpf: add bpf_strcasestr,bpf_strncasestr kfuncs
The bpf_strcasestr() and bpf_strncasestr() functions behave the same as
bpf_strstr() and bpf_strnstr(), except that they ignore the case of the
characters.

Signed-off-by: Rong Tao <rongtao@cestc.cn>
Link: https://lore.kernel.org/r/tencent_B01165355D42A8B8BF5E8D0A21EE1A88090A@qq.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-10 10:05:32 -07:00
Alexei Starovoitov
17566cf0e3 Merge branch 'fix-sleepable-context-tracking-for-async-callbacks'
Kumar Kartikeya Dwivedi says:

====================
Fix sleepable context tracking for async callbacks

Currently, asynchronous execution primitives set up their callback
execution simulation using push_async_cb, which will end up inheriting
the sleepable or non-sleepable bit from the program triggering the
simulation of the callback. This is incorrect, as the actual execution
context of the asynchronous callback has nothing to do with the program
arming its execution.

This set fixes this oversight, and supplies a few test cases ensuring
the correct behavior is tested across different types of primitives
(i.e. timer, wq, task_work).

While looking at this bug, it was noticed that the GFP flag setting
logic for storage_get helpers is also broken, hence fix it while we
are at it.

PSA: These fixes and unit tests were primarily produced by prompting an
AI assistant (Claude), and then modified in minor ways, in an exercise
to understand how useful it can be at general kernel development tasks.

Changelog:
----------
v1 -> v2
v1: https://lore.kernel.org/bpf/20251007014310.2889183-1-memxor@gmail.com

 * Squash fix for GFP flags into 1st commit. (Eduard)
 * Add a commit refactoring func_atomic to non_sleepable, make it
   generic, also set for kfuncs in addition to helpers. (Eduard)
 * Leave selftest as-is, coverage for global subprogs calling sleepable
   kfuncs or helpers is provided in rcu_read_lock.c.
====================

Link: https://patch.msgid.link/20251007220349.3852807-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-10 10:04:52 -07:00
Kumar Kartikeya Dwivedi
5b1b5d380a selftests/bpf: Add tests for async cb context
Add tests to verify that async callback's sleepable attribute is
correctly determined by the callback type, not the arming program's
context, reflecting its true execution context.

Introduce verifier_async_cb_context.c with tests for all three async
callback primitives: bpf_timer, bpf_wq, and bpf_task_work. Each
primitive is tested when armed from both sleepable (lsm.s/file_open) and
non-sleepable (fentry) programs.

Test coverage:
- bpf_timer callbacks: Verify they are never sleepable, even when armed
  from sleepable programs. Both tests should fail when attempting to use
  sleepable helper bpf_copy_from_user() in the callback.

- bpf_wq callbacks: Verify they are always sleepable, even when armed
  from non-sleepable programs. Both tests should succeed when using
  sleepable helpers in the callback.

- bpf_task_work callbacks: Verify they are always sleepable, even when
  armed from non-sleepable programs. Both tests should succeed when
  using sleepable helpers in the callback.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251007220349.3852807-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-10 10:04:51 -07:00
Kumar Kartikeya Dwivedi
f233d48559 bpf: Refactor storage_get_func_atomic to generic non_sleepable flag
Rename the storage_get_func_atomic flag to a more generic non_sleepable
flag that tracks whether a helper or kfunc may be called from a
non-sleepable context. This makes the flag more broadly applicable
beyond just storage_get helpers. See [0] for more context.

The flag is now set unconditionally for all helpers and kfuncs when:
- RCU critical section is active.
- Preemption is disabled.
- IRQs are disabled.
- In a non-sleepable context within a sleepable program (e.g., timer
  callbacks), which is indicated by !in_sleepable().

Previously, the flag was only set for storage_get helpers in these
contexts. With this change, it can be used by any code that needs to
differentiate between sleepable and non-sleepable contexts at the
per-instruction level.

The existing usage in do_misc_fixups() for storage_get helpers is
preserved by checking is_storage_get_function() before using the flag.

  [0]: https://lore.kernel.org/bpf/CAP01T76cbaNi4p-y8E0sjE2NXSra2S=Uja8G4hSQDu_SbXxREQ@mail.gmail.com

Cc: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Mykyta Yatsenko <mykyta.yatsenko5@gmail.com>
Link: https://lore.kernel.org/r/20251007220349.3852807-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-10 10:04:51 -07:00
Kumar Kartikeya Dwivedi
469d638d15 bpf: Fix sleepable context for async callbacks
Fix the BPF verifier to correctly determine the sleepable context of
async callbacks based on the async primitive type rather than the arming
program's context.

The bug is in in_sleepable() which uses OR logic to check if the current
execution context is sleepable. When a sleepable program arms a timer
callback, the callback's state correctly has in_sleepable=false, but
in_sleepable() would still return true due to env->prog->sleepable being
true. This incorrectly allows sleepable helpers like
bpf_copy_from_user() inside timer callbacks when armed from sleepable
programs, even though timer callbacks always execute in non-sleepable
context.

Fix in_sleepable() to rely solely on env->cur_state->in_sleepable, and
initialize state->in_sleepable to env->prog->sleepable in
do_check_common() for the main program entry. This ensures the sleepable
context is properly tracked per verification state rather than being
overridden by the program's sleepability.

The env->cur_state NULL check in in_sleepable() was only needed for
do_misc_fixups() which runs after verification when env->cur_state is
set to NULL. Update do_misc_fixups() to use env->prog->sleepable
directly for the storage_get_function check, and remove the redundant
NULL check from in_sleepable().
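
The simplified check described above amounts to roughly the following (a
sketch based on this description, not the verbatim verifier code):

static bool in_sleepable(const struct bpf_verifier_env *env)
{
        /* the per-state flag fully captures the execution context; it is
         * seeded from env->prog->sleepable in do_check_common() */
        return env->cur_state->in_sleepable;
}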

Introduce is_async_cb_sleepable() helper to explicitly determine async
callback sleepability based on the primitive type:
  - bpf_timer callbacks are never sleepable
  - bpf_wq and bpf_task_work callbacks are always sleepable

Add verifier_bug() check to catch unhandled async callback types,
ensuring future additions cannot be silently mishandled. Move the
is_task_work_add_kfunc() forward declaration to the top alongside other
callback-related helpers. We update push_async_cb() to adjust to the new
changes.

At the same time, while simplifying in_sleepable(), we notice a problem
in do_misc_fixups. Fix storage_get helpers to use GFP_ATOMIC when called
from non-sleepable contexts within sleepable programs, such as bpf_timer
callbacks.

Currently, the check in do_misc_fixups assumes that env->prog->sleepable
(previously in_sleepable(env), which resolved to this same check before the
last commit) holds across the program's execution, but that is not true.
Instead, the func_atomic bit must be set whenever we see the function
being called in an atomic context. Previously, this was only done when
the helper was invoked in an atomic context in a sleepable program; we can
simply set the value to true without doing an in_sleepable() check.

We must also do a standalone in_sleepable() check to handle cases where
the async callback itself is armed from a sleepable program, but is
itself non-sleepable (e.g., timer callback) and invokes such a helper,
thus needing the func_atomic bit to be true for the said call.

Adjust do_misc_fixups() to drop any checks regarding sleepable nature of
the program, and just depend on the func_atomic bit to decide which GFP
flag to pass.

Fixes: 81f1d7a583 ("bpf: wq: add bpf_wq_set_callback_impl")
Fixes: b00fa38a9c ("bpf: Enable non-atomic allocations in local storage")
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251007220349.3852807-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-10 10:04:51 -07:00
Siddharth Chintamaneni
56b4d16239 bpf: Cleanup unused func args in rqspinlock implementation
Clean up unused function args in the check_deadlock* functions.

Fixes: 31158ad02d ("rqspinlock: Add deadlock detection and recovery")
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20251001172702.122838-1-sidchintamaneni@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-10-07 15:30:43 -07:00
Fushuai Wang
0db4941d9d bpf: Use rcu_read_lock_dont_migrate in bpf_sk_storage.c
Use rcu_read_lock_dont_migrate() and rcu_read_unlock_migrate() in
bpf_sk_storage.c to obtain better performance when PREEMPT_RCU is
not enabled.

Signed-off-by: Fushuai Wang <wangfushuai@baidu.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20251007074011.12916-1-wangfushuai@baidu.com
2025-10-07 11:53:29 -07:00
2197 changed files with 82255 additions and 39184 deletions

View File

@@ -106,13 +106,6 @@ Description:
will be discarded from the cache. Should not be turned off with
writeback caching enabled.
What: /sys/block/<disk>/bcache/discard
Date: November 2010
Contact: Kent Overstreet <kent.overstreet@gmail.com>
Description:
For a cache, a boolean allowing discard/TRIM to be turned off
or back on if the device supports it.
What: /sys/block/<disk>/bcache/bucket_size
Date: November 2010
Contact: Kent Overstreet <kent.overstreet@gmail.com>

View File

@@ -17,8 +17,7 @@ The latest bcache kernel code can be found from mainline Linux kernel:
It's designed around the performance characteristics of SSDs - it only allocates
in erase block sized buckets, and it uses a hybrid btree/log to track cached
extents (which can be anywhere from a single sector to the bucket size). It's
designed to avoid random writes at all costs; it fills up an erase block
sequentially, then issues a discard before reusing it.
designed to avoid random writes at all costs.
Both writethrough and writeback caching are supported. Writeback defaults to
off, but can be switched on and off arbitrarily at runtime. Bcache goes to
@@ -618,19 +617,11 @@ bucket_size
cache_replacement_policy
One of either lru, fifo or random.
discard
Boolean; if on a discard/TRIM will be issued to each bucket before it is
reused. Defaults to off, since SATA TRIM is an unqueued command (and thus
slow).
freelist_percent
Size of the freelist as a percentage of nbuckets. Can be written to to
increase the number of buckets kept on the freelist, which lets you
artificially reduce the size of the cache at runtime. Mostly for testing
purposes (i.e. testing how different size caches affect your hit rate), but
since buckets are discarded when they move on to the freelist will also make
the SSD's garbage collection easier by effectively giving it more reserved
space.
purposes (i.e. testing how different size caches affect your hit rate).
io_errors
Number of errors that have occurred, decayed by io_error_halflife.

View File

@@ -68,30 +68,43 @@ The options available for the add command can be listed by reading the
In more details, the options that can be used with the "add" command are as
follows.
================ ===========================================================
id Device number (the X in /dev/zloopX).
Default: automatically assigned.
capacity_mb Device total capacity in MiB. This is always rounded up to
the nearest higher multiple of the zone size.
Default: 16384 MiB (16 GiB).
zone_size_mb Device zone size in MiB. Default: 256 MiB.
zone_capacity_mb Device zone capacity (must always be equal to or lower than
the zone size. Default: zone size.
conv_zones Total number of conventional zones starting from sector 0.
Default: 8.
base_dir Path to the base directory where to create the directory
containing the zone files of the device.
Default=/var/local/zloop.
The device directory containing the zone files is always
named with the device ID. E.g. the default zone file
directory for /dev/zloop0 is /var/local/zloop/0.
nr_queues Number of I/O queues of the zoned block device. This value is
always capped by the number of online CPUs
Default: 1
queue_depth Maximum I/O queue depth per I/O queue.
Default: 64
buffered_io Do buffered IOs instead of direct IOs (default: false)
================ ===========================================================
=================== =========================================================
id Device number (the X in /dev/zloopX).
Default: automatically assigned.
capacity_mb Device total capacity in MiB. This is always rounded up
to the nearest higher multiple of the zone size.
Default: 16384 MiB (16 GiB).
zone_size_mb Device zone size in MiB. Default: 256 MiB.
zone_capacity_mb Device zone capacity (must always be equal to or lower
than the zone size. Default: zone size.
conv_zones Total number of conventional zones starting from
sector 0
Default: 8
base_dir Path to the base directory where to create the directory
containing the zone files of the device.
Default=/var/local/zloop.
The device directory containing the zone files is always
named with the device ID. E.g. the default zone file
directory for /dev/zloop0 is /var/local/zloop/0.
nr_queues Number of I/O queues of the zoned block device. This
value is always capped by the number of online CPUs
Default: 1
queue_depth Maximum I/O queue depth per I/O queue.
Default: 64
buffered_io Do buffered IOs instead of direct IOs (default: false)
zone_append Enable or disable a zloop device native zone append
support.
Default: 1 (enabled).
If native zone append support is disabled, the block layer
will emulate this operation using regular write
operations.
ordered_zone_append Enable zloop mitigation of zone append reordering.
Default: disabled.
This is useful for testing file systems' file data mapping
(extents), as when enabled, this can significantly reduce
the number of data extents needed for a file data
mapping.
=================== =========================================================
3) Deleting a Zoned Device
--------------------------

View File

@@ -238,6 +238,16 @@ All md devices contain:
the number of devices in a raid4/5/6, or to support external
metadata formats which mandate such clipping.
logical_block_size
Configure the array's logical block size in bytes. This attribute
is only supported for 1.x metadata. Write the value before starting the
array. The final array LBS uses the maximum of this configuration and
the LBS of all member devices. Note that the LBS cannot exceed PAGE_SIZE
until RAID supports folios.
WARNING: Arrays created on a new kernel cannot be assembled on an old
kernel due to the padding check. Set the module parameter
'check_new_feature' to false to bypass this, but data loss may occur.
reshape_position
This is either ``none`` or a sector number within the devices of
the array where ``reshape`` is up to. If this is set, the three

View File

@@ -212,6 +212,14 @@ mem_pcpu_rsv
Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU.
bypass_prot_mem
---------------
Skip charging socket buffers to the global per-protocol memory
accounting controlled by net.ipv4.tcp_mem, net.ipv4.udp_mem, etc.
Default: 0 (off)
rmem_default
------------
@@ -347,9 +355,9 @@ skb_defer_max
-------------
Max size (in skbs) of the per-cpu list of skbs being freed
by the cpu which allocated them. Used by TCP stack so far.
by the cpu which allocated them.
Default: 64
Default: 128
optmem_max
----------
@@ -406,6 +414,23 @@ to SOCK_TXREHASH_DEFAULT (i. e. not overridden by setsockopt).
If set to 1 (default), hash rethink is performed on listening socket.
If set to 0, hash rethink is not performed.
txq_reselection_ms
------------------
Controls how often (in ms) a busy connected flow can select another tx queue.
A reselection is desirable when/if the user thread has migrated and XPS
would select a different queue. The same can occur without XPS
if the flow hash has changed.
But switching txq can introduce reorders, especially if the
old queue is under high pressure. Modern TCP stacks deal
well with reorders as long as they do not happen too often.
To disable this feature, set the value to 0.
Default: 1000
gro_normal_batch
----------------

View File

@@ -100,10 +100,26 @@ described in more detail in the footnotes.
| | | ``uretprobe.s+`` [#uprobe]_ | Yes |
+ + +----------------------------------+-----------+
| | | ``usdt+`` [#usdt]_ | |
+ + +----------------------------------+-----------+
| | | ``usdt.s+`` [#usdt]_ | Yes |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_KPROBE_MULTI`` | ``kprobe.multi+`` [#kpmulti]_ | |
+ + +----------------------------------+-----------+
| | | ``kretprobe.multi+`` [#kpmulti]_ | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_KPROBE_SESSION`` | ``kprobe.session+`` [#kpmulti]_ | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_UPROBE_MULTI`` | ``uprobe.multi+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uprobe.multi.s+`` [#upmul]_ | Yes |
+ + +----------------------------------+-----------+
| | | ``uretprobe.multi+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uretprobe.multi.s+`` [#upmul]_ | Yes |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_UPROBE_SESSION`` | ``uprobe.session+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uprobe.session.s+`` [#upmul]_ | Yes |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_LIRC_MODE2`` | ``BPF_LIRC_MODE2`` | ``lirc_mode2`` | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
@@ -219,6 +235,8 @@ described in more detail in the footnotes.
non-negative integer.
.. [#ksyscall] The ``ksyscall`` attach format is ``ksyscall/<syscall>``.
.. [#uprobe] The ``uprobe`` attach format is ``uprobe[.s]/<path>:<function>[+<offset>]``.
.. [#upmul] The ``uprobe.multi`` attach format is ``uprobe.multi[.s]/<path>:<function-pattern>``
where ``function-pattern`` supports ``*`` and ``?`` wildcards.
.. [#usdt] The ``usdt`` attach format is ``usdt/<path>:<provider>:<name>``.
.. [#kpmulti] The ``kprobe.multi`` attach format is ``kprobe.multi/<pattern>`` where ``pattern``
supports ``*`` and ``?`` wildcards. Valid characters for pattern are

View File

@@ -15,8 +15,9 @@ of constant size. The size of the array is defined in ``max_entries`` at
creation time. All array elements are pre-allocated and zero initialized when
created. ``BPF_MAP_TYPE_PERCPU_ARRAY`` uses a different memory region for each
CPU whereas ``BPF_MAP_TYPE_ARRAY`` uses the same memory region. The value
stored can be of any size, however, all array elements are aligned to 8
bytes.
stored can be of any size for ``BPF_MAP_TYPE_ARRAY`` and not more than
``PCPU_MIN_UNIT_SIZE`` (32 kB) for ``BPF_MAP_TYPE_PERCPU_ARRAY``. All
array elements are aligned to 8 bytes.
Since kernel 5.5, memory mapping may be enabled for ``BPF_MAP_TYPE_ARRAY`` by
setting the flag ``BPF_F_MMAPABLE``. The map definition is page-aligned and

View File

@@ -17,6 +17,7 @@ properties:
compatible:
enum:
- airoha,en7581-eth
- airoha,an7583-eth
reg:
items:
@@ -44,6 +45,7 @@ properties:
- description: PDMA irq
resets:
minItems: 7
maxItems: 8
reset-names:
@@ -54,8 +56,9 @@ properties:
- const: xsi-mac
- const: hsi0-mac
- const: hsi1-mac
- const: hsi-mac
- enum: [ hsi-mac, xfp-mac ]
- const: xfp-mac
minItems: 7
memory-region:
items:
@@ -81,6 +84,36 @@ properties:
interface to implement hardware flow offloading programming Packet
Processor Engine (PPE) flow table.
allOf:
- $ref: ethernet-controller.yaml#
- if:
properties:
compatible:
contains:
enum:
- airoha,en7581-eth
then:
properties:
resets:
minItems: 8
reset-names:
minItems: 8
- if:
properties:
compatible:
contains:
enum:
- airoha,an7583-eth
then:
properties:
resets:
maxItems: 7
reset-names:
maxItems: 7
patternProperties:
"^ethernet@[1-4]$":
type: object

View File

@@ -18,6 +18,7 @@ properties:
compatible:
enum:
- airoha,en7581-npu
- airoha,an7583-npu
reg:
maxItems: 1

View File

@@ -0,0 +1,147 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/amd,xgbe-seattle-v1a.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: AMD XGBE Seattle v1a
maintainers:
- Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
allOf:
- $ref: /schemas/net/ethernet-controller.yaml#
properties:
compatible:
const: amd,xgbe-seattle-v1a
reg:
items:
- description: MAC registers
- description: PCS registers
- description: SerDes Rx/Tx registers
- description: SerDes integration registers (1/2)
- description: SerDes integration registers (2/2)
interrupts:
description: Device interrupts. The first entry is the general device
interrupt. If amd,per-channel-interrupt is specified, each DMA channel
interrupt must be specified. The last entry is the PCS auto-negotiation
interrupt.
minItems: 2
maxItems: 6
clocks:
items:
- description: DMA clock for the device
- description: PTP clock for the device
clock-names:
items:
- const: dma_clk
- const: ptp_clk
iommus:
maxItems: 1
phy-mode: true
dma-coherent: true
amd,per-channel-interrupt:
description: Indicates that Rx and Tx complete will generate a unique
interrupt for each DMA channel.
type: boolean
amd,speed-set:
description: >
Speed capabilities of the device.
0 = 1GbE and 10GbE
1 = 2.5GbE and 10GbE
$ref: /schemas/types.yaml#/definitions/uint32
enum: [0, 1]
amd,serdes-blwc:
description: Baseline wandering correction enablement for each speed.
$ref: /schemas/types.yaml#/definitions/uint32-array
minItems: 3
maxItems: 3
items:
enum: [0, 1]
amd,serdes-cdr-rate:
description: CDR rate speed selection for each speed.
$ref: /schemas/types.yaml#/definitions/uint32-array
items:
- description: CDR rate for 1GbE
- description: CDR rate for 2.5GbE
- description: CDR rate for 10GbE
amd,serdes-pq-skew:
description: PQ data sampling skew for each speed.
$ref: /schemas/types.yaml#/definitions/uint32-array
items:
- description: PQ skew for 1GbE
- description: PQ skew for 2.5GbE
- description: PQ skew for 10GbE
amd,serdes-tx-amp:
description: TX amplitude boost for each speed.
$ref: /schemas/types.yaml#/definitions/uint32-array
items:
- description: TX amplitude for 1GbE
- description: TX amplitude for 2.5GbE
- description: TX amplitude for 10GbE
amd,serdes-dfe-tap-config:
description: DFE taps available to run for each speed.
$ref: /schemas/types.yaml#/definitions/uint32-array
items:
- description: DFE taps available for 1GbE
- description: DFE taps available for 2.5GbE
- description: DFE taps available for 10GbE
amd,serdes-dfe-tap-enable:
description: DFE taps to enable for each speed.
$ref: /schemas/types.yaml#/definitions/uint32-array
items:
- description: DFE taps to enable for 1GbE
- description: DFE taps to enable for 2.5GbE
- description: DFE taps to enable for 10GbE
required:
- compatible
- reg
- interrupts
- clocks
- clock-names
- phy-mode
unevaluatedProperties: false
examples:
- |
ethernet@e0700000 {
compatible = "amd,xgbe-seattle-v1a";
reg = <0xe0700000 0x80000>,
<0xe0780000 0x80000>,
<0xe1240800 0x00400>,
<0xe1250000 0x00060>,
<0xe1250080 0x00004>;
interrupts = <0 325 4>,
<0 326 1>, <0 327 1>, <0 328 1>, <0 329 1>,
<0 323 4>;
amd,per-channel-interrupt;
clocks = <&xgbe_dma_clk>, <&xgbe_ptp_clk>;
clock-names = "dma_clk", "ptp_clk";
phy-mode = "xgmii";
mac-address = [ 02 a1 a2 a3 a4 a5 ];
amd,speed-set = <0>;
amd,serdes-blwc = <1>, <1>, <0>;
amd,serdes-cdr-rate = <2>, <2>, <7>;
amd,serdes-pq-skew = <10>, <10>, <30>;
amd,serdes-tx-amp = <15>, <15>, <10>;
amd,serdes-dfe-tap-config = <3>, <3>, <1>;
amd,serdes-dfe-tap-enable = <0>, <0>, <127>;
};

View File

@@ -1,76 +0,0 @@
* AMD 10GbE driver (amd-xgbe)
Required properties:
- compatible: Should be "amd,xgbe-seattle-v1a"
- reg: Address and length of the register sets for the device
- MAC registers
- PCS registers
- SerDes Rx/Tx registers
- SerDes integration registers (1/2)
- SerDes integration registers (2/2)
- interrupts: Should contain the amd-xgbe interrupt(s). The first interrupt
listed is required and is the general device interrupt. If the optional
amd,per-channel-interrupt property is specified, then one additional
interrupt for each DMA channel supported by the device should be specified.
The last interrupt listed should be the PCS auto-negotiation interrupt.
- clocks:
- DMA clock for the amd-xgbe device (used for calculating the
correct Rx interrupt watchdog timer value on a DMA channel
for coalescing)
- PTP clock for the amd-xgbe device
- clock-names: Should be the names of the clocks
- "dma_clk" for the DMA clock
- "ptp_clk" for the PTP clock
- phy-mode: See ethernet.txt file in the same directory
Optional properties:
- dma-coherent: Present if dma operations are coherent
- amd,per-channel-interrupt: Indicates that Rx and Tx complete will generate
a unique interrupt for each DMA channel - this requires an additional
interrupt be configured for each DMA channel
- amd,speed-set: Speed capabilities of the device
0 - 1GbE and 10GbE (default)
1 - 2.5GbE and 10GbE
The MAC address will be determined using the optional properties defined in
ethernet.txt.
The following optional properties are represented by an array with each
value corresponding to a particular speed. The first array value represents
the setting for the 1GbE speed, the second value for the 2.5GbE speed and
the third value for the 10GbE speed. All three values are required if the
property is used.
- amd,serdes-blwc: Baseline wandering correction enablement
0 - Off
1 - On
- amd,serdes-cdr-rate: CDR rate speed selection
- amd,serdes-pq-skew: PQ (data sampling) skew
- amd,serdes-tx-amp: TX amplitude boost
- amd,serdes-dfe-tap-config: DFE taps available to run
- amd,serdes-dfe-tap-enable: DFE taps to enable
Example:
xgbe@e0700000 {
compatible = "amd,xgbe-seattle-v1a";
reg = <0 0xe0700000 0 0x80000>,
<0 0xe0780000 0 0x80000>,
<0 0xe1240800 0 0x00400>,
<0 0xe1250000 0 0x00060>,
<0 0xe1250080 0 0x00004>;
interrupt-parent = <&gic>;
interrupts = <0 325 4>,
<0 326 1>, <0 327 1>, <0 328 1>, <0 329 1>,
<0 323 4>;
amd,per-channel-interrupt;
clocks = <&xgbe_dma_clk>, <&xgbe_ptp_clk>;
clock-names = "dma_clk", "ptp_clk";
phy-mode = "xgmii";
mac-address = [ 02 a1 a2 a3 a4 a5 ];
amd,speed-set = <0>;
amd,serdes-blwc = <1>, <1>, <0>;
amd,serdes-cdr-rate = <2>, <2>, <7>;
amd,serdes-pq-skew = <10>, <10>, <30>;
amd,serdes-tx-amp = <15>, <15>, <10>;
amd,serdes-dfe-tap-config = <3>, <3>, <1>;
amd,serdes-dfe-tap-enable = <0>, <0>, <127>;
};

View File

@@ -19,7 +19,12 @@ allOf:
properties:
compatible:
const: aspeed,ast2600-mdio
oneOf:
- const: aspeed,ast2600-mdio
- items:
- enum:
- aspeed,ast2700-mdio
- const: aspeed,ast2600-mdio
reg:
maxItems: 1

View File

@@ -0,0 +1,79 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/bluetooth/marvell,sd8897-bt.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Marvell 8897/8997 (sd8897/sd8997) bluetooth devices (SDIO)
maintainers:
- Ariel D'Alessandro <ariel.dalessandro@collabora.com>
allOf:
- $ref: /schemas/net/bluetooth/bluetooth-controller.yaml#
properties:
compatible:
enum:
- marvell,sd8897-bt
- marvell,sd8997-bt
reg:
maxItems: 1
interrupts:
maxItems: 1
marvell,cal-data:
$ref: /schemas/types.yaml#/definitions/uint8-array
description:
Calibration data downloaded to the device during initialization.
maxItems: 28
marvell,wakeup-pin:
$ref: /schemas/types.yaml#/definitions/uint16
description:
Wakeup pin number of the bluetooth chip. Used by the firmware to wake up
the host system.
marvell,wakeup-gap-ms:
$ref: /schemas/types.yaml#/definitions/uint16
description:
Wakeup latency of the host platform. Required by the chip sleep feature.
required:
- compatible
- reg
- interrupts
additionalProperties: false
examples:
- |
#include <dt-bindings/interrupt-controller/irq.h>
mmc {
vmmc-supply = <&wlan_en_reg>;
bus-width = <4>;
cap-power-off-card;
keep-power-in-suspend;
#address-cells = <1>;
#size-cells = <0>;
bluetooth@2 {
compatible = "marvell,sd8897-bt";
reg = <2>;
interrupt-parent = <&pio>;
interrupts = <119 IRQ_TYPE_LEVEL_LOW>;
marvell,cal-data = /bits/ 8 <
0x37 0x01 0x1c 0x00 0xff 0xff 0xff 0xff 0x01 0x7f 0x04 0x02
0x00 0x00 0xba 0xce 0xc0 0xc6 0x2d 0x00 0x00 0x00 0x00 0x00
0x00 0x00 0xf0 0x00>;
marvell,wakeup-pin = /bits/ 16 <0x0d>;
marvell,wakeup-gap-ms = /bits/ 16 <0x64>;
};
};
...

View File

@@ -14,7 +14,7 @@ Required properties:
Also, vendors that use btusb may have device additional properties, e.g:
Documentation/devicetree/bindings/net/marvell-bt-8xxx.txt
Documentation/devicetree/bindings/net/bluetooth/marvell,sd8897-bt.yaml
Optional properties:

View File

@@ -109,6 +109,26 @@ properties:
maximum: 32
minItems: 1
pinctrl-0:
description: Default pinctrl state
pinctrl-1:
description: Can be "sleep" or "wakeup" pinctrl state
pinctrl-2:
description: Can be "sleep" or "wakeup" pinctrl state
pinctrl-names:
description:
When present should contain at least "default" describing the default pin
states. Other states are "sleep" which describes the pinstate when
sleeping and "wakeup" describing the pins if wakeup is enabled.
minItems: 1
items:
- const: default
- enum: [ sleep, wakeup ]
- const: wakeup
power-domains:
description:
Power domain provider node and an args specifier containing
@@ -125,6 +145,11 @@ properties:
minItems: 1
maxItems: 2
wakeup-source:
$ref: /schemas/types.yaml#/definitions/phandle-array
description:
List of phandles to system idle states in which mcan can wakeup the system.
required:
- compatible
- reg

View File

@@ -49,6 +49,11 @@ properties:
Must be half or less of "clocks" frequency.
maximum: 20000000
gpio-controller: true
"#gpio-cells":
const: 2
required:
- compatible
- reg

View File

@@ -32,11 +32,15 @@ properties:
- description: AHB peripheral clock
- description: CAN bus clock
resets:
maxItems: 1
required:
- compatible
- reg
- interrupts
- clocks
- resets
additionalProperties: false
@@ -46,6 +50,7 @@ examples:
compatible = "microchip,mpfs-can";
reg = <0x2010c000 0x1000>;
clocks = <&clkcfg 17>, <&clkcfg 37>;
resets = <&clkcfg 17>;
interrupt-parent = <&plic>;
interrupts = <56>;
};

View File

@@ -38,7 +38,10 @@ properties:
- cdns,sam9x60-macb # Microchip sam9x60 SoC
- microchip,mpfs-macb # Microchip PolarFire SoC
- const: cdns,macb # Generic
- items:
- const: microchip,pic64gx-macb # Microchip PIC64GX SoC
- const: microchip,mpfs-macb # Microchip PolarFire SoC
- const: cdns,macb # Generic
- items:
- enum:
- atmel,sama5d3-macb # 10/100Mbit IP on Atmel sama5d3 SoCs
@@ -47,18 +50,19 @@ properties:
- const: cdns,macb # Generic
- enum:
- atmel,sama5d29-gem # GEM XL IP (10/100) on Atmel sama5d29 SoCs
- atmel,sama5d2-gem # GEM IP (10/100) on Atmel sama5d2 SoCs
- atmel,sama5d29-gem # GEM XL IP (10/100) on Atmel sama5d29 SoCs
- atmel,sama5d3-gem # Gigabit IP on Atmel sama5d3 SoCs
- atmel,sama5d4-gem # GEM IP (10/100) on Atmel sama5d4 SoCs
- cdns,np4-macb # NP4 SoC devices
- microchip,sama7g5-emac # Microchip SAMA7G5 ethernet interface
- microchip,sama7g5-gem # Microchip SAMA7G5 gigabit ethernet interface
- raspberrypi,rp1-gem # Raspberry Pi RP1 gigabit ethernet interface
- sifive,fu540-c000-gem # SiFive FU540-C000 SoC
- cdns,emac # Generic
- cdns,gem # Generic
- cdns,macb # Generic
- cdns,np4-macb # NP4 SoC devices
- microchip,sama7g5-emac # Microchip SAMA7G5 ethernet interface
- microchip,sama7g5-gem # Microchip SAMA7G5 gigabit ethernet interface
- mobileye,eyeq5-gem # Mobileye EyeQ5 SoCs
- raspberrypi,rp1-gem # Raspberry Pi RP1 gigabit ethernet interface
- sifive,fu540-c000-gem # SiFive FU540-C000 SoC
- items:
- enum:
@@ -183,6 +187,15 @@ allOf:
reg:
maxItems: 1
- if:
properties:
compatible:
contains:
const: mobileye,eyeq5-gem
then:
required:
- phys
unevaluatedProperties: false
examples:

View File

@@ -4,10 +4,14 @@
$id: http://devicetree.org/schemas/net/dsa/lantiq,gswip.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Lantiq GSWIP Ethernet switches
title: Lantiq GSWIP and MaxLinear GSW1xx Ethernet switches
allOf:
- $ref: dsa.yaml#/$defs/ethernet-ports
description:
Lantiq GSWIP and MaxLinear GSW1xx switches share the same hardware IP.
Lantiq switches are embedded in SoCs and accessed via memory-mapped I/O,
while MaxLinear switches are standalone ICs connected via MDIO.
$ref: dsa.yaml#
maintainers:
- Hauke Mehrtens <hauke@hauke-m.de>
@@ -18,9 +22,14 @@ properties:
- lantiq,xrx200-gswip
- lantiq,xrx300-gswip
- lantiq,xrx330-gswip
- maxlinear,gsw120
- maxlinear,gsw125
- maxlinear,gsw140
- maxlinear,gsw141
- maxlinear,gsw145
reg:
minItems: 3
minItems: 1
maxItems: 3
reg-names:
@@ -37,9 +46,6 @@ properties:
compatible:
const: lantiq,xrx200-mdio
required:
- compatible
gphy-fw:
type: object
properties:
@@ -91,10 +97,63 @@ properties:
additionalProperties: false
patternProperties:
"^(ethernet-)?ports$":
type: object
patternProperties:
"^(ethernet-)?port@[0-6]$":
$ref: dsa-port.yaml#
unevaluatedProperties: false
properties:
maxlinear,rmii-refclk-out:
type: boolean
description:
Configure the RMII reference clock to be a clock output
rather than an input. Only applicable for RMII mode.
tx-internal-delay-ps:
enum: [0, 500, 1000, 1500, 2000, 2500, 3000, 3500]
description:
RGMII TX Clock Delay defined in picoseconds.
The delay lines adjust the MII clock vs. data timing.
If this property is not present the delay is determined by
the interface mode.
rx-internal-delay-ps:
enum: [0, 500, 1000, 1500, 2000, 2500, 3000, 3500]
description:
RGMII RX Clock Delay defined in picoseconds.
The delay lines adjust the MII clock vs. data timing.
If this property is not present the delay is determined by
the interface mode.
required:
- compatible
- reg
allOf:
- if:
properties:
compatible:
contains:
enum:
- lantiq,xrx200-gswip
- lantiq,xrx300-gswip
- lantiq,xrx330-gswip
then:
properties:
reg:
minItems: 3
maxItems: 3
mdio:
required:
- compatible
else:
properties:
reg:
maxItems: 1
reg-names: false
gphy-fw: false
unevaluatedProperties: false
examples:
@@ -113,8 +172,10 @@ examples:
port@0 {
reg = <0>;
label = "lan3";
phy-mode = "rgmii";
phy-mode = "rgmii-id";
phy-handle = <&phy0>;
tx-internal-delay-ps = <2000>;
rx-internal-delay-ps = <2000>;
};
port@1 {
@@ -200,3 +261,90 @@ examples:
};
};
};
- |
#include <dt-bindings/leds/common.h>
mdio {
#address-cells = <1>;
#size-cells = <0>;
switch@1f {
compatible = "maxlinear,gsw125";
reg = <0x1f>;
ports {
#address-cells = <1>;
#size-cells = <0>;
port@0 {
reg = <0>;
label = "lan0";
phy-handle = <&switchphy0>;
phy-mode = "internal";
};
port@1 {
reg = <1>;
label = "lan1";
phy-handle = <&switchphy1>;
phy-mode = "internal";
};
port@4 {
reg = <4>;
label = "wan";
phy-mode = "1000base-x";
managed = "in-band-status";
};
port@5 {
reg = <5>;
phy-mode = "rgmii-id";
tx-internal-delay-ps = <2000>;
rx-internal-delay-ps = <2000>;
ethernet = <&eth0>;
fixed-link {
speed = <1000>;
full-duplex;
};
};
};
mdio {
#address-cells = <1>;
#size-cells = <0>;
switchphy0: switchphy@0 {
reg = <0>;
leds {
#address-cells = <1>;
#size-cells = <0>;
led@0 {
reg = <0>;
color = <LED_COLOR_ID_GREEN>;
function = LED_FUNCTION_LAN;
};
};
};
switchphy1: switchphy@1 {
reg = <1>;
leds {
#address-cells = <1>;
#size-cells = <0>;
led@0 {
reg = <0>;
color = <LED_COLOR_ID_GREEN>;
function = LED_FUNCTION_LAN;
};
};
};
};
};
};

View File

@@ -0,0 +1,167 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/dsa/motorcomm,yt921x.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Motorcomm YT921x Ethernet switch family
maintainers:
- David Yang <mmyangfl@gmail.com>
description: |
The Motorcomm YT921x series is a family of Ethernet switches with up to 8
internal GbE PHYs and up to 2 GMACs, including:
- YT9215S / YT9215RB / YT9215SC: 5 GbE PHYs (Port 0-4) + 2 GMACs (Port 8-9)
- YT9213NB: 2 GbE PHYs (Port 1/3) + 1 GMAC (Port 9)
- YT9214NB: 2 GbE PHYs (Port 1/3) + 2 GMACs (Port 8-9)
- YT9218N: 8 GbE PHYs (Port 0-7)
- YT9218MB: 8 GbE PHYs (Port 0-7) + 2 GMACs (Port 8-9)
Any port can be used as the CPU port.
properties:
compatible:
const: motorcomm,yt9215
reg:
enum: [0x0, 0x1d]
reset-gpios:
maxItems: 1
mdio:
$ref: /schemas/net/mdio.yaml#
unevaluatedProperties: false
description:
Internal MDIO bus for the internal GbE PHYs. PHY 0-7 are used for Port
0-7 respectively.
mdio-external:
$ref: /schemas/net/mdio.yaml#
unevaluatedProperties: false
description:
External MDIO bus to access external components. External PHYs for GMACs
(Port 8-9) are expected to be connected to the external MDIO bus in
the vendor's reference design, but that is not a hard limitation of the
chip.
required:
- compatible
- reg
allOf:
- $ref: dsa.yaml#/$defs/ethernet-ports
unevaluatedProperties: false
examples:
- |
#include <dt-bindings/gpio/gpio.h>
mdio {
#address-cells = <1>;
#size-cells = <0>;
switch@1d {
compatible = "motorcomm,yt9215";
/* default 0x1d, alternate 0x0 */
reg = <0x1d>;
reset-gpios = <&tlmm 39 GPIO_ACTIVE_LOW>;
mdio {
#address-cells = <1>;
#size-cells = <0>;
sw_phy0: phy@0 {
reg = <0x0>;
};
sw_phy1: phy@1 {
reg = <0x1>;
};
sw_phy2: phy@2 {
reg = <0x2>;
};
sw_phy3: phy@3 {
reg = <0x3>;
};
sw_phy4: phy@4 {
reg = <0x4>;
};
};
mdio-external {
#address-cells = <1>;
#size-cells = <0>;
phy1: phy@b {
reg = <0xb>;
};
};
ethernet-ports {
#address-cells = <1>;
#size-cells = <0>;
ethernet-port@0 {
reg = <0>;
label = "lan1";
phy-mode = "internal";
phy-handle = <&sw_phy0>;
};
ethernet-port@1 {
reg = <1>;
label = "lan2";
phy-mode = "internal";
phy-handle = <&sw_phy1>;
};
ethernet-port@2 {
reg = <2>;
label = "lan3";
phy-mode = "internal";
phy-handle = <&sw_phy2>;
};
ethernet-port@3 {
reg = <3>;
label = "lan4";
phy-mode = "internal";
phy-handle = <&sw_phy3>;
};
ethernet-port@4 {
reg = <4>;
label = "lan5";
phy-mode = "internal";
phy-handle = <&sw_phy4>;
};
/* CPU port */
ethernet-port@8 {
reg = <8>;
phy-mode = "2500base-x";
ethernet = <&eth0>;
fixed-link {
speed = <2500>;
full-duplex;
};
};
/* if external phy is connected to a MAC */
ethernet-port@9 {
reg = <9>;
label = "wan";
phy-mode = "rgmii-id";
phy-handle = <&phy1>;
};
};
};
};

View File

@@ -41,6 +41,9 @@ properties:
therefore discouraged.
maxItems: 1
clocks:
maxItems: 1
spi-cpha: true
spi-cpol: true

View File

@@ -0,0 +1,129 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/eswin,eic7700-eth.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Eswin EIC7700 SOC Eth Controller
maintainers:
- Shuang Liang <liangshuang@eswincomputing.com>
- Zhi Li <lizhi2@eswincomputing.com>
- Shangjuan Wei <weishangjuan@eswincomputing.com>
description:
Platform glue layer implementation for STMMAC Ethernet driver.
select:
properties:
compatible:
contains:
enum:
- eswin,eic7700-qos-eth
required:
- compatible
allOf:
- $ref: snps,dwmac.yaml#
properties:
compatible:
items:
- const: eswin,eic7700-qos-eth
- const: snps,dwmac-5.20
reg:
maxItems: 1
interrupts:
maxItems: 1
interrupt-names:
const: macirq
clocks:
items:
- description: AXI clock
- description: Configuration clock
- description: GMAC main clock
- description: Tx clock
clock-names:
items:
- const: axi
- const: cfg
- const: stmmaceth
- const: tx
resets:
maxItems: 1
reset-names:
items:
- const: stmmaceth
rx-internal-delay-ps:
enum: [0, 200, 600, 1200, 1600, 1800, 2000, 2200, 2400]
tx-internal-delay-ps:
enum: [0, 200, 600, 1200, 1600, 1800, 2000, 2200, 2400]
eswin,hsp-sp-csr:
description:
The HSP CSR controls and reports the status of high-speed peripherals
(such as Ethernet, USB and SATA) via registers, and can be used to tune
board-level PHY parameters.
$ref: /schemas/types.yaml#/definitions/phandle-array
items:
- items:
- description: Phandle to HSP(High-Speed Peripheral) device
- description: Offset of phy control register for internal
or external clock selection
- description: Offset of AXI clock controller Low-Power request
register
- description: Offset of register controlling TX/RX clock delay
required:
- compatible
- reg
- clocks
- clock-names
- interrupts
- interrupt-names
- phy-mode
- resets
- reset-names
- rx-internal-delay-ps
- tx-internal-delay-ps
- eswin,hsp-sp-csr
unevaluatedProperties: false
examples:
- |
ethernet@50400000 {
compatible = "eswin,eic7700-qos-eth", "snps,dwmac-5.20";
reg = <0x50400000 0x10000>;
clocks = <&d0_clock 186>, <&d0_clock 171>, <&d0_clock 40>,
<&d0_clock 193>;
clock-names = "axi", "cfg", "stmmaceth", "tx";
interrupt-parent = <&plic>;
interrupts = <61>;
interrupt-names = "macirq";
phy-mode = "rgmii-id";
phy-handle = <&phy0>;
resets = <&reset 95>;
reset-names = "stmmaceth";
rx-internal-delay-ps = <200>;
tx-internal-delay-ps = <200>;
eswin,hsp-sp-csr = <&hsp_sp_csr 0x100 0x108 0x118>;
snps,axi-config = <&stmmac_axi_setup>;
snps,aal;
snps,fixed-burst;
snps,tso;
stmmac_axi_setup: stmmac-axi-config {
snps,blen = <0 0 0 0 16 8 4>;
snps,rd_osr_lmt = <2>;
snps,wr_osr_lmt = <2>;
};
};

View File

@@ -35,9 +35,13 @@ properties:
description: PHYs that implement IEEE802.3 clause 45
- pattern: "^ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$"
description:
If the PHY reports an incorrect ID (or none at all) then the
compatible list may contain an entry with the correct PHY ID
in the above form.
PHYs contain identification registers. These will be read to
identify the PHY. If the PHY reports an incorrect ID, or the
PHY requires a specific initialization sequence (like a
particular order of clocks, resets, power supplies), in
order to be able to read the ID registers, then the
compatible list must contain an entry with the correct PHY
ID in the above form.
The first group of digits is the 16 bit Phy Identifier 1
register, this is the chip vendor OUI bits 3:18. The
second group of digits is the Phy Identifier 2 register,

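For illustration only (this is not part of the binding), the two groups of digits map directly onto the PHY Identifier 1 and 2 registers; a hypothetical user-space sketch of composing the compatible string from those register values::

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          /* Example register values; 0007.0570 is the VSC8531 ID used elsewhere in this series. */
          uint16_t phy_id1 = 0x0007;  /* PHY Identifier 1: chip vendor OUI bits 3:18 */
          uint16_t phy_id2 = 0x0570;  /* PHY Identifier 2: model and revision        */
          char compatible[32];

          snprintf(compatible, sizeof(compatible),
                   "ethernet-phy-id%04x.%04x", phy_id1, phy_id2);
          printf("%s\n", compatible); /* prints ethernet-phy-id0007.0570 */
          return 0;
  }
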
View File

@@ -27,6 +27,7 @@ properties:
- const: fsl,enetc
- enum:
- pci1131,e101
- pci1131,e110
reg:
maxItems: 1

View File

@@ -1,83 +0,0 @@
Marvell 8897/8997 (sd8897/sd8997) bluetooth devices (SDIO or USB based)
------
The 8997 devices supports multiple interfaces. When used on SDIO interfaces,
the btmrvl driver is used and when used on USB interface, the btusb driver is
used.
Required properties:
- compatible : should be one of the following:
* "marvell,sd8897-bt" (for SDIO)
* "marvell,sd8997-bt" (for SDIO)
* "usb1286,204e" (for USB)
Optional properties:
- marvell,cal-data: Calibration data downloaded to the device during
initialization. This is an array of 28 values(u8).
This is only applicable to SDIO devices.
- marvell,wakeup-pin: It represents wakeup pin number of the bluetooth chip.
firmware will use the pin to wakeup host system (u16).
- marvell,wakeup-gap-ms: wakeup gap represents wakeup latency of the host
platform. The value will be configured to firmware. This
is needed to work chip's sleep feature as expected (u16).
- interrupt-names: Used only for USB based devices (See below)
- interrupts : specifies the interrupt pin number to the cpu. For SDIO, the
driver will use the first interrupt specified in the interrupt
array. For USB based devices, the driver will use the interrupt
named "wakeup" from the interrupt-names and interrupt arrays.
The driver will request an irq based on this interrupt number.
During system suspend, the irq will be enabled so that the
bluetooth chip can wakeup host platform under certain
conditions. During system resume, the irq will be disabled
to make sure unnecessary interrupt is not received.
Example:
IRQ pin 119 is used as system wakeup source interrupt.
wakeup pin 13 and gap 100ms are configured so that firmware can wakeup host
using this device side pin and wakeup latency.
Example for SDIO device follows (calibration data is also available in
below example).
&mmc3 {
vmmc-supply = <&wlan_en_reg>;
bus-width = <4>;
cap-power-off-card;
keep-power-in-suspend;
#address-cells = <1>;
#size-cells = <0>;
btmrvl: bluetooth@2 {
compatible = "marvell,sd8897-bt";
reg = <2>;
interrupt-parent = <&pio>;
interrupts = <119 IRQ_TYPE_LEVEL_LOW>;
marvell,cal-data = /bits/ 8 <
0x37 0x01 0x1c 0x00 0xff 0xff 0xff 0xff 0x01 0x7f 0x04 0x02
0x00 0x00 0xba 0xce 0xc0 0xc6 0x2d 0x00 0x00 0x00 0x00 0x00
0x00 0x00 0xf0 0x00>;
marvell,wakeup-pin = /bits/ 16 <0x0d>;
marvell,wakeup-gap-ms = /bits/ 16 <0x64>;
};
};
Example for USB device:
&usb_host1_ohci {
#address-cells = <1>;
#size-cells = <0>;
mvl_bt1: bt@1 {
compatible = "usb1286,204e";
reg = <1>;
interrupt-parent = <&gpio0>;
interrupt-names = "wakeup";
interrupts = <119 IRQ_TYPE_LEVEL_LOW>;
marvell,wakeup-pin = /bits/ 16 <0x0d>;
marvell,wakeup-gap-ms = /bits/ 16 <0x64>;
};
};

View File

@@ -112,7 +112,7 @@ properties:
mediatek,wed:
$ref: /schemas/types.yaml#/definitions/phandle-array
minItems: 2
minItems: 1
maxItems: 2
items:
maxItems: 1
@@ -249,6 +249,9 @@ allOf:
minItems: 1
maxItems: 1
mediatek,wed:
minItems: 2
mediatek,wed-pcie: false
else:
properties:
@@ -338,12 +341,13 @@ allOf:
- const: netsys0
- const: netsys1
mediatek,infracfg: false
mediatek,sgmiisys:
minItems: 2
maxItems: 2
mediatek,wed:
maxItems: 1
- if:
properties:
compatible:
@@ -385,6 +389,9 @@ allOf:
minItems: 2
maxItems: 2
mediatek,wed:
minItems: 2
- if:
properties:
compatible:
@@ -429,6 +436,19 @@ allOf:
- const: xgp2
- const: xgp3
mediatek,wed:
minItems: 2
- if:
properties:
compatible:
contains:
const: ralink,rt5350-eth
then:
properties:
mediatek,wed:
minItems: 2
patternProperties:
"^mac@[0-2]$":
type: object

View File

@@ -1,73 +0,0 @@
* Microsemi - vsc8531 Giga bit ethernet phy
Optional properties:
- vsc8531,vddmac : The vddmac in mV. Allowed values is listed
in the first row of Table 1 (below).
This property is only used in combination
with the 'edge-slowdown' property.
Default value is 3300.
- vsc8531,edge-slowdown : % the edge should be slowed down relative to
the fastest possible edge time.
Edge rate sets the drive strength of the MAC
interface output signals. Changing the
drive strength will affect the edge rate of
the output signal. The goal of this setting
is to help reduce electrical emission (EMI)
by being able to reprogram drive strength
and in effect slow down the edge rate if
desired.
To adjust the edge-slowdown, the 'vddmac'
must be specified. Table 1 lists the
supported edge-slowdown values for a given
'vddmac'.
Default value is 0%.
Ref: Table:1 - Edge rate change (below).
- vsc8531,led-[N]-mode : LED mode. Specify how the LED[N] should behave.
N depends on the number of LEDs supported by a
PHY.
Allowed values are defined in
"include/dt-bindings/net/mscc-phy-vsc8531.h".
Default values are VSC8531_LINK_1000_ACTIVITY (1),
VSC8531_LINK_100_ACTIVITY (2),
VSC8531_LINK_ACTIVITY (0) and
VSC8531_DUPLEX_COLLISION (8).
- load-save-gpios : GPIO used for the load/save operation of the PTP
hardware clock (PHC).
Table: 1 - Edge rate change
----------------------------------------------------------------|
| Edge Rate Change (VDDMAC) |
| |
| 3300 mV 2500 mV 1800 mV 1500 mV |
|---------------------------------------------------------------|
| 0% 0% 0% 0% |
| (Fastest) (recommended) (recommended) |
|---------------------------------------------------------------|
| 2% 3% 5% 6% |
|---------------------------------------------------------------|
| 4% 6% 9% 14% |
|---------------------------------------------------------------|
| 7% 10% 16% 21% |
|(recommended) (recommended) |
|---------------------------------------------------------------|
| 10% 14% 23% 29% |
|---------------------------------------------------------------|
| 17% 23% 35% 42% |
|---------------------------------------------------------------|
| 29% 37% 52% 58% |
|---------------------------------------------------------------|
| 53% 63% 76% 77% |
| (slowest) |
|---------------------------------------------------------------|
Example:
vsc8531_0: ethernet-phy@0 {
compatible = "ethernet-phy-id0007.0570";
vsc8531,vddmac = <3300>;
vsc8531,edge-slowdown = <7>;
vsc8531,led-0-mode = <VSC8531_LINK_1000_ACTIVITY>;
vsc8531,led-1-mode = <VSC8531_LINK_100_ACTIVITY>;
load-save-gpios = <&gpio 10 GPIO_ACTIVE_HIGH>;
};

View File

@@ -0,0 +1,131 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/mscc-phy-vsc8531.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Microsemi VSC8531 Gigabit Ethernet PHY
maintainers:
- Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
description:
The VSC8531 is a Gigabit Ethernet PHY with configurable MAC interface
drive strength and LED modes.
allOf:
- $ref: ethernet-phy.yaml#
select:
properties:
compatible:
contains:
enum:
- ethernet-phy-id0007.0570 # VSC8531
- ethernet-phy-id0007.0772 # VSC8541
required:
- compatible
properties:
compatible:
items:
- enum:
- ethernet-phy-id0007.0570 # VSC8531
- ethernet-phy-id0007.0772 # VSC8541
- const: ethernet-phy-ieee802.3-c22
vsc8531,vddmac:
$ref: /schemas/types.yaml#/definitions/uint32
description:
The VDDMAC voltage in millivolts. This property is used in combination
with the edge-slowdown property to control the drive strength of the
MAC interface output signals.
enum: [3300, 2500, 1800, 1500]
default: 3300
vsc8531,edge-slowdown:
$ref: /schemas/types.yaml#/definitions/uint32
description: >
Percentage by which the edge rate should be slowed down relative to
the fastest possible edge time. This setting helps reduce electromagnetic
interference (EMI) by adjusting the drive strength of the MAC interface
output signals. Valid values depend on the vddmac voltage setting
according to the edge rate change table in the datasheet.
- When vsc8531,vddmac = 3300 mV: allowed values are 0, 2, 4, 7, 10, 17, 29, and 53.
(Recommended: 7)
- When vsc8531,vddmac = 2500 mV: allowed values are 0, 3, 6, 10, 14, 23, 37, and 63.
(Recommended: 10)
- When vsc8531,vddmac = 1800 mV: allowed values are 0, 5, 9, 16, 23, 35, 52, and 76.
(Recommended: 0)
- When vsc8531,vddmac = 1500 mV: allowed values are 0, 6, 14, 21, 29, 42, 58, and 77.
(Recommended: 0)
enum: [0, 2, 3, 4, 5, 6, 7, 9, 10, 14, 16, 17, 21, 23, 29, 35, 37, 42, 52, 53, 58, 63, 76, 77]
default: 0
vsc8531,led-0-mode:
$ref: /schemas/types.yaml#/definitions/uint32
description: LED[0] behavior mode. See include/dt-bindings/net/mscc-phy-vsc8531.h
for available modes.
minimum: 0
maximum: 15
default: 1
vsc8531,led-1-mode:
$ref: /schemas/types.yaml#/definitions/uint32
description: LED[1] behavior mode. See include/dt-bindings/net/mscc-phy-vsc8531.h
for available modes.
minimum: 0
maximum: 15
default: 2
vsc8531,led-2-mode:
$ref: /schemas/types.yaml#/definitions/uint32
description: LED[2] behavior mode. See include/dt-bindings/net/mscc-phy-vsc8531.h
for available modes.
minimum: 0
maximum: 15
default: 0
vsc8531,led-3-mode:
$ref: /schemas/types.yaml#/definitions/uint32
description: LED[3] behavior mode. See include/dt-bindings/net/mscc-phy-vsc8531.h
for available modes.
minimum: 0
maximum: 15
default: 8
load-save-gpios:
description: GPIO phandle used for the load/save operation of the PTP hardware
clock (PHC).
maxItems: 1
dependencies:
vsc8531,edge-slowdown:
- vsc8531,vddmac
required:
- compatible
- reg
unevaluatedProperties: false
examples:
- |
#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/net/mscc-phy-vsc8531.h>
mdio {
#address-cells = <1>;
#size-cells = <0>;
ethernet-phy@0 {
compatible = "ethernet-phy-id0007.0772", "ethernet-phy-ieee802.3-c22";
reg = <0>;
vsc8531,vddmac = <3300>;
vsc8531,edge-slowdown = <7>;
vsc8531,led-0-mode = <VSC8531_LINK_1000_ACTIVITY>;
vsc8531,led-1-mode = <VSC8531_LINK_100_ACTIVITY>;
load-save-gpios = <&gpio 10 GPIO_ACTIVE_HIGH>;
};
};

View File

@@ -21,6 +21,7 @@ maintainers:
properties:
compatible:
enum:
- nxp,imx94-netc-blk-ctrl
- nxp,imx95-netc-blk-ctrl
reg:

View File

@@ -16,6 +16,7 @@ properties:
compatible:
enum:
- ti,tps23881
- ti,tps23881b
reg:
maxItems: 1

View File

@@ -73,6 +73,14 @@ properties:
dma-coherent: true
interconnects:
maxItems: 2
interconnect-names:
items:
- const: cpu-mac
- const: mac-mem
phys: true
phy-names:

View File

@@ -24,6 +24,7 @@ select:
- rockchip,rk3366-gmac
- rockchip,rk3368-gmac
- rockchip,rk3399-gmac
- rockchip,rk3506-gmac
- rockchip,rk3528-gmac
- rockchip,rk3568-gmac
- rockchip,rk3576-gmac
@@ -50,6 +51,7 @@ properties:
- rockchip,rv1108-gmac
- items:
- enum:
- rockchip,rk3506-gmac
- rockchip,rk3528-gmac
- rockchip,rk3568-gmac
- rockchip,rk3576-gmac
@@ -148,6 +150,7 @@ allOf:
compatible:
contains:
enum:
- rockchip,rk3506-gmac
- rockchip,rk3528-gmac
then:
properties:

View File

@@ -86,10 +86,14 @@ properties:
- rockchip,rk3328-gmac
- rockchip,rk3366-gmac
- rockchip,rk3368-gmac
- rockchip,rk3399-gmac
- rockchip,rk3506-gmac
- rockchip,rk3528-gmac
- rockchip,rk3568-gmac
- rockchip,rk3576-gmac
- rockchip,rk3588-gmac
- rockchip,rk3399-gmac
- rockchip,rv1108-gmac
- rockchip,rv1126-gmac
- snps,dwmac
- snps,dwmac-3.40a
- snps,dwmac-3.50a

View File

@@ -70,6 +70,25 @@ required:
allOf:
- $ref: snps,dwmac.yaml#
- if:
properties:
compatible:
contains:
const: sophgo,sg2042-dwmac
then:
properties:
phy-mode:
enum:
- rgmii-rxid
- rgmii-id
else:
properties:
phy-mode:
enum:
- rgmii
- rgmii-rxid
- rgmii-txid
- rgmii-id
unevaluatedProperties: false

View File

@@ -151,6 +151,12 @@ properties:
- ETSI
- JP
country:
$ref: /schemas/types.yaml#/definitions/string
pattern: '^[A-Z]{2}$'
description:
ISO 3166-1 alpha-2 country code for power limits
patternProperties:
"^txpower-[256]g$":
type: object
@@ -210,6 +216,66 @@ properties:
minItems: 13
maxItems: 13
paths-cck:
$ref: /schemas/types.yaml#/definitions/uint8-array
minItems: 4
maxItems: 4
description:
4 half-dBm backoff values (1 - 4 antennas, single spatial
stream)
paths-ofdm:
$ref: /schemas/types.yaml#/definitions/uint8-array
minItems: 4
maxItems: 4
description:
4 half-dBm backoff values (1 - 4 antennas, single spatial
stream)
paths-ofdm-bf:
$ref: /schemas/types.yaml#/definitions/uint8-array
minItems: 4
maxItems: 4
description:
4 half-dBm backoff values for beamforming
(1 - 4 antennas, single spatial stream)
paths-ru:
$ref: /schemas/types.yaml#/definitions/uint8-matrix
description:
Sets of half-dBm backoff values for 802.11ax rates for
1T1ss (aka 1 transmitting antenna with 1 spatial stream),
2T1ss, 3T1ss, 4T1ss, 2T2ss, 3T2ss, 4T2ss, 3T3ss, 4T3ss
and 4T4ss.
Each set starts with the number of channel bandwidth or
resource unit settings for which the rate set applies,
followed by 10 power limit values. The order of the
channel resource unit settings is RU26, RU52, RU106,
RU242/SU20, RU484/SU40, RU996/SU80 and RU2x996/SU160.
minItems: 1
maxItems: 7
items:
minItems: 11
maxItems: 11
paths-ru-bf:
$ref: /schemas/types.yaml#/definitions/uint8-matrix
description:
Sets of half-dBm backoff (beamforming) values for 802.11ax
rates for 1T1ss (aka 1 transmitting antenna with 1 spatial
stream), 2T1ss, 3T1ss, 4T1ss, 2T2ss, 3T2ss, 4T2ss, 3T3ss,
4T3ss and 4T4ss.
Each set starts with the number of channel bandwidth or
resource unit settings for which the rate set applies,
followed by 10 power limit values. The order of the
channel resource unit settings is RU26, RU52, RU106,
RU242/SU20, RU484/SU40, RU996/SU80 and RU2x996/SU160.
minItems: 1
maxItems: 7
items:
minItems: 11
maxItems: 11
txs-delta:
$ref: /schemas/types.yaml#/definitions/uint32-array
description:

View File

@@ -20,7 +20,7 @@ patternProperties:
"^(keypad|m25p|max8952|max8997|max8998|mpmc),.*": true
"^(pciclass|pinctrl-single|#pinctrl-single|PowerPC),.*": true
"^(pl022|pxa-mmc|rcar_sound|rotary-encoder|s5m8767|sdhci),.*": true
"^(simple-audio-card|st-plgpio|st-spics|ts),.*": true
"^(simple-audio-card|st-plgpio|st-spics|ts|vsc8531),.*": true
"^pool[0-3],.*": true
# Keep list in alphabetical order.

View File

@@ -198,26 +198,28 @@ be requested with the same attribute with ``DPLL_CMD_DEVICE_SET`` command.
================================== ======================================
Device may also provide ability to adjust a signal phase on a pin.
If pin phase adjustment is supported, minimal and maximal values that pin
handle shall be provide to the user on ``DPLL_CMD_PIN_GET`` respond
with ``DPLL_A_PIN_PHASE_ADJUST_MIN`` and ``DPLL_A_PIN_PHASE_ADJUST_MAX``
If pin phase adjustment is supported, the minimal and maximal values and
the granularity for the pin handle shall be provided to the user in the
``DPLL_CMD_PIN_GET`` response with ``DPLL_A_PIN_PHASE_ADJUST_MIN``,
``DPLL_A_PIN_PHASE_ADJUST_MAX`` and ``DPLL_A_PIN_PHASE_ADJUST_GRAN``
attributes. Configured phase adjust value is provided with
``DPLL_A_PIN_PHASE_ADJUST`` attribute of a pin, and value change can be
requested with the same attribute with ``DPLL_CMD_PIN_SET`` command.
=============================== ======================================
``DPLL_A_PIN_ID`` configured pin id
``DPLL_A_PIN_PHASE_ADJUST_MIN`` attr minimum value of phase adjustment
``DPLL_A_PIN_PHASE_ADJUST_MAX`` attr maximum value of phase adjustment
``DPLL_A_PIN_PHASE_ADJUST`` attr configured value of phase
adjustment on parent dpll device
``DPLL_A_PIN_PARENT_DEVICE`` nested attribute for requesting
configuration on given parent dpll
device
``DPLL_A_PIN_PARENT_ID`` parent dpll device id
``DPLL_A_PIN_PHASE_OFFSET`` attr measured phase difference
between a pin and parent dpll device
=============================== ======================================
================================ ==========================================
``DPLL_A_PIN_ID`` configured pin id
``DPLL_A_PIN_PHASE_ADJUST_GRAN`` attr granularity of phase adjustment value
``DPLL_A_PIN_PHASE_ADJUST_MIN`` attr minimum value of phase adjustment
``DPLL_A_PIN_PHASE_ADJUST_MAX`` attr maximum value of phase adjustment
``DPLL_A_PIN_PHASE_ADJUST`` attr configured value of phase
adjustment on parent dpll device
``DPLL_A_PIN_PARENT_DEVICE`` nested attribute for requesting
configuration on given parent dpll
device
``DPLL_A_PIN_PARENT_ID`` parent dpll device id
``DPLL_A_PIN_PHASE_OFFSET`` attr measured phase difference
between a pin and parent dpll device
================================ ==========================================
All phase-related values are provided in picoseconds, representing the
time difference between signal phases. The negative value means that
@@ -384,6 +386,8 @@ according to attribute purpose.
frequencies
``DPLL_A_PIN_ANY_FREQUENCY_MIN`` attr minimum value of frequency
``DPLL_A_PIN_ANY_FREQUENCY_MAX`` attr maximum value of frequency
``DPLL_A_PIN_PHASE_ADJUST_GRAN`` attr granularity of phase
adjustment value
``DPLL_A_PIN_PHASE_ADJUST_MIN`` attr minimum value of phase
adjustment
``DPLL_A_PIN_PHASE_ADJUST_MAX`` attr maximum value of phase

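As a rough sketch of how the new granularity attribute might be consumed (all names and numbers below are hypothetical, not part of the DPLL netlink API), a user could clamp a requested phase adjustment to the advertised range and align it to the granularity before sending ``DPLL_CMD_PIN_SET``::

  #include <stdint.h>
  #include <stdio.h>

  /* All values are in picoseconds, matching the DPLL documentation above. */
  static int64_t fit_phase_adjust(int64_t req, int64_t min, int64_t max, int64_t gran)
  {
          if (req < min)
                  req = min;
          if (req > max)
                  req = max;
          /* round toward zero to a multiple of the reported granularity */
          return (req / gran) * gran;
  }

  int main(void)
  {
          /* e.g. a pin reporting min -500000, max 500000, granularity 10 */
          printf("%lld\n", (long long)fit_phase_adjust(123456, -500000, 500000, 10));
          return 0;
  }
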
View File

@@ -297,6 +297,8 @@ The ``i_flags`` field is a combination of these values:
- Inode has inline data (EXT4_INLINE_DATA_FL).
* - 0x20000000
- Create children with the same project ID (EXT4_PROJINHERIT_FL).
* - 0x40000000
- Use case-insensitive lookups for directory contents (EXT4_CASEFOLD_FL).
* - 0x80000000
- Reserved for ext4 library (EXT4_RESERVED_FL).
* -

View File

@@ -671,7 +671,9 @@ following:
* - 0x8000
- Data in inode (INCOMPAT_INLINE_DATA).
* - 0x10000
- Encrypted inodes are present on the filesystem. (INCOMPAT_ENCRYPT).
- Encrypted inodes can be present. (INCOMPAT_ENCRYPT).
* - 0x20000
- Directories can be marked case-insensitive. (INCOMPAT_CASEFOLD).
.. _super_rocompat:

View File

@@ -4,6 +4,9 @@
Global File System 2
====================
Overview
========
GFS2 is a cluster file system. It allows a cluster of computers to
simultaneously use a block device that is shared between them (with FC,
iSCSI, NBD, etc). GFS2 reads and writes to the block device like a local
@@ -50,3 +53,12 @@ The following man pages are available from gfs2-utils:
gfs2_convert to convert a gfs filesystem to GFS2 in-place
mkfs.gfs2 to make a filesystem
============ =============================================
Implementation Notes
====================
.. toctree::
:maxdepth: 1
glocks
uevents

View File

@@ -89,9 +89,7 @@ Documentation for filesystem implementations.
ext3
ext4/index
f2fs
gfs2
gfs2-uevents
gfs2-glocks
gfs2/index
hfs
hfsplus
hpfs

View File

@@ -105,10 +105,8 @@ occur; this capability aids both strategies.
TLDR; Show Me the Code!
-----------------------
Code is posted to the kernel.org git trees as follows:
`kernel changes <https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-symlink>`_,
`userspace changes <https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=scrub-media-scan-service>`_, and
`QA test changes <https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=repair-dirs>`_.
Kernel and userspace code has been fully merged as of October 2025.
Each kernel patchset adding an online repair function will use the same branch
name across the kernel, xfsprogs, and fstests git repos.
@@ -764,12 +762,8 @@ allow the online fsck developers to compare online fsck against offline fsck,
and they enable XFS developers to find deficiencies in the code base.
Proposed patchsets include
`general fuzzer improvements
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=fuzzer-improvements>`_,
`fuzzing baselines
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=fuzz-baseline>`_,
and `improvements in fuzz testing comprehensiveness
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=more-fuzz-testing>`_.
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=fuzz-baseline>`_.
Stress Testing
--------------
@@ -801,11 +795,6 @@ Success is defined by the ability to run all of these tests without observing
any unexpected filesystem shutdowns due to corrupted metadata, kernel hang
check warnings, or any other sort of mischief.
Proposed patchsets include `general stress testing
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=race-scrub-and-mount-state-changes>`_
and the `evolution of existing per-function stress testing
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=refactor-scrub-stress>`_.
4. User Interface
=================
@@ -886,10 +875,6 @@ apply as nice of a priority to IO and CPU scheduling as possible.
This measure was taken to minimize delays in the rest of the filesystem.
No such hardening has been performed for the cron job.
Proposed patchset:
`Enabling the xfs_scrub background service
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=scrub-media-scan-service>`_.
Health Reporting
----------------
@@ -912,13 +897,6 @@ notifications and initiate a repair?
*Answer*: These questions remain unanswered, but should be a part of the
conversation with early adopters and potential downstream users of XFS.
Proposed patchsets include
`wiring up health reports to correction returns
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=corruption-health-reports>`_
and
`preservation of sickness info during memory reclaim
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=indirect-health-reporting>`_.
5. Kernel Algorithms and Data Structures
========================================
@@ -1310,21 +1288,6 @@ Space allocation records are cross-referenced as follows:
are there the same number of reverse mapping records for each block as the
reference count record claims?
Proposed patchsets are the series to find gaps in
`refcount btree
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-detect-refcount-gaps>`_,
`inode btree
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-detect-inobt-gaps>`_, and
`rmap btree
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-detect-rmapbt-gaps>`_ records;
to find
`mergeable records
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-detect-mergeable-records>`_;
and to
`improve cross referencing with rmap
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-strengthen-rmap-checking>`_
before starting a repair.
Checking Extended Attributes
````````````````````````````
@@ -1756,10 +1719,6 @@ For scrub, the drain works as follows:
To avoid polling in step 4, the drain provides a waitqueue for scrub threads to
be woken up whenever the intent count drops to zero.
The proposed patchset is the
`scrub intent drain series
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-drain-intents>`_.
.. _jump_labels:
Static Keys (aka Jump Label Patching)
@@ -2036,10 +1995,6 @@ The ``xfarray_store_anywhere`` function is used to insert a record in any
null record slot in the bag; and the ``xfarray_unset`` function removes a
record from the bag.
The proposed patchset is the
`big in-memory array
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=big-array>`_.
Iterating Array Elements
^^^^^^^^^^^^^^^^^^^^^^^^
@@ -2172,10 +2127,6 @@ However, it should be noted that these repair functions only use blob storage
to cache a small number of entries before adding them to a temporary ondisk
file, which is why compaction is not required.
The proposed patchset is at the start of the
`extended attribute repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-xattrs>`_ series.
.. _xfbtree:
In-Memory B+Trees
@@ -2214,11 +2165,6 @@ xfiles enables reuse of the entire btree library.
Btrees built atop an xfile are collectively known as ``xfbtrees``.
The next few sections describe how they actually work.
The proposed patchset is the
`in-memory btree
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=in-memory-btrees>`_
series.
Using xfiles as a Buffer Cache Target
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -2459,14 +2405,6 @@ This enables the log to release the old EFI to keep the log moving forwards.
EFIs have a role to play during the commit and reaping phases; please see the
next section and the section about :ref:`reaping<reaping>` for more details.
Proposed patchsets are the
`bitmap rework
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-bitmap-rework>`_
and the
`preparation for bulk loading btrees
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-prep-for-bulk-loading>`_.
Writing the New Tree
````````````````````
@@ -2623,11 +2561,6 @@ The number of records for the inode btree is the number of xfarray records,
but the record count for the free inode btree has to be computed as inode chunk
records are stored in the xfarray.
The proposed patchset is the
`AG btree repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-ag-btrees>`_
series.
Case Study: Rebuilding the Space Reference Counts
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -2716,11 +2649,6 @@ Reverse mappings are added to the bag using ``xfarray_store_anywhere`` and
removed via ``xfarray_unset``.
Bag members are examined through ``xfarray_iter`` loops.
The proposed patchset is the
`AG btree repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-ag-btrees>`_
series.
Case Study: Rebuilding File Fork Mapping Indices
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -2757,11 +2685,6 @@ EXTENTS format instead of BMBT, which may require a conversion.
Third, the incore extent map must be reloaded carefully to avoid disturbing
any delayed allocation extents.
The proposed patchset is the
`file mapping repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-file-mappings>`_
series.
.. _reaping:
Reaping Old Metadata Blocks
@@ -2843,11 +2766,6 @@ blocks.
As stated earlier, online repair functions use very large transactions to
minimize the chances of this occurring.
The proposed patchset is the
`preparation for bulk loading btrees
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-prep-for-bulk-loading>`_
series.
Case Study: Reaping After a Regular Btree Repair
````````````````````````````````````````````````
@@ -2943,11 +2861,6 @@ When the walk is complete, the bitmap disunion operation ``(ag_owner_bitmap &
btrees.
These blocks can then be reaped using the methods outlined above.
The proposed patchset is the
`AG btree repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-ag-btrees>`_
series.
.. _rmap_reap:
Case Study: Reaping After Repairing Reverse Mapping Btrees
@@ -2972,11 +2885,6 @@ methods outlined above.
The rest of the process of rebuilding the reverse mapping btree is discussed
in a separate :ref:`case study<rmap_repair>`.
The proposed patchset is the
`AG btree repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-ag-btrees>`_
series.
Case Study: Rebuilding the AGFL
```````````````````````````````
@@ -3024,11 +2932,6 @@ more complicated, because computing the correct value requires traversing the
forks, or if that fails, leaving the fields invalid and waiting for the fork
fsck functions to run.
The proposed patchset is the
`inode
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-inodes>`_
repair series.
Quota Record Repairs
--------------------
@@ -3045,11 +2948,6 @@ checking are obviously bad limits and timer values.
Quota usage counters are checked, repaired, and discussed separately in the
section about :ref:`live quotacheck <quotacheck>`.
The proposed patchset is the
`quota
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-quota>`_
repair series.
.. _fscounters:
Freezing to Fix Summary Counters
@@ -3145,11 +3043,6 @@ long enough to check and correct the summary counters.
| This bug was fixed in Linux 5.17. |
+--------------------------------------------------------------------------+
The proposed patchset is the
`summary counter cleanup
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-fscounters>`_
series.
Full Filesystem Scans
---------------------
@@ -3277,15 +3170,6 @@ Second, if the incore inode is stuck in some intermediate state, the scan
coordinator must release the AGI and push the main filesystem to get the inode
back into a loadable state.
The proposed patches are the
`inode scanner
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-iscan>`_
series.
The first user of the new functionality is the
`online quotacheck
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-quotacheck>`_
series.
Inode Management
````````````````
@@ -3381,12 +3265,6 @@ To capture these nuances, the online fsck code has a separate ``xchk_irele``
function to set or clear the ``DONTCACHE`` flag to get the required release
behavior.
Proposed patchsets include fixing
`scrub iget usage
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-iget-fixes>`_ and
`dir iget usage
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-dir-iget-fixes>`_.
.. _ilocking:
Locking Inodes
@@ -3443,11 +3321,6 @@ If the dotdot entry changes while the directory is unlocked, then a move or
rename operation must have changed the child's parentage, and the scan can
exit early.
The proposed patchset is the
`directory repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-dirs>`_
series.
.. _fshooks:
Filesystem Hooks
@@ -3594,11 +3467,6 @@ The inode scan APIs are pretty simple:
- ``xchk_iscan_teardown`` to finish the scan
This functionality is also a part of the
`inode scanner
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-iscan>`_
series.
.. _quotacheck:
Case Study: Quota Counter Checking
@@ -3686,11 +3554,6 @@ needing to hold any locks for a long duration.
If repairs are desired, the real and shadow dquots are locked and their
resource counts are set to the values in the shadow dquot.
The proposed patchset is the
`online quotacheck
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-quotacheck>`_
series.
.. _nlinks:
Case Study: File Link Count Checking
@@ -3744,11 +3607,6 @@ shadow information.
If no parents are found, the file must be :ref:`reparented <orphanage>` to the
orphanage to prevent the file from being lost forever.
The proposed patchset is the
`file link count repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-nlinks>`_
series.
.. _rmap_repair:
Case Study: Rebuilding Reverse Mapping Records
@@ -3828,11 +3686,6 @@ scan for reverse mapping records.
12. Free the xfbtree now that it is not needed.
The proposed patchset is the
`rmap repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-rmap-btree>`_
series.
Staging Repairs with Temporary Files on Disk
--------------------------------------------
@@ -3971,11 +3824,6 @@ Once a good copy of a data file has been constructed in a temporary file, it
must be conveyed to the file being repaired, which is the topic of the next
section.
The proposed patches are in the
`repair temporary files
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-tempfiles>`_
series.
Logged File Content Exchanges
-----------------------------
@@ -4025,11 +3873,6 @@ The new ``XFS_SB_FEAT_INCOMPAT_EXCHRANGE`` incompatible feature flag
in the superblock protects these new log item records from being replayed on
old kernels.
The proposed patchset is the
`file contents exchange
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=atomic-file-updates>`_
series.
+--------------------------------------------------------------------------+
| **Sidebar: Using Log-Incompatible Feature Flags** |
+--------------------------------------------------------------------------+
@@ -4323,11 +4166,6 @@ To repair the summary file, write the xfile contents into the temporary file
and use atomic mapping exchange to commit the new contents.
The temporary file is then reaped.
The proposed patchset is the
`realtime summary repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-rtsummary>`_
series.
Case Study: Salvaging Extended Attributes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -4369,11 +4207,6 @@ Salvaging extended attributes is done as follows:
4. Reap the temporary file.
The proposed patchset is the
`extended attribute repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-xattrs>`_
series.
Fixing Directories
------------------
@@ -4448,11 +4281,6 @@ Unfortunately, the current dentry cache design doesn't provide a means to walk
every child dentry of a specific directory, which makes this a hard problem.
There is no known solution.
The proposed patchset is the
`directory repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-dirs>`_
series.
Parent Pointers
```````````````
@@ -4612,11 +4440,6 @@ a :ref:`directory entry live update hook <liveupdate>` as follows:
7. Reap the temporary directory.
The proposed patchset is the
`parent pointers directory repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=pptrs-fsck>`_
series.
Case Study: Repairing Parent Pointers
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -4662,11 +4485,6 @@ directory reconstruction:
8. Reap the temporary file.
The proposed patchset is the
`parent pointers repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=pptrs-fsck>`_
series.
Digression: Offline Checking of Parent Pointers
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -4755,11 +4573,6 @@ connectivity checks:
4. Move on to examining link counts, as we do today.
The proposed patchset is the
`offline parent pointers repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=pptrs-fsck>`_
series.
Rebuilding directories from parent pointers in offline repair would be very
challenging because xfs_repair currently uses two single-pass scans of the
filesystem during phases 3 and 4 to decide which files are corrupt enough to be
@@ -4903,12 +4716,6 @@ Repairing the directory tree works as follows:
6. If the subdirectory has zero paths, attach it to the lost and found.
The proposed patches are in the
`directory tree repair
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=scrub-directory-tree>`_
series.
.. _orphanage:
The Orphanage
@@ -4973,11 +4780,6 @@ Orphaned files are adopted by the orphanage as follows:
7. If a runtime error happens, call ``xrep_adoption_cancel`` to release all
resources.
The proposed patches are in the
`orphanage adoption
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=repair-orphanage>`_
series.
6. Userspace Algorithms and Data Structures
===========================================
@@ -5091,14 +4893,6 @@ first workqueue's workers until the backlog eases.
This doesn't completely solve the balancing problem, but reduces it enough to
move on to more pressing issues.
The proposed patchsets are the scrub
`performance tweaks
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=scrub-performance-tweaks>`_
and the
`inode scan rebalance
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=scrub-iscan-rebalance>`_
series.
.. _scrubrepair:
Scheduling Repairs
@@ -5179,20 +4973,6 @@ immediately.
Corrupt file data blocks reported by phase 6 cannot be recovered by the
filesystem.
The proposed patchsets are the
`repair warning improvements
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=scrub-better-repair-warnings>`_,
refactoring of the
`repair data dependency
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=scrub-repair-data-deps>`_
and
`object tracking
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=scrub-object-tracking>`_,
and the
`repair scheduling
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=scrub-repair-scheduling>`_
improvement series.
Checking Names for Confusable Unicode Sequences
-----------------------------------------------
@@ -5372,6 +5152,8 @@ The extra flexibility enables several new use cases:
This emulates an atomic device write in software, and can support arbitrary
scattered writes.
(This functionality was merged into mainline as of 2025)
Vectorized Scrub
----------------
@@ -5393,13 +5175,7 @@ It is hoped that ``io_uring`` will pick up enough of this functionality that
online fsck can use that instead of adding a separate vectored scrub system
call to XFS.
The relevant patchsets are the
`kernel vectorized scrub
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=vectorized-scrub>`_
and
`userspace vectorized scrub
<https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=vectorized-scrub>`_
series.
(This functionality was merged into mainline as of 2025)
Quality of Service Targets for Scrub
------------------------------------

View File

@@ -227,7 +227,7 @@ properties:
Optional format indicator that is intended only for choosing
the right formatting mechanism when displaying values of this
type.
enum: [ hex, mac, fddi, ipv4, ipv6, uuid ]
enum: [ hex, mac, fddi, ipv4, ipv6, ipv4-or-v6, uuid ]
# Start genetlink-c
name-prefix:
type: string

View File

@@ -185,7 +185,7 @@ properties:
Optional format indicator that is intended only for choosing
the right formatting mechanism when displaying values of this
type.
enum: [ hex, mac, fddi, ipv4, ipv6, uuid ]
enum: [ hex, mac, fddi, ipv4, ipv6, ipv4-or-v6, uuid ]
# Make sure name-prefix does not appear in subsets (subsets inherit naming)
dependencies:

View File

@@ -157,7 +157,7 @@ properties:
Optional format indicator that is intended only for choosing
the right formatting mechanism when displaying values of this
type.
enum: [ hex, mac, fddi, ipv4, ipv6, uuid ]
enum: [ hex, mac, fddi, ipv4, ipv6, ipv4-or-v6, uuid ]
struct:
description: Name of the nested struct type.
type: string

View File

@@ -457,7 +457,7 @@ attribute-sets:
name: labels
type: binary
-
name: labels mask
name: labels-mask
type: binary
-
name: synproxy

View File

@@ -99,6 +99,8 @@ definitions:
name: legacy
-
name: switchdev
-
name: switchdev-inactive
-
type: enum
name: eswitch-inline-mode
@@ -857,6 +859,14 @@ attribute-sets:
name: health-reporter-burst-period
type: u64
doc: Time (in msec) for recoveries before starting the grace period.
# TODO: fill in the attributes in between
-
name: param-reset-default
type: flag
doc: Request restoring parameter to its default value.
value: 183
-
name: dl-dev-stats
subset-of: devlink
@@ -1791,6 +1801,7 @@ operations:
- param-type
# param-value-data is missing here as the type is variable
- param-value-cmode
- param-reset-default
-
name: region-get

View File

@@ -440,6 +440,12 @@ attribute-sets:
doc: |
Capable pin provides list of pins that can be bound to create a
reference-sync pin pair.
-
name: phase-adjust-gran
type: u32
doc: |
Granularity of phase adjustment, in picoseconds. The value of
phase adjustment must be a multiple of this granularity.
-
name: pin-parent-device
@@ -616,6 +622,7 @@ operations:
- capabilities
- parent-device
- parent-pin
- phase-adjust-gran
- phase-adjust-min
- phase-adjust-max
- phase-adjust

View File

@@ -1269,7 +1269,7 @@ attribute-sets:
-
name: hist
type: nest
multi-attr: True
multi-attr: true
nested-attributes: fec-hist
-
name: fec
@@ -1823,6 +1823,73 @@ attribute-sets:
type: uint
enum: pse-event
doc: List of events reported by the PSE controller
-
name: mse-capabilities
doc: MSE capabilities attribute set
attr-cnt-name: --ethtool-a-mse-capabilities-cnt
attributes:
-
name: max-average-mse
type: uint
-
name: max-peak-mse
type: uint
-
name: refresh-rate-ps
type: uint
-
name: num-symbols
type: uint
-
name: mse-snapshot
doc: MSE snapshot attribute set
attr-cnt-name: --ethtool-a-mse-snapshot-cnt
attributes:
-
name: average-mse
type: uint
-
name: peak-mse
type: uint
-
name: worst-peak-mse
type: uint
-
name: mse
attr-cnt-name: --ethtool-a-mse-cnt
attributes:
-
name: header
type: nest
nested-attributes: header
-
name: capabilities
type: nest
nested-attributes: mse-capabilities
-
name: channel-a
type: nest
nested-attributes: mse-snapshot
-
name: channel-b
type: nest
nested-attributes: mse-snapshot
-
name: channel-c
type: nest
nested-attributes: mse-snapshot
-
name: channel-d
type: nest
nested-attributes: mse-snapshot
-
name: worst-channel
type: nest
nested-attributes: mse-snapshot
-
name: link
type: nest
nested-attributes: mse-snapshot
operations:
enum-model: directional
@@ -2756,6 +2823,25 @@ operations:
attributes:
- header
- context
-
name: mse-get
doc: Get PHY MSE measurement data and capabilities.
attribute-set: mse
do: &mse-get-op
request:
attributes:
- header
reply:
attributes:
- header
- capabilities
- channel-a
- channel-b
- channel-c
- channel-d
- worst-channel
- link
dump: *mse-get-op
mcast-groups:
list:

View File

@@ -88,7 +88,7 @@ definitions:
-
name: napi-threaded
type: enum
entries: [disabled, enabled]
entries: [disabled, enabled, busy-poll]
attribute-sets:
-
@@ -291,7 +291,8 @@ attribute-sets:
name: threaded
doc: Whether the NAPI is configured to operate in threaded polling
mode. If this is set to enabled then the NAPI context operates
in threaded polling mode.
in threaded polling mode. If this is set to busy-poll, then the
threaded polling mode also busy polls.
type: u32
enum: napi-threaded
-
@@ -732,6 +733,29 @@ operations:
- rx-bytes
- tx-packets
- tx-bytes
- rx-alloc-fail
- rx-hw-drops
- rx-hw-drop-overruns
- rx-csum-complete
- rx-csum-unnecessary
- rx-csum-none
- rx-csum-bad
- rx-hw-gro-packets
- rx-hw-gro-bytes
- rx-hw-gro-wire-packets
- rx-hw-gro-wire-bytes
- rx-hw-drop-ratelimits
- tx-hw-drops
- tx-hw-drop-errors
- tx-csum-none
- tx-needs-csum
- tx-hw-gso-packets
- tx-hw-gso-bytes
- tx-hw-gso-wire-packets
- tx-hw-gso-wire-bytes
- tx-hw-drop-ratelimits
- tx-stop
- tx-wake
-
name: bind-rx
doc: Bind dmabuf to netdev

View File

@@ -915,7 +915,7 @@ attribute-sets:
type: string
doc: Name of set to use
-
name: set id
name: set-id
type: u32
byte-order: big-endian
doc: ID of set to use

View File

@@ -76,6 +76,83 @@ attribute-sets:
name: spi
doc: Security Parameters Index (SPI) of the association.
type: u32
-
name: stats
attributes:
-
name: dev-id
doc: PSP device ID.
type: u32
checks:
min: 1
-
name: key-rotations
type: uint
doc: |
Number of key rotations during the lifetime of the device.
Kernel statistic.
-
name: stale-events
type: uint
doc: |
Number of times a socket's Rx got shut down due to using
a key which went stale (fully rotated out).
Kernel statistic.
-
name: rx-packets
type: uint
doc: |
Number of successfully processed and authenticated PSP packets.
Device statistic (from the PSP spec).
-
name: rx-bytes
type: uint
doc: |
Number of successfully authenticated PSP bytes received, counting from
the first byte after the IV through the last byte of payload.
The fixed initial portion of the PSP header (16 bytes)
and the PSP trailer/ICV (16 bytes) are not included in this count.
Device statistic (from the PSP spec).
-
name: rx-auth-fail
type: uint
doc: |
Number of received PSP packets with unsuccessful authentication.
Device statistic (from the PSP spec).
-
name: rx-error
type: uint
doc: |
Number of received PSP packets with length/framing errors.
Device statistic (from the PSP spec).
-
name: rx-bad
type: uint
doc: |
Number of received PSP packets with miscellaneous errors
(invalid master key indicated by SPI, unsupported version, etc.)
Device statistic (from the PSP spec).
-
name: tx-packets
type: uint
doc: |
Number of successfully processed PSP packets for transmission.
Device statistic (from the PSP spec).
-
name: tx-bytes
type: uint
doc: |
Number of successfully processed PSP bytes for transmit, counting from
the first byte after the IV through the last byte of payload.
The fixed initial portion of the PSP header (16 bytes)
and the PSP trailer/ICV (16 bytes) are not included in this count.
Device statistic (from the PSP spec).
-
name: tx-error
type: uint
doc: |
Number of PSP packets for transmission with errors.
Device statistic (from the PSP spec).
operations:
list:
@@ -177,6 +254,24 @@ operations:
pre: psp-assoc-device-get-locked
post: psp-device-unlock
-
name: get-stats
doc: Get device statistics.
attribute-set: stats
do:
request:
attributes:
- dev-id
reply: &stats-all
attributes:
- dev-id
- key-rotations
- stale-events
pre: psp-device-get-locked
post: psp-device-unlock
dump:
reply: *stats-all
mcast-groups:
list:
-

View File

@@ -86,17 +86,18 @@ attribute-sets:
-
name: address
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: local
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: label
type: string
-
name: broadcast
type: binary
type: u32
byte-order: big-endian
display-hint: ipv4
-
name: anycast

View File

@@ -1707,11 +1707,11 @@ attribute-sets:
-
name: local
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: remote
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: ttl
type: u8
@@ -1833,11 +1833,11 @@ attribute-sets:
-
name: local
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: remote
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: fwmark
type: u32
@@ -1868,7 +1868,8 @@ attribute-sets:
type: u32
-
name: remote
type: binary
type: u32
byte-order: big-endian
display-hint: ipv4
-
name: ttl
@@ -1913,6 +1914,35 @@ attribute-sets:
name: port-range
type: binary
struct: ifla-geneve-port-range
-
name: linkinfo-hsr-attrs
name-prefix: ifla-hsr-
attributes:
-
name: slave1
type: u32
-
name: slave2
type: u32
-
name: multicast-spec
type: u8
-
name: supervision-addr
type: binary
display-hint: mac
-
name: seq-nr
type: u16
-
name: version
type: u8
-
name: protocol
type: u8
-
name: interlink
type: u32
-
name: linkinfo-iptun-attrs
name-prefix: ifla-iptun-
@@ -1923,11 +1953,11 @@ attribute-sets:
-
name: local
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: remote
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: ttl
type: u8
@@ -1957,7 +1987,8 @@ attribute-sets:
display-hint: ipv6
-
name: 6rd-relay-prefix
type: binary
type: u32
byte-order: big-endian
display-hint: ipv4
-
name: 6rd-prefixlen
@@ -2299,6 +2330,9 @@ sub-messages:
-
value: geneve
attribute-set: linkinfo-geneve-attrs
-
value: hsr
attribute-set: linkinfo-hsr-attrs
-
value: ipip
attribute-set: linkinfo-iptun-attrs

View File

@@ -194,7 +194,7 @@ attribute-sets:
-
name: dst
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: lladdr
type: binary

View File

@@ -87,11 +87,11 @@ attribute-sets:
-
name: dst
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: src
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: iif
type: u32
@@ -101,14 +101,14 @@ attribute-sets:
-
name: gateway
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: priority
type: u32
-
name: prefsrc
type: binary
display-hint: ipv4
display-hint: ipv4-or-v6
-
name: metrics
type: nest

View File

@@ -96,10 +96,12 @@ attribute-sets:
attributes:
-
name: dst
type: u32
type: binary
display-hint: ipv4-or-v6
-
name: src
type: u32
type: binary
display-hint: ipv4-or-v6
-
name: iifname
type: string

View File

@@ -0,0 +1,298 @@
# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
---
name: wireguard
protocol: genetlink-legacy
doc: |
**Netlink protocol to control WireGuard network devices.**
The below enums and macros are for interfacing with WireGuard, using generic
netlink, with family ``WG_GENL_NAME`` and version ``WG_GENL_VERSION``. It
defines two commands: get and set. Note that while they share many common
attributes, these two commands actually accept a slightly different set of
inputs and outputs. These differences are noted under the individual
attributes.
c-family-name: wg-genl-name
c-version-name: wg-genl-version
max-by-define: true
definitions:
-
name-prefix: wg-
name: key-len
type: const
value: 32
-
name: --kernel-timespec
type: struct
header: linux/time_types.h
members:
-
name: sec
type: u64
doc: Number of seconds, since UNIX epoch.
-
name: nsec
type: u64
doc: Number of nanoseconds, after the second began.
-
name: wgdevice-flags
name-prefix: wgdevice-f-
enum-name: wgdevice-flag
type: flags
entries:
- replace-peers
-
name: wgpeer-flags
name-prefix: wgpeer-f-
enum-name: wgpeer-flag
type: flags
entries:
- remove-me
- replace-allowedips
- update-only
-
name: wgallowedip-flags
name-prefix: wgallowedip-f-
enum-name: wgallowedip-flag
type: flags
entries:
- remove-me
attribute-sets:
-
name: wgdevice
enum-name: wgdevice-attribute
name-prefix: wgdevice-a-
attr-cnt-name: --wgdevice-a-last
attributes:
-
name: unspec
type: unused
value: 0
-
name: ifindex
type: u32
-
name: ifname
type: string
checks:
max-len: 15
-
name: private-key
type: binary
doc: Set to all zeros to remove.
display-hint: hex
checks:
exact-len: wg-key-len
-
name: public-key
type: binary
display-hint: hex
checks:
exact-len: wg-key-len
-
name: flags
type: u32
doc: |
``0`` or ``WGDEVICE_F_REPLACE_PEERS`` if all current peers should be
removed prior to adding the list below.
enum: wgdevice-flags
-
name: listen-port
type: u16
doc: Set as ``0`` to choose randomly.
-
name: fwmark
type: u32
doc: Set as ``0`` to disable.
-
name: peers
type: indexed-array
sub-type: nest
nested-attributes: wgpeer
doc: |
The index/type parameter is unused on ``SET_DEVICE`` operations and is
zero on ``GET_DEVICE`` operations.
-
name: wgpeer
enum-name: wgpeer-attribute
name-prefix: wgpeer-a-
attr-cnt-name: --wgpeer-a-last
attributes:
-
name: unspec
type: unused
value: 0
-
name: public-key
type: binary
display-hint: hex
checks:
exact-len: wg-key-len
-
name: preshared-key
type: binary
doc: Set as all zeros to remove.
display-hint: hex
checks:
exact-len: wg-key-len
-
name: flags
type: u32
doc: |
``0`` and/or ``WGPEER_F_REMOVE_ME`` if the specified peer should not
exist at the end of the operation, rather than added/updated and/or
``WGPEER_F_REPLACE_ALLOWEDIPS`` if all current allowed IPs of this
peer should be removed prior to adding the list below and/or
``WGPEER_F_UPDATE_ONLY`` if the peer should only be set if it already
exists.
enum: wgpeer-flags
-
name: endpoint
type: binary
doc: struct sockaddr_in or struct sockaddr_in6
checks:
min-len: 16
-
name: persistent-keepalive-interval
type: u16
doc: Set as ``0`` to disable.
-
name: last-handshake-time
type: binary
struct: --kernel-timespec
checks:
exact-len: 16
-
name: rx-bytes
type: u64
-
name: tx-bytes
type: u64
-
name: allowedips
type: indexed-array
sub-type: nest
nested-attributes: wgallowedip
doc: |
The index/type parameter is unused on ``SET_DEVICE`` operations and is
zero on ``GET_DEVICE`` operations.
-
name: protocol-version
type: u32
doc: |
Should not be set or used at all by most users of this API, as the
most recent protocol will be used when this is unset. Otherwise,
must be set to ``1``.
-
name: wgallowedip
enum-name: wgallowedip-attribute
name-prefix: wgallowedip-a-
attr-cnt-name: --wgallowedip-a-last
attributes:
-
name: unspec
type: unused
value: 0
-
name: family
type: u16
doc: IP family, either ``AF_INET`` or ``AF_INET6``.
-
name: ipaddr
type: binary
doc: Either ``struct in_addr`` or ``struct in6_addr``.
display-hint: ipv4-or-v6
checks:
min-len: 4
-
name: cidr-mask
type: u8
-
name: flags
type: u32
doc: |
``WGALLOWEDIP_F_REMOVE_ME`` if the specified IP should be removed;
otherwise, this IP will be added if it is not already present.
enum: wgallowedip-flags
operations:
enum-name: wg-cmd
name-prefix: wg-cmd-
list:
-
name: get-device
value: 0
doc: |
Retrieve WireGuard device
~~~~~~~~~~~~~~~~~~~~~~~~~
The command should be called with one but not both of:
- ``WGDEVICE_A_IFINDEX``
- ``WGDEVICE_A_IFNAME``
The kernel will then return several messages (``NLM_F_MULTI``). It is
possible that all of the allowed IPs of a single peer will not fit
within a single netlink message. In that case, the same peer will be
written in the following message, except it will only contain
``WGPEER_A_PUBLIC_KEY`` and ``WGPEER_A_ALLOWEDIPS``. This may occur
several times in a row for the same peer. It is then up to the receiver
to coalesce adjacent peers. Likewise, it is possible that all peers will
not fit within a single message. So, subsequent peers will be sent in
following messages, except those will only contain ``WGDEVICE_A_IFNAME``
and ``WGDEVICE_A_PEERS``. It is then up to the receiver to coalesce
these messages to form the complete list of peers.
Since this is an ``NLM_F_DUMP`` command, the final message will always
be ``NLMSG_DONE``, even if an error occurs. However, this ``NLMSG_DONE``
message contains an integer error code. It is either zero or a negative
error code corresponding to the errno.
attribute-set: wgdevice
flags: [uns-admin-perm]
dump:
pre: wg-get-device-start
post: wg-get-device-done
request:
attributes:
- ifindex
- ifname
reply: &all-attrs
attributes:
- ifindex
- ifname
- private-key
- public-key
- flags
- listen-port
- fwmark
- peers
-
name: set-device
value: 1
doc: |
Set WireGuard device
~~~~~~~~~~~~~~~~~~~~
This command should be called with a wgdevice set, containing one but
not both of ``WGDEVICE_A_IFINDEX`` and ``WGDEVICE_A_IFNAME``.
It is possible that the amount of configuration data exceeds that of the
maximum message length accepted by the kernel. In that case, several
messages should be sent one after another, with each successive one
filling in information not contained in the prior. Note that if
``WGDEVICE_F_REPLACE_PEERS`` is specified in the first message, it
probably should not be specified in fragments that come after, so that
the list of peers is only cleared the first time but appended after.
Likewise for peers, if ``WGPEER_F_REPLACE_ALLOWEDIPS`` is specified in
the first message of a peer, it likely should not be specified in
subsequent fragments.
If an error occurs, ``NLMSG_ERROR`` will reply containing an errno.
attribute-set: wgdevice
flags: [uns-admin-perm]
do:
request: *all-attrs
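For reference, a spec like the one above can be exercised with the in-tree
``ynl`` CLI. The following is a minimal, hypothetical sketch (the device name
``wg0`` and the listen port are illustrative assumptions, not part of the spec
above) showing a ``get-device`` dump and a ``set-device`` request:

.. code-block:: bash

   # Dump the full configuration of an existing WireGuard device.
   # get-device is a dump operation, so use --dump.
   $ ynl --family wireguard --dump get-device --json='{"ifname": "wg0"}'

   # Change the listen port of the same device (port value is illustrative).
   $ ynl --family wireguard --do set-device \
         --json='{"ifname": "wg0", "listen-port": 51820}'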

View File

@@ -94,7 +94,7 @@ kernels may lead to a compilation error because the interface to a kernel
function has been changed in the 2.1.8x kernels.
How to turn on 6pack support:
=============================
-----------------------------
- In the linux kernel configuration program, select the code maturity level
options menu and turn on the prompting for development drivers.

View File

@@ -4,18 +4,20 @@
ARCnet Hardware
===============
:Author: Avery Pennarun <apenwarr@worldvisions.ca>
.. note::
1) This file is a supplement to arcnet.txt. Please read that for general
1) This file is a supplement to arcnet.rst. Please read that for general
driver configuration help.
2) This file is no longer Linux-specific. It should probably be moved out
of the kernel sources. Ideas?
Because so many people (myself included) seem to have obtained ARCnet cards
without manuals, this file contains a quick introduction to ARCnet hardware,
some cabling tips, and a listing of all jumper settings I can find. Please
e-mail apenwarr@worldvisions.ca with any settings for your particular card,
or any other information you have!
some cabling tips, and a listing of all jumper settings I can find. If you
have any settings for your particular card, or any other information, do not
hesitate to :ref:`email netdev <arcnet-netdev>`.
Introduction to ARCnet
@@ -72,11 +74,10 @@ level of encapsulation is defined by RFC1201, which I call "packet
splitting," that allows "virtual packets" to grow as large as 64K each,
although they are generally kept down to the Ethernet-style 1500 bytes.
For more information on the advantages and disadvantages (mostly the
advantages) of ARCnet networks, you might try the "ARCnet Trade Association"
WWW page:
For more information on ARCnet networks, visit the "ARCNET Resource Center"
WWW page at:
http://www.arcnet.com
https://www.arcnet.cc
Cabling ARCnet Networks
@@ -3226,9 +3227,6 @@ Settings for IRQ Selection (Lower Jumper Line)
Other Cards
===========
I have no information on other models of ARCnet cards at the moment. Please
send any and all info to:
apenwarr@worldvisions.ca
I have no information on other models of ARCnet cards at the moment.
Thanks.

View File

@@ -4,6 +4,8 @@
ARCnet
======
:Author: Avery Pennarun <apenwarr@worldvisions.ca>
.. note::
See also arcnet-hardware.txt in this directory for jumper-setting
@@ -30,18 +32,7 @@ Come on, be a sport! Send me a success report!
(hey, that was even better than my original poem... this is getting bad!)
.. warning::
If you don't e-mail me about your success/failure soon, I may be forced to
start SINGING. And we don't want that, do we?
(You know, it might be argued that I'm pushing this point a little too much.
If you think so, why not flame me in a quick little e-mail? Please also
include the type of card(s) you're using, software, size of network, and
whether it's working or not.)
My e-mail address is: apenwarr@worldvisions.ca
----
These are the ARCnet drivers for Linux.
@@ -59,23 +50,14 @@ ARCnet 2.10 ALPHA, Tomasz's all-new-and-improved RFC1051 support has been
included and seems to be working fine!
.. _arcnet-netdev:
Where do I discuss these drivers?
---------------------------------
Tomasz has been so kind as to set up a new and improved mailing list.
Subscribe by sending a message with the BODY "subscribe linux-arcnet YOUR
REAL NAME" to listserv@tichy.ch.uj.edu.pl. Then, to submit messages to the
list, mail to linux-arcnet@tichy.ch.uj.edu.pl.
There are archives of the mailing list at:
http://epistolary.org/mailman/listinfo.cgi/arcnet
The people on linux-net@vger.kernel.org (now defunct, replaced by
netdev@vger.kernel.org) have also been known to be very helpful, especially
when we're talking about ALPHA Linux kernels that may or may not work right
in the first place.
ARCnet discussions take place on netdev. Simply send your email to
netdev@vger.kernel.org and make sure to Cc: the maintainer listed under the
"ARCNET NETWORK LAYER" heading of Documentation/process/maintainers.rst.
Other Drivers and Info
----------------------
@@ -523,17 +505,9 @@ can set up your network then:
It works: what now?
-------------------
Send mail describing your setup, preferably including driver version, kernel
version, ARCnet card model, CPU type, number of systems on your network, and
list of software in use to me at the following address:
apenwarr@worldvisions.ca
I do send (sometimes automated) replies to all messages I receive. My email
can be weird (and also usually gets forwarded all over the place along the
way to me), so if you don't get a reply within a reasonable time, please
resend.
Send mail following :ref:`arcnet-netdev`. Describe your setup, preferably
including driver version, kernel version, ARCnet card model, CPU type, number
of systems on your network, and list of software in use.
It doesn't work: what now?
--------------------------

View File

@@ -28,6 +28,7 @@ these MAP frames and send them to appropriate PDN's.
================
a. MAP packet v1 (data / control)
---------------------------------
MAP header fields are in big endian format.
@@ -54,6 +55,7 @@ Payload length includes the padding length but does not include MAP header
length.
b. Map packet v4 (data / control)
---------------------------------
MAP header fields are in big endian format.
@@ -107,6 +109,7 @@ over which checksum is computed.
Checksum value, indicates the checksum computed.
c. MAP packet v5 (data / control)
---------------------------------
MAP header fields are in big endian format.
@@ -134,6 +137,7 @@ Payload length includes the padding length but does not include MAP header
length.
d. Checksum offload header v5
-----------------------------
Checksum offload header fields are in big endian format.
@@ -158,7 +162,10 @@ indicates that the calculated packet checksum is invalid.
Reserved bits must be zero when sent and ignored when received.
e. MAP packet v1/v5 (command specific)::
e. MAP packet v1/v5 (command specific)
--------------------------------------
Packet format::
Bit 0 1 2-7 8 - 15 16 - 31
Function Command Reserved Pad Multiplexer ID Payload length
@@ -181,6 +188,7 @@ Command types
= ==========================================
f. Aggregation
--------------
Aggregation is multiple MAP packets (can be data or command) delivered to
rmnet in a single linear skb. rmnet will process the individual

View File

@@ -47,6 +47,7 @@ Contents:
mellanox/mlx5/index
meta/fbnic
microsoft/netvsc
mucse/rnpgbe
neterion/s2io
netronome/nfp
pensando/ionic

View File

@@ -0,0 +1,17 @@
.. SPDX-License-Identifier: GPL-2.0
===========================================================
Linux Base Driver for MUCSE(R) Gigabit PCI Express Adapters
===========================================================
Contents
========
- Identifying Your Adapter
Identifying Your Adapter
========================
The driver is compatible with devices based on the following:
* MUCSE(R) Ethernet Controller N210 series
* MUCSE(R) Ethernet Controller N500 series

View File

@@ -39,6 +39,10 @@ The following is a list of E-Switch attributes.
rules.
* ``switchdev`` allows for more advanced offloading capabilities of
the E-Switch to hardware.
* ``switchdev_inactive`` is the same as switchdev mode but starts inactive and
does not allow traffic until explicitly activated. This mode is useful for
orchestrators that want to prepare the device in switchdev mode but only
activate it once all configuration is done.
* - ``inline-mode``
- enum
- Some HWs need the VF driver to put part of the packet
@@ -74,3 +78,12 @@ Example Usage
# enable encap-mode with legacy mode
$ devlink dev eswitch set pci/0000:08:00.0 mode legacy inline-mode none encap-mode basic
# start switchdev mode in inactive state
$ devlink dev eswitch set pci/0000:08:00.0 mode switchdev_inactive
# setup switchdev configurations, representors, FDB entries, etc..
...
# activate switchdev mode to allow traffic
$ devlink dev eswitch set pci/0000:08:00.0 mode switchdev

View File

@@ -41,6 +41,16 @@ In order for ``driverinit`` parameters to take effect, the driver must
support reloading via the ``devlink-reload`` command. This command will
request a reload of the device driver.
Default parameter values
=========================
Drivers may optionally export default values for parameters of cmode
``runtime`` and ``permanent``. For ``driverinit`` parameters, the last
value set by the driver will be used as the default value. Drivers can
also support resetting params with cmode ``runtime`` and ``permanent``
to their default values. Resetting ``driverinit`` params is supported
by devlink core without additional driver support needed.
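As a rough, non-authoritative illustration of how a reset could be requested
over the new ``param-reset-default`` flag attribute shown in the spec change
earlier, a ``ynl`` invocation might look like the following (the device
identifiers and parameter name are placeholders):

.. code-block:: bash

   # Hypothetical sketch: restore a parameter to its default value using
   # the new param-reset-default flag attribute. The bus-name/dev-name
   # values and the parameter name are placeholders; additional attributes
   # may be required depending on the final spec.
   $ ynl --family devlink --do param-set \
         --json='{"bus-name": "pci", "dev-name": "0000:08:00.0",
                  "param-name": "max_mac_per_vf",
                  "param-reset-default": true}'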
.. _devlink_params_generic:
Generic configuration parameters
@@ -151,3 +161,7 @@ own name.
* - ``num_doorbells``
- u32
- Controls the number of doorbells used by the device.
* - ``max_mac_per_vf``
- u32
- Controls the maximum number of MAC address filters that can be assigned
to a Virtual Function (VF).

View File

@@ -7,6 +7,40 @@ i40e devlink support
This document describes the devlink features implemented by the ``i40e``
device driver.
Parameters
==========
.. list-table:: Generic parameters implemented
:widths: 5 5 90
* - Name
- Mode
- Notes
* - ``max_mac_per_vf``
- runtime
- Controls the maximum number of MAC addresses a VF can use
on i40e devices.
By default (``0``), the driver enforces its internally calculated per-VF
MAC filter limit, which is based on the number of allocated VFs.
If set to a non-zero value, this parameter acts as a strict cap:
the driver will use the user-provided value instead of its internal
calculation.
**Important notes:**
- This value **must be set before enabling SR-IOV**.
Attempting to change it while SR-IOV is enabled will return an error.
- MAC filters are a **shared hardware resource** across all VFs.
Setting a high value may cause other VFs to be starved of filters.
- This value is an **administrative policy**. The hardware may return
errors when its absolute limit is reached, regardless of the value
set here.
The default value is ``0`` (internal calculation is used).
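For example (a sketch; the PCI address is a placeholder), the cap can be set
with the standard devlink CLI before SR-IOV is enabled:

.. code-block:: bash

   # Cap every VF to 8 MAC filters (placeholder PCI address).
   $ devlink dev param set pci/0000:02:00.0 \
         name max_mac_per_vf value 8 cmode runtime

   # Read the parameter back.
   $ devlink dev param show pci/0000:02:00.0 name max_mac_per_vf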
Info versions
=============

View File

@@ -99,5 +99,6 @@ parameters, info versions, and other features it supports.
prestera
qed
sfc
stmmac
ti-cpsw-switch
zl3073x

View File

@@ -218,6 +218,20 @@ parameters.
* ``balanced`` : Merges fewer CQEs, resulting in a moderate compression ratio but maintaining a balance between bandwidth savings and performance
* ``aggressive`` : Merges more CQEs into a single entry, achieving a higher compression rate and maximizing performance, particularly under high traffic loads
* - ``swp_l4_csum_mode``
- string
- permanent
- Configure how the L4 checksum is calculated by the device when using
Software Parser (SWP) hints for header locations.
* ``default`` : Use the device's default checksum calculation
mode. The driver will discover during init whether
full_csum or l4_only is in use. Setting this value explicitly
from userspace is not allowed, but some firmware versions may
return this value on param read.
* ``full_csum`` : Calculate full checksum including the pseudo-header
* ``l4_only`` : Calculate L4-only checksum, excluding the pseudo-header
The ``mlx5`` driver supports reloading via ``DEVLINK_CMD_RELOAD``
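A minimal sketch (placeholder PCI address) of selecting the mode with the
devlink CLI; since the parameter is ``permanent``, a device reload is needed
for it to take effect:

.. code-block:: bash

   # Select L4-only checksum calculation for SWP (placeholder address).
   $ devlink dev param set pci/0000:08:00.0 \
         name swp_l4_csum_mode value l4_only cmode permanent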
Info versions

View File

@@ -0,0 +1,40 @@
.. SPDX-License-Identifier: GPL-2.0
=======================================
stmmac (synopsys dwmac) devlink support
=======================================
This document describes the devlink features implemented by the ``stmmac``
device driver.
Parameters
==========
The ``stmmac`` driver implements the following driver-specific parameters.
.. list-table:: Driver-specific parameters implemented
:widths: 5 5 5 85
* - Name
- Type
- Mode
- Description
* - ``phc_coarse_adj``
- Boolean
- runtime
- Enable the Coarse timestamping mode, as defined in the DWMAC TRM.
A detailed explanation of this timestamping mode can be found in the
Socfpga Functional Description [1].
In Coarse mode, the ptp clock is expected to be fed by a high-precision
clock that is externally adjusted, and the subsecond increment used for
timestamping is set to 1/ptp_clock_rate.
In Fine mode (i.e. Coarse mode == false), the ptp clock frequency is
continuously adjusted, but the subsecond increment is set to
2/ptp_clock_rate.
Coarse mode is suitable for PTP Grand Master operation. If unsure, leave
the parameter set to False.
[1] https://www.intel.com/content/www/us/en/docs/programmable/683126/21-2/functional-description-of-the-emac.html
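For instance (a sketch; the devlink handle below is a placeholder for
whatever handle the stmmac instance registers), Coarse mode could be enabled
at runtime with:

.. code-block:: bash

   # Enable Coarse timestamping mode on a stmmac instance
   # (placeholder devlink handle).
   $ devlink dev param set platform/ff0c0000.ethernet \
         name phc_coarse_adj value true cmode runtime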

View File

@@ -1104,12 +1104,11 @@ health of the network and for discovery of other nodes.
In Linux, both HSR and PRP are implemented in the hsr driver, which
instantiates a virtual, stackable network interface with two member ports.
The driver only implements the basic roles of DANH (Doubly Attached Node
implementing HSR) and DANP (Doubly Attached Node implementing PRP); the roles
of RedBox and QuadBox are not implemented (therefore, bridging a hsr network
interface with a physical switch port does not produce the expected result).
implementing HSR), DANP (Doubly Attached Node implementing PRP) and RedBox
(allows non-HSR devices to connect to the ring via Interlink ports).
A driver which is able of offloading certain functions of a DANP or DANH should
declare the corresponding netdev features as indicated by the documentation at
A driver which is capable of offloading certain functions should declare the
corresponding netdev features as indicated by the documentation at
``Documentation/networking/netdev-features.rst``. Additionally, the following
methods must be implemented:
@@ -1120,6 +1119,14 @@ methods must be implemented:
- ``port_hsr_leave``: function invoked when a given switch port leaves a
DANP/DANH and returns to normal operation as a standalone port.
Note that the ``NETIF_F_HW_HSR_DUP`` feature relies on transmission towards
multiple ports, which is generally available whenever the tagging protocol uses
the ``dsa_xmit_port_mask()`` helper function. If the helper is used, the HSR
offload feature should also be set. The ``dsa_port_simple_hsr_join()`` and
``dsa_port_simple_hsr_leave()`` methods can be used as generic implementations
of ``port_hsr_join`` and ``port_hsr_leave``, if this is the only supported
offload feature.
TODO
====

View File

@@ -242,6 +242,7 @@ Userspace to kernel:
``ETHTOOL_MSG_RSS_SET`` set RSS settings
``ETHTOOL_MSG_RSS_CREATE_ACT`` create an additional RSS context
``ETHTOOL_MSG_RSS_DELETE_ACT`` delete an additional RSS context
``ETHTOOL_MSG_MSE_GET`` get MSE diagnostic data
===================================== =================================
Kernel to userspace:
@@ -299,6 +300,7 @@ Kernel to userspace:
``ETHTOOL_MSG_RSS_CREATE_ACT_REPLY`` create an additional RSS context
``ETHTOOL_MSG_RSS_CREATE_NTF`` additional RSS context created
``ETHTOOL_MSG_RSS_DELETE_NTF`` additional RSS context deleted
``ETHTOOL_MSG_MSE_GET_REPLY`` MSE diagnostic data
======================================== =================================
``GET`` requests are sent by userspace applications to retrieve device
@@ -2458,6 +2460,68 @@ Kernel response contents:
For a description of each attribute, see ``TSCONFIG_GET``.
MSE_GET
=======
Retrieves detailed Mean Square Error (MSE) diagnostic information from the PHY.
Request Contents:
==================================== ====== ============================
``ETHTOOL_A_MSE_HEADER`` nested request header
==================================== ====== ============================
Kernel Response Contents:
==================================== ====== ================================
``ETHTOOL_A_MSE_HEADER`` nested reply header
``ETHTOOL_A_MSE_CAPABILITIES`` nested capability/scale info for MSE
measurements
``ETHTOOL_A_MSE_CHANNEL_A`` nested snapshot for Channel A
``ETHTOOL_A_MSE_CHANNEL_B`` nested snapshot for Channel B
``ETHTOOL_A_MSE_CHANNEL_C`` nested snapshot for Channel C
``ETHTOOL_A_MSE_CHANNEL_D`` nested snapshot for Channel D
``ETHTOOL_A_MSE_WORST_CHANNEL`` nested snapshot for worst channel
``ETHTOOL_A_MSE_LINK`` nested snapshot for link-wide aggregate
==================================== ====== ================================
MSE Capabilities
----------------
This nested attribute reports the capability / scaling properties used to
interpret snapshot values.
============================================== ====== =========================
``ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE`` uint max avg_mse scale
``ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE`` uint max peak_mse scale
``ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS`` uint sample rate (picoseconds)
``ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS`` uint symbols per HW sample
============================================== ====== =========================
The max-average/peak fields are included only if the corresponding metric
is supported by the PHY. Their absence indicates that the metric is not
available.
See ``struct phy_mse_capability`` kernel documentation in
``include/linux/phy.h``.
MSE Snapshot
------------
Each per-channel nest contains an atomic snapshot of MSE values for that
selector (channel A/B/C/D, worst channel, or link).
========================================== ====== ===================
``ETHTOOL_A_MSE_SNAPSHOT_AVERAGE_MSE`` uint average MSE value
``ETHTOOL_A_MSE_SNAPSHOT_PEAK_MSE`` uint current peak MSE
``ETHTOOL_A_MSE_SNAPSHOT_WORST_PEAK_MSE`` uint worst-case peak MSE
========================================== ====== ===================
Within each channel nest, only the metrics supported by the PHY will be present.
See ``struct phy_mse_snapshot`` kernel documentation in
``include/linux/phy.h``.
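As an illustration (the interface name is a placeholder), the same data can be
fetched through the generic netlink spec with the ``ynl`` CLI:

.. code-block:: bash

   # Request MSE capabilities and per-channel snapshots for eth0
   # (placeholder interface name).
   $ ynl --family ethtool --do mse-get \
         --json='{"header": {"dev-name": "eth0"}}'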
Request translation
===================

View File

@@ -131,10 +131,7 @@ Contents:
vxlan
x25
x25-iface
xfrm_device
xfrm_proc
xfrm_sync
xfrm_sysctl
xfrm/index
xdp-rx-metadata
xsk-tx-metadata

View File

@@ -673,6 +673,16 @@ tcp_moderate_rcvbuf - BOOLEAN
Default: 1 (enabled)
tcp_rcvbuf_low_rtt - INTEGER
rcvbuf autotuning can overestimate the final socket rcvbuf, which
can lead to cache thrashing for high throughput flows.
For small RTT flows (below tcp_rcvbuf_low_rtt usecs), we can relax
rcvbuf growth: a few additional ms to reach the final (and smaller)
rcvbuf is a good tradeoff.
Default : 1000 (1 ms)
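For example (value chosen purely for illustration), the threshold can be
lowered so that only very-low-RTT flows use the relaxed growth:

.. code-block:: bash

   # Only flows with RTT below 500 usec use the relaxed rcvbuf growth
   # (illustrative value).
   $ sysctl -w net.ipv4.tcp_rcvbuf_low_rtt=500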
tcp_mtu_probing - INTEGER
Controls TCP Packetization-Layer Path MTU Discovery. Takes three
values:
@@ -854,9 +864,18 @@ tcp_sack - BOOLEAN
Default: 1 (enabled)
tcp_comp_sack_rtt_percent - INTEGER
Percentage of SRTT used for the compressed SACK feature.
See tcp_comp_sack_nr, tcp_comp_sack_delay_ns, tcp_comp_sack_slack_ns.
Possible values : 1 - 1000
Default : 33 %
tcp_comp_sack_delay_ns - LONG INTEGER
TCP tries to reduce number of SACK sent, using a timer
based on 5% of SRTT, capped by this sysctl, in nano seconds.
TCP tries to reduce the number of SACKs sent, using a timer based
on tcp_comp_sack_rtt_percent of SRTT, capped by this sysctl,
in nanoseconds.
The default is 1ms, based on TSO autosizing period.
Default : 1,000,000 ns (1 ms)
@@ -866,8 +885,9 @@ tcp_comp_sack_slack_ns - LONG INTEGER
timer used by SACK compression. This gives extra time
for small RTT flows, and reduces system overhead by allowing
opportunistic reduction of timer interrupts.
Too big values might reduce goodput.
Default : 100,000 ns (100 us)
Default : 10,000 ns (10 us)
tcp_comp_sack_nr - INTEGER
Max number of SACK that can be compressed.
@@ -1796,6 +1816,23 @@ icmp_errors_use_inbound_ifaddr - BOOLEAN
Default: 0 (disabled)
icmp_errors_extension_mask - UNSIGNED INTEGER
Bitmask of ICMP extensions to append to ICMPv4 error messages
("Destination Unreachable", "Time Exceeded" and "Parameter Problem").
The original datagram is trimmed / padded to 128 bytes in order to be
compatible with applications that do not comply with RFC 4884.
Possible extensions are:
==== ==============================================================
0x01 Incoming IP interface information according to RFC 5837.
Extension will include the index, IPv4 address (if present),
name and MTU of the IP interface that received the datagram
which elicited the ICMP error.
==== ==============================================================
Default: 0x00 (no extensions)
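For example, the RFC 5837 incoming-interface extension (bit 0x01) can be
enabled with:

.. code-block:: bash

   # Append incoming IP interface information to ICMPv4 error messages.
   $ sysctl -w net.ipv4.icmp_errors_extension_mask=1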
igmp_max_memberships - INTEGER
Change the maximum number of multicast groups we can subscribe to.
Default: 20
@@ -3262,6 +3299,23 @@ error_anycast_as_unicast - BOOLEAN
Default: 0 (disabled)
errors_extension_mask - UNSIGNED INTEGER
Bitmask of ICMP extensions to append to ICMPv6 error messages
("Destination Unreachable" and "Time Exceeded"). The original datagram
is trimmed / padded to 128 bytes in order to be compatible with
applications that do not comply with RFC 4884.
Possible extensions are:
==== ==============================================================
0x01 Incoming IP interface information according to RFC 5837.
Extension will include the index, IPv6 address (if present),
name and MTU of the IP interface that received the datagram
which elicited the ICMP error.
==== ==============================================================
Default: 0x00 (no extensions)
xfrm6_gc_thresh - INTEGER
(Obsolete since linux-4.14)
The threshold at which we will start garbage collecting for IPv6

View File

@@ -263,7 +263,9 @@ are not well known).
Busy polling is enabled by either setting ``SO_BUSY_POLL`` on
selected sockets or using the global ``net.core.busy_poll`` and
``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling
also exists.
also exists. Threaded polling of NAPI also has a mode to busy poll for
packets (:ref:`threaded busy polling<threaded_busy_poll>`) using the NAPI
processing kthread.
epoll-based busy polling
------------------------
@@ -426,6 +428,52 @@ Therefore, setting ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` is
the recommended usage, because otherwise setting ``irq-suspend-timeout``
might not have any discernible effect.
.. _threaded_busy_poll:
Threaded NAPI busy polling
--------------------------
Threaded NAPI busy polling extends threaded NAPI with support for continuously
busy polling the NAPI. This can be useful for forwarding or AF_XDP
applications.
Threaded NAPI busy polling can be enabled on a per-NIC-queue basis using Netlink.
For example, using the following script:
.. code-block:: bash
$ ynl --family netdev --do napi-set \
--json='{"id": 66, "threaded": "busy-poll"}'
The kernel will create a kthread that busy polls on this NAPI.
The user may elect to set the CPU affinity of this kthread to an unused CPU
core to improve how often the NAPI is polled at the expense of wasted CPU
cycles. Note that this will keep the CPU core busy with 100% usage.
Once threaded busy polling is enabled for a NAPI, the PID of the kthread can be
retrieved using Netlink so that the affinity of the kthread can be set up.
For example, the following script can be used to fetch the PID:
.. code-block:: bash
$ ynl --family netdev --do napi-get --json='{"id": 66}'
This will output something like the following; the pid `258` is the PID of the
kthread that is polling this NAPI.
.. code-block:: bash
$ {'defer-hard-irqs': 0,
'gro-flush-timeout': 0,
'id': 66,
'ifindex': 2,
'irq-suspend-timeout': 0,
'pid': 258,
'threaded': 'busy-poll'}
.. _threaded:
Threaded NAPI

View File

@@ -12,8 +12,8 @@ struct inet_sock icsk_inet read_mostly r
struct request_sock_queue icsk_accept_queue
struct inet_bind_bucket icsk_bind_hash read_mostly tcp_set_state
struct inet_bind2_bucket icsk_bind2_hash read_mostly tcp_set_state,inet_put_port
struct timer_list icsk_retransmit_timer read_write inet_csk_reset_xmit_timer,tcp_connect
struct timer_list icsk_delack_timer read_mostly inet_csk_reset_xmit_timer,tcp_connect
struct timer_list icsk_keepalive_timer
u32 icsk_rto read_write tcp_cwnd_validate,tcp_schedule_loss_probe,tcp_connect_init,tcp_connect,tcp_write_xmit,tcp_push_one
u32 icsk_rto_min
u32 icsk_rto_max read_mostly tcp_reset_xmit_timer

View File

@@ -5,42 +5,43 @@
inet_sock struct fast path usage breakdown
==========================================
======================= ===================== =================== =================== ======================================================================================================
Type Name fastpath_tx_access fastpath_rx_access comment
======================= ===================== =================== =================== ======================================================================================================
struct sock sk read_mostly read_mostly tcp_init_buffer_space,tcp_init_transfer,tcp_finish_connect,tcp_connect,tcp_send_rcvq,tcp_send_syn_data
struct ipv6_pinfo* pinet6
be16 inet_sport read_mostly __tcp_transmit_skb
be32 inet_daddr read_mostly ip_select_ident_segs
be32 inet_rcv_saddr
be16 inet_dport read_mostly __tcp_transmit_skb
u16 inet_num
be32 inet_saddr
s16 uc_ttl read_mostly __ip_queue_xmit/ip_select_ttl
u16 cmsg_flags
struct ip_options_rcu* inet_opt read_mostly __ip_queue_xmit
u16 inet_id read_mostly ip_select_ident_segs
u8 tos read_mostly ip_queue_xmit
u8 min_ttl
u8 mc_ttl
u8 pmtudisc
u8:1 recverr
u8:1 is_icsk
u8:1 freebind
u8:1 hdrincl
u8:1 mc_loop
u8:1 transparent
u8:1 mc_all
u8:1 nodefrag
u8:1 bind_address_no_port
u8:1 recverr_rfc4884
u8:1 defer_connect read_mostly tcp_sendmsg_fastopen
u8 rcv_tos
u8 convert_csum
int uc_index
int mc_index
be32 mc_addr
struct ip_mc_socklist* mc_list
struct inet_cork_full cork read_mostly __tcp_transmit_skb
struct local_port_range
======================= ===================== =================== =================== ======================================================================================================
======================== ===================== =================== =================== ======================================================================================================
Type Name fastpath_tx_access fastpath_rx_access comment
======================== ===================== =================== =================== ======================================================================================================
struct sock sk read_mostly read_mostly tcp_init_buffer_space,tcp_init_transfer,tcp_finish_connect,tcp_connect,tcp_send_rcvq,tcp_send_syn_data
struct ipv6_pinfo* pinet6
struct ipv6_fl_socklist* ipv6_fl_list read_mostly tcp_v6_connect,__ip6_datagram_connect,udpv6_sendmsg,rawv6_sendmsg
be16 inet_sport read_mostly __tcp_transmit_skb
be32 inet_daddr read_mostly ip_select_ident_segs
be32 inet_rcv_saddr
be16 inet_dport read_mostly __tcp_transmit_skb
u16 inet_num
be32 inet_saddr
s16 uc_ttl read_mostly __ip_queue_xmit/ip_select_ttl
u16 cmsg_flags
struct ip_options_rcu* inet_opt read_mostly __ip_queue_xmit
u16 inet_id read_mostly ip_select_ident_segs
u8 tos read_mostly ip_queue_xmit
u8 min_ttl
u8 mc_ttl
u8 pmtudisc
u8:1 recverr
u8:1 is_icsk
u8:1 freebind
u8:1 hdrincl
u8:1 mc_loop
u8:1 transparent
u8:1 mc_all
u8:1 nodefrag
u8:1 bind_address_no_port
u8:1 recverr_rfc4884
u8:1 defer_connect read_mostly tcp_sendmsg_fastopen
u8 rcv_tos
u8 convert_csum
int uc_index
int mc_index
be32 mc_addr
struct ip_mc_socklist* mc_list
struct inet_cork_full cork read_mostly __tcp_transmit_skb
struct local_port_range
======================== ===================== =================== =================== ======================================================================================================

View File

@@ -102,7 +102,8 @@ u8 sysctl_tcp_app_win
u8 sysctl_tcp_frto tcp_enter_loss
u8 sysctl_tcp_nometrics_save TCP_LAST_ACK/tcp_update_metrics
u8 sysctl_tcp_no_ssthresh_metrics_save TCP_LAST_ACK/tcp_(update/init)_metrics
u8 sysctl_tcp_moderate_rcvbuf read_mostly read_mostly tcp_tso_should_defer(tx);tcp_rcv_space_adjust(rx)
u8 sysctl_tcp_moderate_rcvbuf read_mostly tcp_rcvbuf_grow()
u32 sysctl_tcp_rcvbuf_low_rtt read_mostly tcp_rcvbuf_grow()
u8 sysctl_tcp_tso_win_divisor read_mostly tcp_tso_should_defer(tcp_write_xmit)
u8 sysctl_tcp_workaround_signed_windows tcp_select_window
int sysctl_tcp_limit_output_bytes read_mostly tcp_small_queue_check(tcp_write_xmit)

View File

@@ -88,7 +88,7 @@ for example:
nc -u -l -p <port>' / 'nc -u -l <port>
or::
or::
netcat -u -l -p <port>' / 'netcat -u -l <port>

View File

@@ -71,7 +71,8 @@ Userspace interface
The userspace interface is divided in control operations and low-level data
exchange operation.
CONTROL OPERATIONS:
Control operations
------------------
Generic netlink is used to implement the interface to the control operations.
The operations are composed by commands and events, all listed below:
@@ -100,7 +101,8 @@ relevant information such as the supported NFC protocols.
All polling operations requested through one netlink socket are stopped when
it's closed.
LOW-LEVEL DATA EXCHANGE:
Low-level data exchange
-----------------------
The userspace must use PF_NFC sockets to perform any data communication with
targets. All NFC sockets use AF_NFC::

View File

@@ -71,3 +71,43 @@ smcr_max_conns_per_lgr - INTEGER
acceptable value ranges from 16 to 255. Only for SMC-R v2.1 and later.
Default: 255
smcr_max_send_wr - INTEGER
So-called work request buffers are SMCR link (and RDMA queue pair) level
resources necessary for performing RDMA operations. Since up to 255
connections can share a link group and thus also a link and the number
of the work request buffers is decided when the link is allocated,
depending on the workload it can be a bottleneck in a sense that threads
have to wait for work request buffers to become available. Before the
introduction of this control the maximal number of work request buffers
available on the send path used to be hard coded to 16. With this control
it becomes configurable. The acceptable range is between 2 and 2048.
Please be aware that all the buffers need to be allocated as a physically
contiguous array in which each element is a single buffer of
SMC_WR_BUF_SIZE (48) bytes. If the allocation fails, we keep retrying
with half of the buffer count until it is either successful or (unlikely)
we dip below the old hard-coded value of 16, at which point we give up,
much as before this control existed.
Default: 16
smcr_max_recv_wr - INTEGER
So-called work request buffers are SMCR link (and RDMA queue pair) level
resources necessary for performing RDMA operations. Since up to 255
connections can share a link group and thus also a link and the number
of the work request buffers is decided when the link is allocated,
depending on the workload it can be a bottleneck in a sense that threads
have to wait for work request buffers to become available. Before the
introduction of this control the maximal number of work request buffers
available on the receive path used to be hard coded to 16. With this control
it becomes configurable. The acceptable range is between 2 and 2048.
Please be aware that all the buffers need to be allocated as a physically
contiguous array in which each element is a single buffer of
SMC_WR_BUF_SIZE (48) bytes. If the allocation fails, we keep retrying
with half of the buffer count until it is either successful or (unlikely)
we dip below the old hard-coded value of 16, at which point we give up,
much as before this control existed.
Default: 48
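A sketch of raising both limits via sysctl (the values are illustrative and
the ``net.smc`` namespace is assumed):

.. code-block:: bash

   # Allow up to 64 send and 128 receive work request buffers per link
   # (illustrative values; namespace assumed to be net.smc).
   $ sysctl -w net.smc.smcr_max_send_wr=64
   $ sysctl -w net.smc.smcr_max_recv_wr=128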

View File

@@ -184,9 +184,11 @@ Protocol-related statistics can be requested in get commands by setting
the `ETHTOOL_FLAG_STATS` flag in `ETHTOOL_A_HEADER_FLAGS`. Currently
statistics are supported in the following commands:
- `ETHTOOL_MSG_PAUSE_GET`
- `ETHTOOL_MSG_FEC_GET`
- `ETHTOOL_MSG_LINKSTATE_GET`
- `ETHTOOL_MSG_MM_GET`
- `ETHTOOL_MSG_PAUSE_GET`
- `ETHTOOL_MSG_TSINFO_GET`
debugfs
-------

View File

@@ -280,6 +280,26 @@ If the decrypted record turns out to have been padded or is not a data
record, it will be decrypted again into a kernel buffer without zero copy.
Such events are counted in the ``TlsDecryptRetry`` statistic.
TLS_TX_MAX_PAYLOAD_LEN
~~~~~~~~~~~~~~~~~~~~~~
Specifies the maximum size of the plaintext payload for transmitted TLS records.
When this option is set, the kernel enforces the specified limit on all outgoing
TLS records. No plaintext fragment will exceed this size. This option can be used
to implement the TLS Record Size Limit extension [1].
* For TLS 1.2, the value corresponds directly to the record size limit.
* For TLS 1.3, the value should be set to record_size_limit - 1, since
the record size limit includes one additional byte for the ContentType
field.
The valid range for this option is 64 to 16384 bytes for TLS 1.2, and 63 to
16384 bytes for TLS 1.3. The lower minimum for TLS 1.3 accounts for the
extra byte used by the ContentType field.
[1] https://datatracker.ietf.org/doc/html/rfc8449
Statistics
==========

View File

@@ -0,0 +1,13 @@
.. SPDX-License-Identifier: GPL-2.0
==============
XFRM Framework
==============
.. toctree::
:maxdepth: 2
xfrm_device
xfrm_proc
xfrm_sync
xfrm_sysctl

View File

@@ -20,11 +20,15 @@ can radically increase throughput and decrease CPU utilization. The XFRM
Device interface allows NIC drivers to offer to the stack access to the
hardware offload.
Right now, there are two types of hardware offload that kernel supports.
Right now, there are two types of hardware offload that kernel supports:
* IPsec crypto offload:
* NIC performs encrypt/decrypt
* Kernel does everything else
* IPsec packet offload:
* NIC performs encrypt/decrypt
* NIC does encapsulation
* Kernel and NIC have SA and policy in-sync
@@ -34,7 +38,7 @@ Right now, there are two types of hardware offload that kernel supports.
Userland access to the offload is typically through a system such as
libreswan or KAME/raccoon, but the iproute2 'ip xfrm' command set can
be handy when experimenting. An example command might look something
like this for crypto offload:
like this for crypto offload::
ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport \
reqid 0x07 replay-window 32 \
@@ -42,7 +46,7 @@ like this for crypto offload:
sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp \
offload dev eth4 dir in
and for packet offload
and for packet offload::
ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport \
reqid 0x07 replay-window 32 \
@@ -153,26 +157,26 @@ the packet's skb. At this point the data should be decrypted but the
IPsec headers are still in the packet data; they are removed later up
the stack in xfrm_input().
find and hold the SA that was used to the Rx skb::
1. Find and hold the SA that was used to the Rx skb::
get spi, protocol, and destination IP from packet headers
/* get spi, protocol, and destination IP from packet headers */
xs = find xs from (spi, protocol, dest_IP)
xfrm_state_hold(xs);
store the state information into the skb::
2. Store the state information into the skb::
sp = secpath_set(skb);
if (!sp) return;
sp->xvec[sp->len++] = xs;
sp->olen++;
indicate the success and/or error status of the offload::
3. Indicate the success and/or error status of the offload::
xo = xfrm_offload(skb);
xo->flags = CRYPTO_DONE;
xo->status = crypto_status;
hand the packet to napi_gro_receive() as usual
4. Hand the packet to napi_gro_receive() as usual.
In ESN mode, xdo_dev_state_advance_esn() is called from
xfrm_replay_advance_esn() for RX, and xfrm_replay_overflow_offload_esn for TX.

View File

@@ -1,8 +1,8 @@
.. SPDX-License-Identifier: GPL-2.0
====
XFRM
====
=========
XFRM sync
=========
The sync patches work is based on initial patches from
Krisztian <hidden@balabit.hu> and others and additional patches
@@ -36,7 +36,7 @@ is not driven by packet arrival.
- the replay sequence for both inbound and outbound
1) Message Structure
----------------------
--------------------
nlmsghdr:aevent_id:optional-TLVs.
@@ -83,31 +83,31 @@ when going from kernel to user space)
A program needs to subscribe to multicast group XFRMNLGRP_AEVENTS
to get notified of these events.
2) TLVS reflect the different parameters:
-----------------------------------------
2) TLVS reflect the different parameters
----------------------------------------
a) byte value (XFRMA_LTIME_VAL)
This TLV carries the running/current counter for byte lifetime since
last event.
This TLV carries the running/current counter for byte lifetime since
last event.
b)replay value (XFRMA_REPLAY_VAL)
b) replay value (XFRMA_REPLAY_VAL)
This TLV carries the running/current counter for replay sequence since
last event.
This TLV carries the running/current counter for replay sequence since
last event.
c)replay threshold (XFRMA_REPLAY_THRESH)
c) replay threshold (XFRMA_REPLAY_THRESH)
This TLV carries the threshold being used by the kernel to trigger events
when the replay sequence is exceeded.
This TLV carries the threshold being used by the kernel to trigger events
when the replay sequence is exceeded.
d) expiry timer (XFRMA_ETIMER_THRESH)
This is a timer value in milliseconds which is used as the nagle
value to rate limit the events.
This is a timer value in milliseconds which is used as the nagle
value to rate limit the events.
3) Default configurations for the parameters:
---------------------------------------------
3) Default configurations for the parameters
--------------------------------------------
By default these events should be turned off unless there is
at least one listener registered to listen to the multicast
@@ -121,12 +121,14 @@ in case they are not specified.
the two sysctls/proc entries are:
a) /proc/sys/net/core/sysctl_xfrm_aevent_etime
used to provide default values for the XFRMA_ETIMER_THRESH in incremental
units of time of 100ms. The default is 10 (1 second)
Used to provide default values for the XFRMA_ETIMER_THRESH in incremental
units of time of 100ms. The default is 10 (1 second)
b) /proc/sys/net/core/sysctl_xfrm_aevent_rseqth
used to provide default values for XFRMA_REPLAY_THRESH parameter
in incremental packet count. The default is two packets.
Used to provide default values for XFRMA_REPLAY_THRESH parameter
in incremental packet count. The default is two packets.
4) Message types
----------------
@@ -134,50 +136,51 @@ in incremental packet count. The default is two packets.
a) XFRM_MSG_GETAE issued by user-->kernel.
XFRM_MSG_GETAE does not carry any TLVs.
The response is a XFRM_MSG_NEWAE which is formatted based on what
XFRM_MSG_GETAE queried for.
The response is a XFRM_MSG_NEWAE which is formatted based on what
XFRM_MSG_GETAE queried for.
The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
* if XFRM_AE_RTHR flag is set, then XFRMA_REPLAY_THRESH is also retrieved
* if XFRM_AE_ETHR flag is set, then XFRMA_ETIMER_THRESH is also retrieved
The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
* if XFRM_AE_RTHR flag is set, then XFRMA_REPLAY_THRESH is also retrieved
* if XFRM_AE_ETHR flag is set, then XFRMA_ETIMER_THRESH is also retrieved
b) XFRM_MSG_NEWAE is issued by either user space to configure
or kernel to announce events or respond to a XFRM_MSG_GETAE.
i) user --> kernel to configure a specific SA.
i) user --> kernel to configure a specific SA.
any of the values or threshold parameters can be updated by passing the
appropriate TLV.
any of the values or threshold parameters can be updated by passing the
appropriate TLV.
A response is issued back to the sender in user space to indicate success
or failure.
A response is issued back to the sender in user space to indicate success
or failure.
In the case of success, additionally an event with
XFRM_MSG_NEWAE is also issued to any listeners as described in iii).
In the case of success, additionally an event with
XFRM_MSG_NEWAE is also issued to any listeners as described in iii).
ii) kernel->user direction as a response to XFRM_MSG_GETAE
ii) kernel->user direction as a response to XFRM_MSG_GETAE
The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
The threshold TLVs will be included if explicitly requested in
the XFRM_MSG_GETAE message.
The threshold TLVs will be included if explicitly requested in
the XFRM_MSG_GETAE message.
iii) kernel->user to report as event if someone sets any values or
thresholds for an SA using XFRM_MSG_NEWAE (as described in #i above).
In such a case XFRM_AE_CU flag is set to inform the user that
the change happened as a result of an update.
The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
iii) kernel->user to report as event if someone sets any values or
thresholds for an SA using XFRM_MSG_NEWAE (as described in #i above).
In such a case XFRM_AE_CU flag is set to inform the user that
the change happened as a result of an update.
The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
iv) kernel->user to report event when replay threshold or a timeout
is exceeded.
iv) kernel->user to report event when replay threshold or a timeout
is exceeded.
In such a case either XFRM_AE_CR (replay exceeded) or XFRM_AE_CE (timeout
happened) is set to inform the user what happened.
Note the two flags are mutually exclusive.
The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
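
Putting the XFRM_MSG_GETAE description above into practice is a small netlink
exchange. The following is a minimal user-space sketch only, assuming the uapi
definitions from <linux/xfrm.h> and <linux/netlink.h> (struct xfrm_aevent_id,
XFRM_MSG_GETAE, the XFRM_AE_* flags); the SA identifiers are placeholder
values and parsing of the XFRM_MSG_NEWAE reply is omitted::

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/xfrm.h>

    int main(void)
    {
            struct {
                    struct nlmsghdr nlh;
                    struct xfrm_aevent_id id;
            } req;
            struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
            int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);

            if (fd < 0) {
                    perror("socket");
                    return 1;
            }

            memset(&req, 0, sizeof(req));
            req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.id));
            req.nlh.nlmsg_type = XFRM_MSG_GETAE;
            req.nlh.nlmsg_flags = NLM_F_REQUEST;

            /* identify the SA being queried (placeholder values) */
            inet_pton(AF_INET, "14.0.0.70", &req.id.sa_id.daddr.a4);
            req.id.sa_id.spi = htonl(0x07);
            req.id.sa_id.family = AF_INET;
            req.id.sa_id.proto = IPPROTO_ESP;

            /* XFRM_AE_RTHR / XFRM_AE_ETHR ask for the XFRMA_REPLAY_THRESH
             * and XFRMA_ETIMER_THRESH TLVs in addition to the always
             * present XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL
             */
            req.id.flags = XFRM_AE_RTHR | XFRM_AE_ETHR;

            if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
                       (struct sockaddr *)&kernel, sizeof(kernel)) < 0)
                    perror("sendto");

            /* the XFRM_MSG_NEWAE reply would be read with recv() here */
            close(fd);
            return 0;
    }

A listener that wants the asynchronous NEWAE events rather than a one-off
query would instead subscribe to the XFRMNLGRP_AEVENTS multicast group
mentioned earlier (for example with the NETLINK_ADD_MEMBERSHIP socket option)
and read events in a loop.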
Exceptions to threshold settings
--------------------------------
5) Exceptions to threshold settings
-----------------------------------
If you have an SA that is getting hit by traffic in bursts such that
there is a period where the timer threshold expires with no packets

View File

@@ -4,8 +4,8 @@
XFRM Syscall
============
/proc/sys/net/core/xfrm_* Variables:
====================================
/proc/sys/net/core/xfrm_* Variables
===================================
xfrm_acq_expires - INTEGER
default 30 - hard timeout in seconds for acquire requests

View File

@@ -3301,6 +3301,7 @@ F: drivers/*/*/*rockchip*
F: drivers/*/*rockchip*
F: drivers/clk/rockchip/
F: drivers/i2c/busses/i2c-rk3x.c
F: drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
F: sound/soc/rockchip/
N: rockchip
@@ -4306,7 +4307,7 @@ F: Documentation/filesystems/befs.rst
F: fs/befs/
BFQ I/O SCHEDULER
M: Yu Kuai <yukuai3@huawei.com>
M: Yu Kuai <yukuai@fnnas.com>
L: linux-block@vger.kernel.org
S: Odd Fixes
F: Documentation/block/bfq-iosched.rst
@@ -4406,6 +4407,8 @@ F: block/
F: drivers/block/
F: include/linux/bio.h
F: include/linux/blk*
F: include/uapi/linux/blk*
F: include/uapi/linux/ioprio.h
F: kernel/trace/blktrace.c
F: lib/sbitmap.c
@@ -4654,6 +4657,7 @@ F: Documentation/userspace-api/ebpf/
F: arch/*/net/*
F: include/linux/bpf*
F: include/linux/btf*
F: include/linux/buildid.h
F: include/linux/filter.h
F: include/trace/events/xdp.h
F: include/uapi/linux/bpf*
@@ -5131,7 +5135,6 @@ F: Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml
F: drivers/net/ethernet/broadcom/genet/
F: drivers/net/ethernet/broadcom/unimac.h
F: drivers/net/mdio/mdio-bcm-unimac.c
F: include/linux/platform_data/bcmgenet.h
F: include/linux/platform_data/mdio-bcm-unimac.h
BROADCOM IPROC ARM ARCHITECTURE
@@ -10532,7 +10535,7 @@ L: gfs2@lists.linux.dev
S: Supported
B: https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=gfs2
T: git git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git
F: Documentation/filesystems/gfs2*
F: Documentation/filesystems/gfs2/
F: fs/gfs2/
F: include/uapi/linux/gfs2_ondisk.h
@@ -14053,7 +14056,7 @@ F: tools/testing/selftests/landlock/
K: landlock
K: LANDLOCK
LANTIQ / INTEL Ethernet drivers
LANTIQ / MAXLINEAR / INTEL Ethernet DSA drivers
M: Hauke Mehrtens <hauke@hauke-m.de>
L: netdev@vger.kernel.org
S: Maintained
@@ -14061,6 +14064,7 @@ F: Documentation/devicetree/bindings/net/dsa/lantiq,gswip.yaml
F: drivers/net/dsa/lantiq/*
F: drivers/net/ethernet/lantiq_xrx200.c
F: net/dsa/tag_gswip.c
F: net/dsa/tag_mxl-gsw1xx.c
LANTIQ MIPS ARCHITECTURE
M: John Crispin <john@phrozen.org>
@@ -15412,14 +15416,12 @@ S: Supported
F: drivers/net/phy/mxl-86110.c
F: drivers/net/phy/mxl-gpy.c
MCAN MMIO DEVICE DRIVER
M: Chandrasekar Ramakrishnan <rcsekar@samsung.com>
MCAN DEVICE DRIVER
M: Markus Schneider-Pargmann <msp@baylibre.com>
L: linux-can@vger.kernel.org
S: Maintained
F: Documentation/devicetree/bindings/net/can/bosch,m_can.yaml
F: drivers/net/can/m_can/m_can.c
F: drivers/net/can/m_can/m_can.h
F: drivers/net/can/m_can/m_can_platform.c
F: drivers/net/can/m_can/
MCBA MICROCHIP CAN BUS ANALYZER TOOL DRIVER
R: Yasushi SHOJI <yashi@spacecubics.com>
@@ -17455,6 +17457,14 @@ S: Maintained
F: Documentation/devicetree/bindings/net/motorcomm,yt8xxx.yaml
F: drivers/net/phy/motorcomm.c
MOTORCOMM YT921X ETHERNET SWITCH DRIVER
M: David Yang <mmyangfl@gmail.com>
L: netdev@vger.kernel.org
S: Maintained
F: Documentation/devicetree/bindings/net/dsa/motorcomm,yt921x.yaml
F: drivers/net/dsa/yt921x.*
F: net/dsa/tag_yt921x.c
MOXA SMARTIO/INDUSTIO/INTELLIO SERIAL CARD
M: Jiri Slaby <jirislaby@kernel.org>
S: Maintained
@@ -17620,6 +17630,14 @@ T: git git://linuxtv.org/media.git
F: Documentation/devicetree/bindings/media/i2c/aptina,mt9v111.yaml
F: drivers/media/i2c/mt9v111.c
MUCSE ETHERNET DRIVER
M: Yibo Dong <dong100@mucse.com>
L: netdev@vger.kernel.org
S: Maintained
W: https://www.mucse.com/en/
F: Documentation/networking/device_drivers/ethernet/mucse/
F: drivers/net/ethernet/mucse/
MULTIFUNCTION DEVICES (MFD)
M: Lee Jones <lee@kernel.org>
S: Maintained
@@ -18061,6 +18079,7 @@ L: netdev@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git
T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git
F: Documentation/networking/xfrm/
F: include/net/xfrm.h
F: include/uapi/linux/xfrm.h
F: net/ipv4/ah4.c
@@ -21066,6 +21085,7 @@ F: Documentation/devicetree/bindings/net/qcom,bam-dmux.yaml
F: drivers/net/wwan/qcom_bam_dmux.c
QUALCOMM BLUETOOTH DRIVER
M: Bartosz Golaszewski <brgl@bgdev.pl>
L: linux-arm-msm@vger.kernel.org
S: Maintained
F: drivers/bluetooth/btqca.[ch]
@@ -23890,7 +23910,7 @@ F: include/linux/property.h
SOFTWARE RAID (Multiple Disks) SUPPORT
M: Song Liu <song@kernel.org>
M: Yu Kuai <yukuai3@huawei.com>
M: Yu Kuai <yukuai@fnnas.com>
L: linux-raid@vger.kernel.org
S: Supported
Q: https://patchwork.kernel.org/project/linux-raid/list/
@@ -27683,6 +27703,7 @@ M: Jason A. Donenfeld <Jason@zx2c4.com>
L: wireguard@lists.zx2c4.com
L: netdev@vger.kernel.org
S: Maintained
F: Documentation/netlink/specs/wireguard.yaml
F: drivers/net/wireguard/
F: tools/testing/selftests/wireguard/
@@ -28352,6 +28373,13 @@ L: linux-kernel@vger.kernel.org
S: Maintained
F: arch/x86/kernel/cpu/zhaoxin.c
ZONED BLOCK DEVICE (BLOCK LAYER)
M: Damien Le Moal <dlemoal@kernel.org>
L: linux-block@vger.kernel.org
S: Maintained
F: block/blk-zoned.c
F: include/uapi/linux/blkzoned.h
ZONED LOOP DEVICE
M: Damien Le Moal <dlemoal@kernel.org>
R: Christoph Hellwig <hch@lst.de>

View File

@@ -1452,6 +1452,10 @@ emit_bswap_uxt:
emit(A64_ASR(is64, dst, dst, imm), ctx);
break;
/* JUMP reg */
case BPF_JMP | BPF_JA | BPF_X:
emit(A64_BR(dst), ctx);
break;
/* JUMP off */
case BPF_JMP | BPF_JA:
case BPF_JMP32 | BPF_JA:
@@ -2231,6 +2235,13 @@ skip_init_ctx:
for (i = 0; i <= prog->len; i++)
ctx.offset[i] *= AARCH64_INSN_SIZE;
bpf_prog_fill_jited_linfo(prog, ctx.offset + 1);
/*
* The bpf_prog_update_insn_ptrs function expects offsets to
* point to the first byte of the jitted instruction (unlike
* the bpf_prog_fill_jited_linfo above, which, for historical
* reasons, expects to point to the next instruction)
*/
bpf_prog_update_insn_ptrs(prog, ctx.offset, ctx.ro_image);
out_off:
if (!ro_header && priv_stack_ptr) {
free_percpu(priv_stack_ptr);
@@ -2923,8 +2934,9 @@ static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip,
* The dummy_tramp is used to prevent another CPU from jumping to unknown
* locations during the patching process, making the patching process easier.
*/
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
void *old_addr, void *new_addr)
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
int ret;
u32 old_insn;
@@ -2968,14 +2980,13 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
!poking_bpf_entry))
return -EINVAL;
if (poke_type == BPF_MOD_CALL)
branch_type = AARCH64_INSN_BRANCH_LINK;
else
branch_type = AARCH64_INSN_BRANCH_NOLINK;
branch_type = old_t == BPF_MOD_CALL ? AARCH64_INSN_BRANCH_LINK :
AARCH64_INSN_BRANCH_NOLINK;
if (gen_branch_or_nop(branch_type, ip, old_addr, plt, &old_insn) < 0)
return -EFAULT;
branch_type = new_t == BPF_MOD_CALL ? AARCH64_INSN_BRANCH_LINK :
AARCH64_INSN_BRANCH_NOLINK;
if (gen_branch_or_nop(branch_type, ip, new_addr, plt, &new_insn) < 0)
return -EFAULT;
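
The same widening of the signature is applied to each JIT below (loongarch,
powerpc, riscv and s390 follow the identical pattern): instead of one
poke_type describing both sides of the transition, the expected old
instruction is derived from (old_t, old_addr) and the replacement from
(new_t, new_addr). As a hedged illustration of the caller-visible effect, not
code taken from this series, a patch site could now go from calling one
trampoline to plainly jumping to another in a single poke; ip, old_tramp and
new_tramp below are hypothetical addresses::

    /* verify that ip currently holds a call to old_tramp, then
     * replace it with a direct jump to new_tramp
     */
    err = bpf_arch_text_poke(ip, BPF_MOD_CALL, BPF_MOD_JUMP,
                             old_tramp, new_tramp);
    if (err)
            pr_warn("bpf text poke failed: %d\n", err);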

View File

@@ -1284,11 +1284,12 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len)
return ret ? ERR_PTR(-EINVAL) : dst;
}
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
void *old_addr, void *new_addr)
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
int ret;
bool is_call = (poke_type == BPF_MOD_CALL);
bool is_call;
u32 old_insns[LOONGARCH_LONG_JUMP_NINSNS] = {[0 ... 4] = INSN_NOP};
u32 new_insns[LOONGARCH_LONG_JUMP_NINSNS] = {[0 ... 4] = INSN_NOP};
@@ -1298,6 +1299,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
if (!is_bpf_text_address((unsigned long)ip))
return -ENOTSUPP;
is_call = old_t == BPF_MOD_CALL;
ret = emit_jump_or_nops(old_addr, ip, old_insns, is_call);
if (ret)
return ret;
@@ -1305,6 +1307,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
if (memcmp(ip, old_insns, LOONGARCH_LONG_JUMP_NBYTES))
return -EFAULT;
is_call = new_t == BPF_MOD_CALL;
ret = emit_jump_or_nops(new_addr, ip, new_insns, is_call);
if (ret)
return ret;

View File

@@ -16,7 +16,6 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/phy.h>
#include <linux/phy_fixed.h>
#include <asm/machdep.h>
#include <asm/coldfire.h>
#include <asm/mcfsim.h>
@@ -103,23 +102,9 @@ void __init config_BSP(char *commandp, int size)
/***************************************************************************/
/*
* Some 5272 based boards have the FEC ethernet directly connected to
* an ethernet switch. In this case we need to use the fixed phy type,
* and we need to declare it early in boot.
*/
static const struct fixed_phy_status nettel_fixed_phy_status __initconst = {
.link = 1,
.speed = 100,
.duplex = 0,
};
/***************************************************************************/
static int __init init_BSP(void)
{
m5272_uarts_init();
fixed_phy_add(&nettel_fixed_phy_status);
clkdev_add_table(m5272_clk_lookup, ARRAY_SIZE(m5272_clk_lookup));
return 0;
}

View File

@@ -256,12 +256,6 @@ static int __init bcm47xx_cpu_fixes(void)
}
arch_initcall(bcm47xx_cpu_fixes);
static const struct fixed_phy_status bcm47xx_fixed_phy_status __initconst = {
.link = 1,
.speed = SPEED_100,
.duplex = DUPLEX_FULL,
};
static int __init bcm47xx_register_bus_complete(void)
{
switch (bcm47xx_bus_type) {
@@ -282,7 +276,6 @@ static int __init bcm47xx_register_bus_complete(void)
bcm47xx_leds_register();
bcm47xx_workarounds();
fixed_phy_add(&bcm47xx_fixed_phy_status);
return 0;
}
device_initcall(bcm47xx_register_bus_complete);

View File

@@ -1107,8 +1107,9 @@ static void do_isync(void *info __maybe_unused)
* execute isync (or some CSI) so that they don't go back into the
* trampoline again.
*/
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
void *old_addr, void *new_addr)
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
unsigned long bpf_func, bpf_func_end, size, offset;
ppc_inst_t old_inst, new_inst;
@@ -1119,7 +1120,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
return -EOPNOTSUPP;
bpf_func = (unsigned long)ip;
branch_flags = poke_type == BPF_MOD_CALL ? BRANCH_SET_LINK : 0;
/* We currently only support poking bpf programs */
if (!__bpf_address_lookup(bpf_func, &size, &offset, name)) {
@@ -1132,7 +1132,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
* an unconditional branch instruction at im->ip_after_call
*/
if (offset) {
if (poke_type != BPF_MOD_JUMP) {
if (old_t == BPF_MOD_CALL || new_t == BPF_MOD_CALL) {
pr_err("%s (0x%lx): calls are not supported in bpf prog body\n", __func__,
bpf_func);
return -EOPNOTSUPP;
@@ -1166,6 +1166,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
}
old_inst = ppc_inst(PPC_RAW_NOP());
branch_flags = old_t == BPF_MOD_CALL ? BRANCH_SET_LINK : 0;
if (old_addr) {
if (is_offset_in_branch_range(ip - old_addr))
create_branch(&old_inst, ip, (unsigned long)old_addr, branch_flags);
@@ -1174,6 +1175,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
branch_flags);
}
new_inst = ppc_inst(PPC_RAW_NOP());
branch_flags = new_t == BPF_MOD_CALL ? BRANCH_SET_LINK : 0;
if (new_addr) {
if (is_offset_in_branch_range(ip - new_addr))
create_branch(&new_inst, ip, (unsigned long)new_addr, branch_flags);

View File

@@ -852,17 +852,19 @@ static int gen_jump_or_nops(void *target, void *ip, u32 *insns, bool is_call)
return emit_jump_and_link(is_call ? RV_REG_T0 : RV_REG_ZERO, rvoff, false, &ctx);
}
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
void *old_addr, void *new_addr)
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
u32 old_insns[RV_FENTRY_NINSNS], new_insns[RV_FENTRY_NINSNS];
bool is_call = poke_type == BPF_MOD_CALL;
bool is_call;
int ret;
if (!is_kernel_text((unsigned long)ip) &&
!is_bpf_text_address((unsigned long)ip))
return -ENOTSUPP;
is_call = old_t == BPF_MOD_CALL;
ret = gen_jump_or_nops(old_addr, ip, old_insns, is_call);
if (ret)
return ret;
@@ -870,6 +872,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
if (memcmp(ip, old_insns, RV_FENTRY_NBYTES))
return -EFAULT;
is_call = new_t == BPF_MOD_CALL;
ret = gen_jump_or_nops(new_addr, ip, new_insns, is_call);
if (ret)
return ret;
@@ -1131,7 +1134,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
store_args(nr_arg_slots, args_off, ctx);
/* skip to actual body of traced function */
if (flags & BPF_TRAMP_F_SKIP_FRAME)
if (flags & BPF_TRAMP_F_ORIG_STACK)
orig_call += RV_FENTRY_NINSNS * 4;
if (flags & BPF_TRAMP_F_CALL_ORIG) {

View File

@@ -2412,8 +2412,9 @@ bool bpf_jit_supports_far_kfunc_call(void)
return true;
}
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *old_addr, void *new_addr)
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
struct bpf_plt expected_plt, current_plt, new_plt, *plt;
struct {
@@ -2430,7 +2431,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
if (insn.opc != (0xc004 | (old_addr ? 0xf0 : 0)))
return -EINVAL;
if (t == BPF_MOD_JUMP &&
if ((new_t == BPF_MOD_JUMP || old_t == BPF_MOD_JUMP) &&
insn.disp == ((char *)new_addr - (char *)ip) >> 1) {
/*
* The branch already points to the destination,

View File

@@ -230,6 +230,7 @@ config X86
select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64
select HAVE_FTRACE_REGS_HAVING_PT_REGS if X86_64
select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
select HAVE_DYNAMIC_FTRACE_WITH_JMP if X86_64
select HAVE_SAMPLE_FTRACE_DIRECT if X86_64
select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64
select HAVE_EBPF_JIT

View File

@@ -74,7 +74,12 @@ static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
* No need to translate into a callthunk. The trampoline does
* the depth accounting itself.
*/
return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
if (ftrace_is_jmp(addr)) {
addr = ftrace_jmp_get(addr);
return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
} else {
return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
}
}
static int ftrace_verify_code(unsigned long ip, const char *old_code)

Some files were not shown because too many files have changed in this diff.