net: allow busy connected flows to switch tx queues

This is a follow-up to commit 726e9e8b94 ("tcp: refine
skb->ooo_okay setting") and to the prior commit in this series
("net: control skb->ooo_okay from skb_set_owner_w()").

skb->ooo_okay might never be set for bulk flows that always
have at least one skb in a qdisc queue or a NIC queue,
especially if TX completion is delayed because of a stressed CPU.
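For context, queue selection in netdev_pick_tx() is gated by
skb->ooo_okay roughly as follows (a simplified sketch of the
pre-patch net/core/dev.c logic):

u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
		   struct net_device *sb_dev)
{
	struct sock *sk = skb->sk;
	int queue_index = sk_tx_queue_get(sk);

	sb_dev = sb_dev ? : dev;

	/* The flow keeps its cached queue unless ooo_okay is set or
	 * the cached mapping is invalid, so a flow that never gets
	 * ooo_okay set never re-runs XPS selection.
	 */
	if (queue_index < 0 || skb->ooo_okay ||
	    queue_index >= dev->real_num_tx_queues) {
		int new_index = get_xps_queue(dev, sb_dev, skb);

		if (new_index < 0)
			new_index = skb_tx_hash(dev, sb_dev, skb);

		queue_index = new_index;
	}

	return queue_index;
}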

These so-called "strange attractors" have caused many performance
issues (see for instance commit 9b462d02d6 ("tcp: TCP Small Queues
and strange attractors")); we need to do better.

We have tried very hard to avoid reorders because TCP did
not handle them well a decade ago.

Use the new net.core.txq_reselection_ms sysctl to let
flows follow XPS and select a more efficient queue.
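For reference, a minimal sketch of how such a per-netns sysctl could
be registered; the sysctl_txq_reselection field name matches the diff
below, while the choice of proc_dointvec_ms_jiffies (which stores the
millisecond input converted to jiffies, so it can be added directly to
jiffies timestamps) is an assumption about the prior commit in this
series:

static struct ctl_table netns_core_table[] = {
	{
		/* 0 (the default) disables reselection. */
		.procname	= "txq_reselection_ms",
		.data		= &init_net.core.sysctl_txq_reselection,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
};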

After this patch, we no longer have to make sure threads
are pinned to CPUs; they can now be migrated without adding
too much spinlock/qdisc/TX completion pressure.

The TX completion part was problematic because it added false
sharing on various socket fields, and also false sharing and
spinlock contention in mm layers. Unfortunately, calling
skb_orphan() from ndo_start_xmit() is not an option.

Note for later:

1) Move sk->sk_tx_queue_mapping closer
to sk_tx_queue_mapping_jiffies for better cache locality.

2) Study if commit 9b462d02d6 ("tcp: TCP Small Queues
and strange attractors") could be revised.

Tested:

Used a host with 32 TX queues, each serving a group of 8 cores.
XPS setup:

echo ff >/sys/class/net/eth1/queues/tx-0/xps_cpus
echo ff00 >/sys/class/net/eth1/queues/tx-1/xps_cpus
echo ff0000 >/sys/class/net/eth1/queues/tx-2/xps_cpus
echo ff000000 >/sys/class/net/eth1/queues/tx-3/xps_cpus
echo ff,00000000 >/sys/class/net/eth1/queues/tx-4/xps_cpus
echo ff00,00000000 >/sys/class/net/eth1/queues/tx-5/xps_cpus
echo ff0000,00000000 >/sys/class/net/eth1/queues/tx-6/xps_cpus
echo ff000000,00000000 >/sys/class/net/eth1/queues/tx-7/xps_cpus
...

Launched tcp_stream with 15 threads and 1000 flows, initially affined to cores 0-15:

taskset -c 0-15 tcp_stream -T15 -F1000 -l1000 -c -H target_host

Checked that only queues 0 and 1 were used, as instructed by XPS
(the first line is the aggregate backlog of the mq root qdisc):
tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
 backlog 123489410b 1890p
 backlog 69809026b 1064p
 backlog 52401054b 805p

Then forced each thread to run on cpus 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121:

C=1;PID=`pidof tcp_stream`;for P in `ls /proc/$PID/task`; do taskset -pc $C $P; C=$(($C + 8));done

Set txq_reselection_ms to 1000:
echo 1000 > /proc/sys/net/core/txq_reselection_ms

Checked that the flows migrated nicely:

tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p"
 backlog 130508314b 1916p
 backlog 8584380b 126p
 backlog 8584380b 126p
 backlog 8379990b 123p
 backlog 8584380b 126p
 backlog 8487484b 125p
 backlog 8584380b 126p
 backlog 8448120b 124p
 backlog 8584380b 126p
 backlog 8720640b 128p
 backlog 8856900b 130p
 backlog 8584380b 126p
 backlog 8652510b 127p
 backlog 8448120b 124p
 backlog 8516250b 125p
 backlog 7834950b 115p

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251013152234.842065-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -313,6 +313,7 @@ struct sk_filter;
  *	@sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
  *		      for timestamping
  *	@sk_tskey: counter to disambiguate concurrent tstamp requests
+ *	@sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh.
  *	@sk_zckey: counter to order MSG_ZEROCOPY notifications
  *	@sk_socket: Identd and reporting IO signals
  *	@sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
@@ -485,6 +486,7 @@ struct sock {
 	unsigned long		sk_pacing_rate; /* bytes per second */
 	atomic_t		sk_zckey;
 	atomic_t		sk_tskey;
+	unsigned long		sk_tx_queue_mapping_jiffies;
 	__cacheline_group_end(sock_write_tx);

 	__cacheline_group_begin(sock_read_tx);
@@ -1992,7 +1994,15 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
 	/* Paired with READ_ONCE() in sk_tx_queue_get() and
 	 * other WRITE_ONCE() because socket lock might be not held.
 	 */
-	WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+	if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) {
+		WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+		WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
+		return;
+	}
+
+	/* Refresh sk_tx_queue_mapping_jiffies if too old. */
+	if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ))
+		WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
 }

 #define NO_QUEUE_MAPPING	USHRT_MAX
@@ -2005,19 +2015,7 @@ static inline void sk_tx_queue_clear(struct sock *sk)
 	WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
 }

-static inline int sk_tx_queue_get(const struct sock *sk)
-{
-	if (sk) {
-		/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
-		 * and sk_tx_queue_set().
-		 */
-		int val = READ_ONCE(sk->sk_tx_queue_mapping);
-
-		if (val != NO_QUEUE_MAPPING)
-			return val;
-	}
-	return -1;
-}
+int sk_tx_queue_get(const struct sock *sk);

 static inline void __sk_rx_queue_set(struct sock *sk,
 				     const struct sk_buff *skb,

--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4591,6 +4591,32 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(dev_pick_tx_zero);

+int sk_tx_queue_get(const struct sock *sk)
+{
+	int resel, val;
+
+	if (!sk)
+		return -1;
+	/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
+	 * and sk_tx_queue_set().
+	 */
+	val = READ_ONCE(sk->sk_tx_queue_mapping);
+
+	if (val == NO_QUEUE_MAPPING)
+		return -1;
+
+	if (!sk_fullsock(sk))
+		return val;
+
+	resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection);
+	if (resel && time_is_before_jiffies(
+			READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel))
+		return -1;
+
+	return val;
+}
+EXPORT_SYMBOL(sk_tx_queue_get);
+
 u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 		   struct net_device *sb_dev)
 {
@@ -4606,8 +4632,7 @@ u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 		if (new_index < 0)
 			new_index = skb_tx_hash(dev, sb_dev, skb);

-		if (queue_index != new_index && sk &&
-		    sk_fullsock(sk) &&
+		if (sk && sk_fullsock(sk) &&
 		    rcu_access_pointer(sk->sk_dst_cache))
 			sk_tx_queue_set(sk, new_index);
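
To illustrate the combined effect of sk_tx_queue_set() and
sk_tx_queue_get(), here is a small standalone userspace sketch that
models jiffies as a plain counter (sock_sim and the simplified,
wrap-free time arithmetic are illustrative assumptions, not the kernel
implementation):

#include <stdio.h>

#define HZ 1000
#define NO_QUEUE_MAPPING -1

static unsigned long jiffies;

struct sock_sim {
	int tx_queue_mapping;
	unsigned long tx_queue_mapping_jiffies;
};

/* Mirrors sk_tx_queue_set(): record the queue, and refresh the
 * timestamp on a queue change or when it is more than HZ old.
 */
static void sim_tx_queue_set(struct sock_sim *sk, int txq)
{
	if (sk->tx_queue_mapping != txq) {
		sk->tx_queue_mapping = txq;
		sk->tx_queue_mapping_jiffies = jiffies;
		return;
	}
	if (jiffies - sk->tx_queue_mapping_jiffies > HZ)
		sk->tx_queue_mapping_jiffies = jiffies;
}

/* Mirrors sk_tx_queue_get(): report the mapping as stale (-1) once it
 * is older than the reselection delay, which forces netdev_pick_tx()
 * to re-run XPS selection for the flow.
 */
static int sim_tx_queue_get(const struct sock_sim *sk, unsigned long resel)
{
	if (sk->tx_queue_mapping == NO_QUEUE_MAPPING)
		return -1;
	if (resel && jiffies - sk->tx_queue_mapping_jiffies > resel)
		return -1;
	return sk->tx_queue_mapping;
}

int main(void)
{
	struct sock_sim sk = { NO_QUEUE_MAPPING, 0 };
	unsigned long resel = HZ; /* txq_reselection_ms = 1000 */

	sim_tx_queue_set(&sk, 2);
	printf("t=0s: queue %d\n", sim_tx_queue_get(&sk, resel));

	jiffies += 2 * HZ; /* two seconds without a refresh */
	printf("t=2s: queue %d (reselection)\n", sim_tx_queue_get(&sk, resel));
	return 0;
}

This prints queue 2 at t=0s and -1 at t=2s: once the timestamp goes
stale, the next transmit re-runs XPS and the flow can follow its
thread to a better queue.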