Merge branch 'tcp-provide-better-locality-for-retransmit-timer'

Eric Dumazet says:

====================
tcp: provide better locality for retransmit timer

The TCP stack uses three timers per flow, currently spread this way:

- sk->sk_timer : keepalive timer
- icsk->icsk_retransmit_timer : retransmit timer
- icsk->icsk_delack_timer : delayed ack timer

This series moves the retransmit timer to the sk->sk_timer location
to increase data locality in TX paths.

Keepalive timers are not often used, so this change should be neutral for them.

After the series, we have the following fields:

- sk->tcp_retransmit_timer : retransmit timer, in sock_write_tx group
- icsk->icsk_delack_timer : delayed ack timer
- icsk->icsk_keepalive_timer : keepalive timer

Moving icsk_delack_timer to a better location would also be welcome.
====================

Link: https://patch.msgid.link/20251124175013.1473655-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Committed by Jakub Kicinski on 2025-11-25 19:28:32 -08:00
13 changed files with 74 additions and 69 deletions
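
Before the per-file diffs, here is a minimal standalone C sketch of the union trick the series relies on. It is a userspace illustration under simplifying assumptions: only the timer fields mirror real kernel members; struct mock_sock, struct mock_icsk and the surrounding placeholder fields are made up for the example and are not the actual struct sock / inet_connection_sock layout. After the series, TCP no longer arms the generic sk_timer, so that slot in the hot sock_write_tx area can hold the retransmit timer, while the keepalive timer (sharing storage with MPTCP's timeout timer) moves into inet_connection_sock:

/* Userspace illustration only -- mock types, not the kernel's. */
#include <stdio.h>
#include <stddef.h>

struct timer_list {			/* stand-in for the kernel's timer_list */
	unsigned long expires;
	void (*function)(struct timer_list *);
};

struct mock_sock {
	/* ... fields written on every transmit would sit around here ... */
	union {				/* one slot, three users */
		struct timer_list sk_timer;			/* non-TCP sockets */
		struct timer_list tcp_retransmit_timer;		/* TCP RTO / probe0 */
		struct timer_list mptcp_retransmit_timer;	/* MPTCP-level rtx */
	};
	unsigned long sk_pacing_rate;
};

struct mock_icsk {
	struct mock_sock sk;		/* the kernel embeds the sock first */
	struct timer_list icsk_delack_timer;
	union {				/* keepalive slot shared with MPTCP */
		struct timer_list icsk_keepalive_timer;
		struct timer_list mptcp_tout_timer;
	};
};

int main(void)
{
	/*
	 * The union members alias the same bytes, so arming
	 * tcp_retransmit_timer touches exactly the bytes the old
	 * sk_timer occupied; the structure does not grow.
	 */
	printf("sk_timer offset             = %zu\n",
	       offsetof(struct mock_sock, sk_timer));
	printf("tcp_retransmit_timer offset = %zu\n",
	       offsetof(struct mock_sock, tcp_retransmit_timer));
	printf("icsk_keepalive_timer offset = %zu\n",
	       offsetof(struct mock_icsk, icsk_keepalive_timer));
	return 0;
}

The two offsets printed for the struct mock_sock union are identical, which is the point: the timer handlers can then recover the socket directly from the timer member via timer_container_of(), as the tcp_write_timer() and mptcp_retransmit_timer() hunks below do.
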


@@ -12,8 +12,8 @@ struct inet_sock icsk_inet read_mostly r
struct request_sock_queue icsk_accept_queue
struct inet_bind_bucket icsk_bind_hash read_mostly tcp_set_state
struct inet_bind2_bucket icsk_bind2_hash read_mostly tcp_set_state,inet_put_port
struct timer_list icsk_retransmit_timer read_write inet_csk_reset_xmit_timer,tcp_connect
struct timer_list icsk_delack_timer read_mostly inet_csk_reset_xmit_timer,tcp_connect
struct timer_list icsk_keepalive_timer
u32 icsk_rto read_write tcp_cwnd_validate,tcp_schedule_loss_probe,tcp_connect_init,tcp_connect,tcp_write_xmit,tcp_push_one
u32 icsk_rto_min
u32 icsk_rto_max read_mostly tcp_reset_xmit_timer


@@ -56,7 +56,9 @@ struct inet_connection_sock_af_ops {
* @icsk_accept_queue: FIFO of established children
* @icsk_bind_hash: Bind node
* @icsk_bind2_hash: Bind node in the bhash2 table
* @icsk_retransmit_timer: Resend (no ack)
* @icsk_delack_timer: Delayed ACK timer
* @icsk_keepalive_timer: Keepalive timer
* @mptcp_tout_timer: mptcp timer
* @icsk_rto: Retransmit timeout
* @icsk_pmtu_cookie Last pmtu seen by socket
* @icsk_ca_ops Pluggable congestion control hook
@@ -81,8 +83,11 @@ struct inet_connection_sock {
struct request_sock_queue icsk_accept_queue;
struct inet_bind_bucket *icsk_bind_hash;
struct inet_bind2_bucket *icsk_bind2_hash;
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
struct timer_list icsk_delack_timer;
union {
struct timer_list icsk_keepalive_timer;
struct timer_list mptcp_tout_timer;
};
__u32 icsk_rto;
__u32 icsk_rto_min;
u32 icsk_rto_max;
@@ -184,10 +189,9 @@ static inline void inet_csk_delack_init(struct sock *sk)
memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack));
}
static inline unsigned long
icsk_timeout(const struct inet_connection_sock *icsk)
static inline unsigned long tcp_timeout_expires(const struct sock *sk)
{
return READ_ONCE(icsk->icsk_retransmit_timer.expires);
return READ_ONCE(sk->tcp_retransmit_timer.expires);
}
static inline unsigned long
@@ -203,7 +207,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what)
if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) {
smp_store_release(&icsk->icsk_pending, 0);
#ifdef INET_CSK_CLEAR_TIMERS
sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
sk_stop_timer(sk, &sk->tcp_retransmit_timer);
#endif
} else if (what == ICSK_TIME_DACK) {
smp_store_release(&icsk->icsk_ack.pending, 0);
@@ -235,7 +239,7 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) {
smp_store_release(&icsk->icsk_pending, what);
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, when);
sk_reset_timer(sk, &sk->tcp_retransmit_timer, when);
} else if (what == ICSK_TIME_DACK) {
smp_store_release(&icsk->icsk_ack.pending,
icsk->icsk_ack.pending | ICSK_ACK_TIMER);


@@ -305,6 +305,8 @@ struct sk_filter;
* @sk_txrehash: enable TX hash rethink
* @sk_filter: socket filtering instructions
* @sk_timer: sock cleanup timer
* @tcp_retransmit_timer: tcp retransmit timer
* @mptcp_retransmit_timer: mptcp retransmit timer
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
@@ -481,11 +483,12 @@ struct sock {
struct rb_root tcp_rtx_queue;
};
struct sk_buff_head sk_write_queue;
u32 sk_dst_pending_confirm;
u32 sk_pacing_status; /* see enum sk_pacing */
struct page_frag sk_frag;
struct timer_list sk_timer;
union {
struct timer_list sk_timer;
struct timer_list tcp_retransmit_timer;
struct timer_list mptcp_retransmit_timer;
};
unsigned long sk_pacing_rate; /* bytes per second */
atomic_t sk_zckey;
atomic_t sk_tskey;
@@ -493,6 +496,8 @@ struct sock {
__cacheline_group_end(sock_write_tx);
__cacheline_group_begin(sock_read_tx);
u32 sk_dst_pending_confirm;
u32 sk_pacing_status; /* see enum sk_pacing */
unsigned long sk_max_pacing_rate;
long sk_sndtimeo;
u32 sk_priority;


@@ -4519,14 +4519,14 @@ static int __init sock_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);


@@ -737,9 +737,9 @@ void inet_csk_init_xmit_timers(struct sock *sk,
{
struct inet_connection_sock *icsk = inet_csk(sk);
timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
timer_setup(&sk->tcp_retransmit_timer, retransmit_handler, 0);
timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
timer_setup(&sk->sk_timer, keepalive_handler, 0);
timer_setup(&icsk->icsk_keepalive_timer, keepalive_handler, 0);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
@@ -750,9 +750,9 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
smp_store_release(&icsk->icsk_pending, 0);
smp_store_release(&icsk->icsk_ack.pending, 0);
sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
sk_stop_timer(sk, &sk->tcp_retransmit_timer);
sk_stop_timer(sk, &icsk->icsk_delack_timer);
sk_stop_timer(sk, &sk->sk_timer);
sk_stop_timer(sk, &icsk->icsk_keepalive_timer);
}
void inet_csk_clear_xmit_timers_sync(struct sock *sk)
@@ -765,9 +765,9 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk)
smp_store_release(&icsk->icsk_pending, 0);
smp_store_release(&icsk->icsk_ack.pending, 0);
sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer);
sk_stop_timer_sync(sk, &sk->tcp_retransmit_timer);
sk_stop_timer_sync(sk, &icsk->icsk_delack_timer);
sk_stop_timer_sync(sk, &sk->sk_timer);
sk_stop_timer_sync(sk, &icsk->icsk_keepalive_timer);
}
struct dst_entry *inet_csk_route_req(const struct sock *sk,


@@ -287,17 +287,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 1;
r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits);
r->idiag_expires =
jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies);
} else if (icsk_pending == ICSK_TIME_PROBE0) {
r->idiag_timer = 4;
r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
} else if (timer_pending(&sk->sk_timer)) {
jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies);
} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
r->idiag_timer = 2;
r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies);
jiffies_delta_to_msecs(icsk->icsk_keepalive_timer.expires - jiffies);
}
if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {


@@ -2869,13 +2869,13 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk_timeout(icsk);
timer_expires = tcp_timeout_expires(sk);
} else if (icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
timer_expires = icsk_timeout(icsk);
} else if (timer_pending(&sk->sk_timer)) {
timer_expires = tcp_timeout_expires(sk);
} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
timer_active = 2;
timer_expires = sk->sk_timer.expires;
timer_expires = icsk->icsk_keepalive_timer.expires;
} else {
timer_active = 0;
timer_expires = jiffies;


@@ -510,7 +510,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
* and tp->rcv_tstamp might very well have been written recently.
* rcv_delta can thus be negative.
*/
rcv_delta = icsk_timeout(icsk) - tp->rcv_tstamp;
rcv_delta = tcp_timeout_expires(sk) - tp->rcv_tstamp;
if (rcv_delta <= timeout)
return false;
@@ -697,9 +697,9 @@ void tcp_write_timer_handler(struct sock *sk)
!icsk->icsk_pending)
return;
if (time_after(icsk_timeout(icsk), jiffies)) {
sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
icsk_timeout(icsk));
if (time_after(tcp_timeout_expires(sk), jiffies)) {
sk_reset_timer(sk, &sk->tcp_retransmit_timer,
tcp_timeout_expires(sk));
return;
}
tcp_mstamp_refresh(tcp_sk(sk));
@@ -725,12 +725,10 @@ void tcp_write_timer_handler(struct sock *sk)
static void tcp_write_timer(struct timer_list *t)
{
struct inet_connection_sock *icsk =
timer_container_of(icsk, t, icsk_retransmit_timer);
struct sock *sk = &icsk->icsk_inet.sk;
struct sock *sk = timer_container_of(sk, t, tcp_retransmit_timer);
/* Avoid locking the socket when there is no pending event. */
if (!smp_load_acquire(&icsk->icsk_pending))
if (!smp_load_acquire(&inet_csk(sk)->icsk_pending))
goto out;
bh_lock_sock(sk);
@@ -755,12 +753,12 @@ void tcp_syn_ack_timeout(const struct request_sock *req)
void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
sk_reset_timer(sk, &inet_csk(sk)->icsk_keepalive_timer, jiffies + len);
}
static void tcp_delete_keepalive_timer(struct sock *sk)
{
sk_stop_timer(sk, &sk->sk_timer);
sk_stop_timer(sk, &inet_csk(sk)->icsk_keepalive_timer);
}
void tcp_set_keepalive(struct sock *sk, int val)
@@ -777,8 +775,9 @@ EXPORT_IPV6_MOD_GPL(tcp_set_keepalive);
static void tcp_keepalive_timer(struct timer_list *t)
{
struct sock *sk = timer_container_of(sk, t, sk_timer);
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_connection_sock *icsk =
timer_container_of(icsk, t, icsk_keepalive_timer);
struct sock *sk = &icsk->icsk_inet.sk;
struct tcp_sock *tp = tcp_sk(sk);
u32 elapsed;


@@ -2163,13 +2163,13 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk_timeout(icsk);
timer_expires = tcp_timeout_expires(sp);
} else if (icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
timer_expires = icsk_timeout(icsk);
} else if (timer_pending(&sp->sk_timer)) {
timer_expires = tcp_timeout_expires(sp);
} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
timer_active = 2;
timer_expires = sp->sk_timer.expires;
timer_expires = icsk->icsk_keepalive_timer.expires;
} else {
timer_active = 0;
timer_expires = jiffies;


@@ -411,9 +411,7 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
static void mptcp_stop_rtx_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
sk_stop_timer(sk, &sk->mptcp_retransmit_timer);
mptcp_sk(sk)->timer_ival = 0;
}
@@ -519,7 +517,7 @@ static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subfl
const struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
return inet_csk(ssk)->icsk_pending && !subflow->stale_count ?
icsk_timeout(inet_csk(ssk)) - jiffies : 0;
tcp_timeout_expires(ssk) - jiffies : 0;
}
static void mptcp_set_timeout(struct sock *sk)
@@ -954,12 +952,11 @@ static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list
static bool mptcp_rtx_timer_pending(struct sock *sk)
{
return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
return timer_pending(&sk->mptcp_retransmit_timer);
}
static void mptcp_reset_rtx_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned long tout;
/* prevent rescheduling on close */
@@ -967,7 +964,7 @@ static void mptcp_reset_rtx_timer(struct sock *sk)
return;
tout = mptcp_sk(sk)->timer_ival;
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
sk_reset_timer(sk, &sk->mptcp_retransmit_timer, jiffies + tout);
}
bool mptcp_schedule_work(struct sock *sk)
@@ -2354,9 +2351,7 @@ out_err:
static void mptcp_retransmit_timer(struct timer_list *t)
{
struct inet_connection_sock *icsk = timer_container_of(icsk, t,
icsk_retransmit_timer);
struct sock *sk = &icsk->icsk_inet.sk;
struct sock *sk = timer_container_of(sk, t, mptcp_retransmit_timer);
struct mptcp_sock *msk = mptcp_sk(sk);
bh_lock_sock(sk);
@@ -2374,7 +2369,9 @@ static void mptcp_retransmit_timer(struct timer_list *t)
static void mptcp_tout_timer(struct timer_list *t)
{
struct sock *sk = timer_container_of(sk, t, sk_timer);
struct inet_connection_sock *icsk =
timer_container_of(icsk, t, mptcp_tout_timer);
struct sock *sk = &icsk->icsk_inet.sk;
mptcp_schedule_work(sk);
sock_put(sk);
@@ -2828,7 +2825,7 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
*/
timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout;
sk_reset_timer(sk, &sk->sk_timer, timeout);
sk_reset_timer(sk, &inet_csk(sk)->mptcp_tout_timer, timeout);
}
static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
@@ -2973,8 +2970,8 @@ static void __mptcp_init_sock(struct sock *sk)
spin_lock_init(&msk->fallback_lock);
/* re-use the csk retrans timer for MPTCP-level retrans */
timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
timer_setup(&sk->sk_timer, mptcp_tout_timer, 0);
timer_setup(&sk->mptcp_retransmit_timer, mptcp_retransmit_timer, 0);
timer_setup(&msk->sk.mptcp_tout_timer, mptcp_tout_timer, 0);
}
static void mptcp_ca_reset(struct sock *sk)
@@ -3176,7 +3173,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
might_sleep();
mptcp_stop_rtx_timer(sk);
sk_stop_timer(sk, &sk->sk_timer);
sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer);
msk->pm.status = 0;
mptcp_release_sched(msk);


@@ -892,7 +892,7 @@ static inline void mptcp_stop_tout_timer(struct sock *sk)
if (!inet_csk(sk)->icsk_mtup.probe_timestamp)
return;
sk_stop_timer(sk, &sk->sk_timer);
sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer);
inet_csk(sk)->icsk_mtup.probe_timestamp = 0;
}


@@ -99,13 +99,13 @@ static int dump_tcp_sock(struct seq_file *seq, struct tcp_sock *tp,
icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_retransmit_timer.expires;
timer_expires = sp->tcp_retransmit_timer.expires;
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
timer_expires = icsk->icsk_retransmit_timer.expires;
} else if (timer_pending(&sp->sk_timer)) {
timer_expires = sp->tcp_retransmit_timer.expires;
} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
timer_active = 2;
timer_expires = sp->sk_timer.expires;
timer_expires = icsk->icsk_keepalive_timer.expires;
} else {
timer_active = 0;
timer_expires = bpf_jiffies64();


@@ -99,13 +99,13 @@ static int dump_tcp6_sock(struct seq_file *seq, struct tcp6_sock *tp,
icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_retransmit_timer.expires;
timer_expires = sp->tcp_retransmit_timer.expires;
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
timer_expires = icsk->icsk_retransmit_timer.expires;
} else if (timer_pending(&sp->sk_timer)) {
timer_expires = sp->tcp_retransmit_timer.expires;
} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
timer_active = 2;
timer_expires = sp->sk_timer.expires;
timer_expires = icsk->icsk_keepalive_timer.expires;
} else {
timer_active = 0;
timer_expires = bpf_jiffies64();