mirror of
https://github.com/torvalds/linux.git
synced 2025-12-07 20:06:24 +00:00
net: Allow opt-out from global protocol memory accounting.
Some protocols (e.g., TCP, UDP) implement memory accounting for socket buffers and charge memory to per-protocol global counters pointed to by sk->sk_prot->memory_allocated.

Sometimes, system processes do not want that limitation. For a similar purpose, there is SO_RESERVE_MEM for sockets under memcg.

Also, by opting out of the per-protocol accounting, sockets under memcg can avoid paying costs for two orthogonal memory accounting mechanisms. A microbenchmark result is in the subsequent bpf patch.

Let's allow opt-out from the per-protocol memory accounting if sk->sk_bypass_prot_mem is true.

sk->sk_bypass_prot_mem and sk->sk_prot are placed in the same cache line, and sk_has_account() always fetches sk->sk_prot before accessing sk->sk_bypass_prot_mem, so there is no extra cache miss for this patch.

The following patches will set sk->sk_bypass_prot_mem to true, and then, the per-protocol memory accounting will be skipped.

Note that this does NOT disable memcg, but rather the per-protocol one.

Another option, which would avoid using the hole in struct sock_common, is to create sk_prot variants like tcp_prot_bypass, but this would complicate SOCKMAP logic, tcp_bpf_prots, etc.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-3-kuniyu@google.com
This commit is contained in:
committed by
Martin KaFai Lau
parent
4a997d49d9
commit
7c268eaeec
@@ -35,6 +35,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
|
||||
mem_cgroup_sk_under_memory_pressure(sk))
|
||||
return true;
|
||||
|
||||
if (sk->sk_bypass_prot_mem)
|
||||
return false;
|
||||
|
||||
return !!READ_ONCE(*sk->sk_prot->memory_pressure);
|
||||
}
|
||||
|
||||
|
||||
@@ -118,6 +118,7 @@ typedef __u64 __bitwise __addrpair;
|
||||
* @skc_reuseport: %SO_REUSEPORT setting
|
||||
* @skc_ipv6only: socket is IPV6 only
|
||||
* @skc_net_refcnt: socket is using net ref counting
|
||||
* @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb
|
||||
* @skc_bound_dev_if: bound device index if != 0
|
||||
* @skc_bind_node: bind hash linkage for various protocol lookup tables
|
||||
* @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
|
||||
@@ -174,6 +175,7 @@ struct sock_common {
|
||||
unsigned char skc_reuseport:1;
|
||||
unsigned char skc_ipv6only:1;
|
||||
unsigned char skc_net_refcnt:1;
|
||||
unsigned char skc_bypass_prot_mem:1;
|
||||
int skc_bound_dev_if;
|
||||
union {
|
||||
struct hlist_node skc_bind_node;
|
||||
@@ -381,6 +383,7 @@ struct sock {
|
||||
#define sk_reuseport __sk_common.skc_reuseport
|
||||
#define sk_ipv6only __sk_common.skc_ipv6only
|
||||
#define sk_net_refcnt __sk_common.skc_net_refcnt
|
||||
#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem
|
||||
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
|
||||
#define sk_bind_node __sk_common.skc_bind_node
|
||||
#define sk_prot __sk_common.skc_prot
|
||||
|
||||
@@ -303,6 +303,9 @@ static inline bool tcp_under_memory_pressure(const struct sock *sk)
|
||||
mem_cgroup_sk_under_memory_pressure(sk))
|
||||
return true;
|
||||
|
||||
if (sk->sk_bypass_prot_mem)
|
||||
return false;
|
||||
|
||||
return READ_ONCE(tcp_memory_pressure);
|
||||
}
|
||||
/*
|
||||
|
||||
@@ -1046,9 +1046,13 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
|
||||
if (!charged)
|
||||
return -ENOMEM;
|
||||
|
||||
if (sk->sk_bypass_prot_mem)
|
||||
goto success;
|
||||
|
||||
/* pre-charge to forward_alloc */
|
||||
sk_memory_allocated_add(sk, pages);
|
||||
allocated = sk_memory_allocated(sk);
|
||||
|
||||
/* If the system goes into memory pressure with this
|
||||
* precharge, give up and return error.
|
||||
*/
|
||||
@@ -1057,6 +1061,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
|
||||
mem_cgroup_sk_uncharge(sk, pages);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
success:
|
||||
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
|
||||
|
||||
WRITE_ONCE(sk->sk_reserved_mem,
|
||||
@@ -3145,8 +3151,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
|
||||
if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
|
||||
return true;
|
||||
|
||||
sk_enter_memory_pressure(sk);
|
||||
if (!sk->sk_bypass_prot_mem)
|
||||
sk_enter_memory_pressure(sk);
|
||||
|
||||
sk_stream_moderate_sndbuf(sk);
|
||||
|
||||
return false;
|
||||
}
|
||||
EXPORT_SYMBOL(sk_page_frag_refill);
|
||||
@@ -3263,10 +3272,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
|
||||
{
|
||||
bool memcg_enabled = false, charged = false;
|
||||
struct proto *prot = sk->sk_prot;
|
||||
long allocated;
|
||||
long allocated = 0;
|
||||
|
||||
sk_memory_allocated_add(sk, amt);
|
||||
allocated = sk_memory_allocated(sk);
|
||||
if (!sk->sk_bypass_prot_mem) {
|
||||
sk_memory_allocated_add(sk, amt);
|
||||
allocated = sk_memory_allocated(sk);
|
||||
}
|
||||
|
||||
if (mem_cgroup_sk_enabled(sk)) {
|
||||
memcg_enabled = true;
|
||||
@@ -3275,6 +3286,9 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
|
||||
goto suppress_allocation;
|
||||
}
|
||||
|
||||
if (!allocated)
|
||||
return 1;
|
||||
|
||||
/* Under limit. */
|
||||
if (allocated <= sk_prot_mem_limits(sk, 0)) {
|
||||
sk_leave_memory_pressure(sk);
|
||||
@@ -3353,7 +3367,8 @@ suppress_allocation:
|
||||
|
||||
trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
|
||||
|
||||
sk_memory_allocated_sub(sk, amt);
|
||||
if (allocated)
|
||||
sk_memory_allocated_sub(sk, amt);
|
||||
|
||||
if (charged)
|
||||
mem_cgroup_sk_uncharge(sk, amt);
|
||||
@@ -3392,11 +3407,14 @@ EXPORT_SYMBOL(__sk_mem_schedule);
|
||||
*/
|
||||
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
|
||||
{
|
||||
sk_memory_allocated_sub(sk, amount);
|
||||
|
||||
if (mem_cgroup_sk_enabled(sk))
|
||||
mem_cgroup_sk_uncharge(sk, amount);
|
||||
|
||||
if (sk->sk_bypass_prot_mem)
|
||||
return;
|
||||
|
||||
sk_memory_allocated_sub(sk, amount);
|
||||
|
||||
if (sk_under_global_memory_pressure(sk) &&
|
||||
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
|
||||
sk_leave_memory_pressure(sk);
|
||||
|
||||
@@ -928,7 +928,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
|
||||
}
|
||||
__kfree_skb(skb);
|
||||
} else {
|
||||
sk->sk_prot->enter_memory_pressure(sk);
|
||||
if (!sk->sk_bypass_prot_mem)
|
||||
tcp_enter_memory_pressure(sk);
|
||||
sk_stream_moderate_sndbuf(sk);
|
||||
}
|
||||
return NULL;
|
||||
|
||||
@@ -3743,12 +3743,17 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
|
||||
delta = size - sk->sk_forward_alloc;
|
||||
if (delta <= 0)
|
||||
return;
|
||||
|
||||
amt = sk_mem_pages(delta);
|
||||
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
|
||||
sk_memory_allocated_add(sk, amt);
|
||||
|
||||
if (mem_cgroup_sk_enabled(sk))
|
||||
mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
|
||||
|
||||
if (sk->sk_bypass_prot_mem)
|
||||
return;
|
||||
|
||||
sk_memory_allocated_add(sk, amt);
|
||||
}
|
||||
|
||||
/* Send a FIN. The caller locks the socket for us.
|
||||
|
||||
@@ -1065,11 +1065,12 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
|
||||
mptcp_for_each_subflow(msk, subflow) {
|
||||
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
|
||||
|
||||
if (first)
|
||||
if (first && !ssk->sk_bypass_prot_mem) {
|
||||
tcp_enter_memory_pressure(ssk);
|
||||
sk_stream_moderate_sndbuf(ssk);
|
||||
first = false;
|
||||
}
|
||||
|
||||
first = false;
|
||||
sk_stream_moderate_sndbuf(ssk);
|
||||
}
|
||||
__mptcp_sync_sndbuf(sk);
|
||||
}
|
||||
|
||||
@@ -373,7 +373,8 @@ static int tls_do_allocation(struct sock *sk,
|
||||
if (!offload_ctx->open_record) {
|
||||
if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
|
||||
sk->sk_allocation))) {
|
||||
READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
|
||||
if (!sk->sk_bypass_prot_mem)
|
||||
READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
|
||||
sk_stream_moderate_sndbuf(sk);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user