mirror of
https://github.com/torvalds/linux.git
synced 2025-12-07 20:06:24 +00:00
Merge tag 'ipsec-next-2025-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next
Steffen Klassert says: ==================== pull request (net-next): ipsec-next 2025-11-18 1) Relax a lock contention bottleneck to improve IPsec crypto offload performance. From Jianbo Liu. 2) Deprecate pfkey, the interface will be removed in 2027. 3) Update xfrm documentation and move it to ipsec maintenance. From Bagas Sanjaya. * tag 'ipsec-next-2025-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next: MAINTAINERS: Add entry for XFRM documentation net: Move XFRM documentation into its own subdirectory Documentation: xfrm_sync: Number the fifth section Documentation: xfrm_sysctl: Trim trailing colon in section heading Documentation: xfrm_sync: Trim excess section heading characters Documentation: xfrm_sync: Properly reindent list text Documentation: xfrm_device: Separate hardware offload sublists Documentation: xfrm_device: Use numbered list for offloading steps Documentation: xfrm_device: Wrap iproute2 snippets in literal code block pfkey: Deprecate pfkey xfrm: Skip redundant replay recheck for the hardware offload path xfrm: Refactor xfrm_input lock to reduce contention with RSS ==================== Link: https://patch.msgid.link/20251118092610.2223552-1-steffen.klassert@secunet.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
@@ -131,10 +131,7 @@ Contents:
|
||||
vxlan
|
||||
x25
|
||||
x25-iface
|
||||
xfrm_device
|
||||
xfrm_proc
|
||||
xfrm_sync
|
||||
xfrm_sysctl
|
||||
xfrm/index
|
||||
xdp-rx-metadata
|
||||
xsk-tx-metadata
|
||||
|
||||
|
||||
13
Documentation/networking/xfrm/index.rst
Normal file
13
Documentation/networking/xfrm/index.rst
Normal file
@@ -0,0 +1,13 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==============
|
||||
XFRM Framework
|
||||
==============
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
xfrm_device
|
||||
xfrm_proc
|
||||
xfrm_sync
|
||||
xfrm_sysctl
|
||||
@@ -20,11 +20,15 @@ can radically increase throughput and decrease CPU utilization. The XFRM
|
||||
Device interface allows NIC drivers to offer to the stack access to the
|
||||
hardware offload.
|
||||
|
||||
Right now, there are two types of hardware offload that kernel supports.
|
||||
Right now, there are two types of hardware offload that kernel supports:
|
||||
|
||||
* IPsec crypto offload:
|
||||
|
||||
* NIC performs encrypt/decrypt
|
||||
* Kernel does everything else
|
||||
|
||||
* IPsec packet offload:
|
||||
|
||||
* NIC performs encrypt/decrypt
|
||||
* NIC does encapsulation
|
||||
* Kernel and NIC have SA and policy in-sync
|
||||
@@ -34,7 +38,7 @@ Right now, there are two types of hardware offload that kernel supports.
|
||||
Userland access to the offload is typically through a system such as
|
||||
libreswan or KAME/racoon, but the iproute2 'ip xfrm' command set can
|
||||
be handy when experimenting. An example command might look something
|
||||
like this for crypto offload:
|
||||
like this for crypto offload::
|
||||
|
||||
ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport \
|
||||
reqid 0x07 replay-window 32 \
|
||||
@@ -42,7 +46,7 @@ like this for crypto offload:
|
||||
sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp \
|
||||
offload dev eth4 dir in
|
||||
|
||||
and for packet offload
|
||||
and for packet offload::
|
||||
|
||||
ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport \
|
||||
reqid 0x07 replay-window 32 \
|
||||
@@ -153,26 +157,26 @@ the packet's skb. At this point the data should be decrypted but the
|
||||
IPsec headers are still in the packet data; they are removed later up
|
||||
the stack in xfrm_input().
|
||||
|
||||
find and hold the SA that was used to the Rx skb::
|
||||
1. Find and hold the SA that was used for the Rx skb::
|
||||
|
||||
get spi, protocol, and destination IP from packet headers
|
||||
/* get spi, protocol, and destination IP from packet headers */
|
||||
xs = find xs from (spi, protocol, dest_IP)
|
||||
xfrm_state_hold(xs);
|
||||
|
||||
store the state information into the skb::
|
||||
2. Store the state information into the skb::
|
||||
|
||||
sp = secpath_set(skb);
|
||||
if (!sp) return;
|
||||
sp->xvec[sp->len++] = xs;
|
||||
sp->olen++;
|
||||
|
||||
indicate the success and/or error status of the offload::
|
||||
3. Indicate the success and/or error status of the offload::
|
||||
|
||||
xo = xfrm_offload(skb);
|
||||
xo->flags = CRYPTO_DONE;
|
||||
xo->status = crypto_status;
|
||||
|
||||
hand the packet to napi_gro_receive() as usual
|
||||
4. Hand the packet to napi_gro_receive() as usual.
|
||||
|
||||
In ESN mode, xdo_dev_state_advance_esn() is called from
|
||||
xfrm_replay_advance_esn() for RX, and xfrm_replay_overflow_offload_esn() for TX.
|
||||
@@ -1,8 +1,8 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
====
|
||||
XFRM
|
||||
====
|
||||
=========
|
||||
XFRM sync
|
||||
=========
|
||||
|
||||
The sync patches work is based on initial patches from
|
||||
Krisztian <hidden@balabit.hu> and others and additional patches
|
||||
@@ -36,7 +36,7 @@ is not driven by packet arrival.
|
||||
- the replay sequence for both inbound and outbound
|
||||
|
||||
1) Message Structure
|
||||
----------------------
|
||||
--------------------
|
||||
|
||||
nlmsghdr:aevent_id:optional-TLVs.
|
||||
|
||||
@@ -83,31 +83,31 @@ when going from kernel to user space)
|
||||
A program needs to subscribe to multicast group XFRMNLGRP_AEVENTS
|
||||
to get notified of these events.
|
||||
|
||||
2) TLVS reflect the different parameters:
|
||||
-----------------------------------------
|
||||
2) TLVs reflect the different parameters
|
||||
----------------------------------------
|
||||
|
||||
a) byte value (XFRMA_LTIME_VAL)
|
||||
|
||||
This TLV carries the running/current counter for byte lifetime since
|
||||
last event.
|
||||
This TLV carries the running/current counter for byte lifetime since
|
||||
last event.
|
||||
|
||||
b)replay value (XFRMA_REPLAY_VAL)
|
||||
b) replay value (XFRMA_REPLAY_VAL)
|
||||
|
||||
This TLV carries the running/current counter for replay sequence since
|
||||
last event.
|
||||
This TLV carries the running/current counter for replay sequence since
|
||||
last event.
|
||||
|
||||
c)replay threshold (XFRMA_REPLAY_THRESH)
|
||||
c) replay threshold (XFRMA_REPLAY_THRESH)
|
||||
|
||||
This TLV carries the threshold being used by the kernel to trigger events
|
||||
when the replay sequence is exceeded.
|
||||
This TLV carries the threshold being used by the kernel to trigger events
|
||||
when the replay sequence is exceeded.
|
||||
|
||||
d) expiry timer (XFRMA_ETIMER_THRESH)
|
||||
|
||||
This is a timer value in milliseconds which is used as the nagle
|
||||
value to rate limit the events.
|
||||
This is a timer value in milliseconds which is used as the nagle
|
||||
value to rate limit the events.
|
||||
|
||||
3) Default configurations for the parameters:
|
||||
---------------------------------------------
|
||||
3) Default configurations for the parameters
|
||||
--------------------------------------------
|
||||
|
||||
By default these events should be turned off unless there is
|
||||
at least one listener registered to listen to the multicast
|
||||
@@ -121,12 +121,14 @@ in case they are not specified.
|
||||
the two sysctls/proc entries are:
|
||||
|
||||
a) /proc/sys/net/core/sysctl_xfrm_aevent_etime
|
||||
used to provide default values for the XFRMA_ETIMER_THRESH in incremental
|
||||
units of time of 100ms. The default is 10 (1 second)
|
||||
|
||||
Used to provide default values for the XFRMA_ETIMER_THRESH in incremental
|
||||
units of time of 100ms. The default is 10 (1 second).
|
||||
|
||||
b) /proc/sys/net/core/sysctl_xfrm_aevent_rseqth
|
||||
used to provide default values for XFRMA_REPLAY_THRESH parameter
|
||||
in incremental packet count. The default is two packets.
|
||||
|
||||
Used to provide default values for XFRMA_REPLAY_THRESH parameter
|
||||
in incremental packet count. The default is two packets.
|
||||
|
||||
4) Message types
|
||||
----------------
|
||||
@@ -134,50 +136,51 @@ in incremental packet count. The default is two packets.
|
||||
a) XFRM_MSG_GETAE issued by user-->kernel.
|
||||
XFRM_MSG_GETAE does not carry any TLVs.
|
||||
|
||||
The response is a XFRM_MSG_NEWAE which is formatted based on what
|
||||
XFRM_MSG_GETAE queried for.
|
||||
The response is a XFRM_MSG_NEWAE which is formatted based on what
|
||||
XFRM_MSG_GETAE queried for.
|
||||
|
||||
The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
|
||||
* if XFRM_AE_RTHR flag is set, then XFRMA_REPLAY_THRESH is also retrieved
|
||||
* if XFRM_AE_ETHR flag is set, then XFRMA_ETIMER_THRESH is also retrieved
|
||||
The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
|
||||
|
||||
* if XFRM_AE_RTHR flag is set, then XFRMA_REPLAY_THRESH is also retrieved
|
||||
* if XFRM_AE_ETHR flag is set, then XFRMA_ETIMER_THRESH is also retrieved
|
||||
|
||||
b) XFRM_MSG_NEWAE is issued by either user space to configure
|
||||
or kernel to announce events or respond to a XFRM_MSG_GETAE.
|
||||
|
||||
i) user --> kernel to configure a specific SA.
|
||||
i) user --> kernel to configure a specific SA.
|
||||
|
||||
any of the values or threshold parameters can be updated by passing the
|
||||
appropriate TLV.
|
||||
Any of the values or threshold parameters can be updated by passing the
|
||||
appropriate TLV.
|
||||
|
||||
A response is issued back to the sender in user space to indicate success
|
||||
or failure.
|
||||
A response is issued back to the sender in user space to indicate success
|
||||
or failure.
|
||||
|
||||
In the case of success, additionally an event with
|
||||
XFRM_MSG_NEWAE is also issued to any listeners as described in iii).
|
||||
In the case of success, additionally an event with
|
||||
XFRM_MSG_NEWAE is also issued to any listeners as described in iii).
|
||||
|
||||
ii) kernel->user direction as a response to XFRM_MSG_GETAE
|
||||
ii) kernel->user direction as a response to XFRM_MSG_GETAE
|
||||
|
||||
The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
|
||||
The response will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
|
||||
|
||||
The threshold TLVs will be included if explicitly requested in
|
||||
the XFRM_MSG_GETAE message.
|
||||
The threshold TLVs will be included if explicitly requested in
|
||||
the XFRM_MSG_GETAE message.
|
||||
|
||||
iii) kernel->user to report as event if someone sets any values or
|
||||
thresholds for an SA using XFRM_MSG_NEWAE (as described in #i above).
|
||||
In such a case XFRM_AE_CU flag is set to inform the user that
|
||||
the change happened as a result of an update.
|
||||
The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
|
||||
iii) kernel->user to report as event if someone sets any values or
|
||||
thresholds for an SA using XFRM_MSG_NEWAE (as described in #i above).
|
||||
In such a case XFRM_AE_CU flag is set to inform the user that
|
||||
the change happened as a result of an update.
|
||||
The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
|
||||
|
||||
iv) kernel->user to report event when replay threshold or a timeout
|
||||
is exceeded.
|
||||
iv) kernel->user to report event when replay threshold or a timeout
|
||||
is exceeded.
|
||||
|
||||
In such a case either XFRM_AE_CR (replay exceeded) or XFRM_AE_CE (timeout
|
||||
happened) is set to inform the user what happened.
|
||||
Note the two flags are mutually exclusive.
|
||||
The message will always have XFRMA_LTIME_VAL and XFRMA_REPLAY_VAL TLVs.
|
||||
|
||||
Exceptions to threshold settings
|
||||
--------------------------------
|
||||
5) Exceptions to threshold settings
|
||||
-----------------------------------
|
||||
|
||||
If you have an SA that is getting hit by traffic in bursts such that
|
||||
there is a period where the timer threshold expires with no packets
|
||||
@@ -4,8 +4,8 @@
|
||||
XFRM Sysctl
|
||||
============
|
||||
|
||||
/proc/sys/net/core/xfrm_* Variables:
|
||||
====================================
|
||||
/proc/sys/net/core/xfrm_* Variables
|
||||
===================================
|
||||
|
||||
xfrm_acq_expires - INTEGER
|
||||
default 30 - hard timeout in seconds for acquire requests
|
||||
@@ -18068,6 +18068,7 @@ L: netdev@vger.kernel.org
|
||||
S: Maintained
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git
|
||||
F: Documentation/networking/xfrm/
|
||||
F: include/net/xfrm.h
|
||||
F: include/uapi/linux/xfrm.h
|
||||
F: net/ipv4/ah4.c
|
||||
|
||||
@@ -3903,6 +3903,8 @@ static int __init ipsec_pfkey_init(void)
|
||||
{
|
||||
int err = proto_register(&key_proto, 0);
|
||||
|
||||
pr_warn_once("PFKEY is deprecated and scheduled to be removed in 2027, "
|
||||
"please contact the netdev mailing list\n");
|
||||
if (err != 0)
|
||||
goto out;
|
||||
|
||||
|
||||
@@ -110,14 +110,17 @@ config XFRM_IPCOMP
|
||||
select CRYPTO_DEFLATE
|
||||
|
||||
config NET_KEY
|
||||
tristate "PF_KEY sockets"
|
||||
tristate "PF_KEY sockets (deprecated)"
|
||||
select XFRM_ALGO
|
||||
help
|
||||
PF_KEYv2 socket family, compatible with KAME ones.
|
||||
They are required if you are going to use IPsec tools ported
|
||||
from KAME.
|
||||
|
||||
Say Y unless you know what you are doing.
|
||||
The PF_KEYv2 socket interface is deprecated and
|
||||
scheduled for removal. All maintained IKE daemons
|
||||
no longer need PF_KEY sockets. Please use the netlink
|
||||
interface (XFRM_USER) to configure IPsec.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config NET_KEY_MIGRATE
|
||||
bool "PF_KEY MIGRATE"
|
||||
|
||||
@@ -505,6 +505,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
|
||||
async = 1;
|
||||
dev_put(skb->dev);
|
||||
seq = XFRM_SKB_CB(skb)->seq.input.low;
|
||||
spin_lock(&x->lock);
|
||||
goto resume;
|
||||
}
|
||||
/* GRO call */
|
||||
@@ -541,9 +542,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
|
||||
XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
|
||||
goto drop;
|
||||
}
|
||||
|
||||
nexthdr = x->type_offload->input_tail(x, skb);
|
||||
}
|
||||
|
||||
goto lock;
|
||||
goto process;
|
||||
}
|
||||
|
||||
family = XFRM_SPI_SKB_CB(skb)->family;
|
||||
@@ -611,7 +614,12 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
|
||||
goto drop;
|
||||
}
|
||||
|
||||
lock:
|
||||
process:
|
||||
seq_hi = htonl(xfrm_replay_seqhi(x, seq));
|
||||
|
||||
XFRM_SKB_CB(skb)->seq.input.low = seq;
|
||||
XFRM_SKB_CB(skb)->seq.input.hi = seq_hi;
|
||||
|
||||
spin_lock(&x->lock);
|
||||
|
||||
if (unlikely(x->km.state != XFRM_STATE_VALID)) {
|
||||
@@ -638,21 +646,13 @@ lock:
|
||||
goto drop_unlock;
|
||||
}
|
||||
|
||||
spin_unlock(&x->lock);
|
||||
|
||||
if (xfrm_tunnel_check(skb, x, family)) {
|
||||
XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
|
||||
goto drop;
|
||||
goto drop_unlock;
|
||||
}
|
||||
|
||||
seq_hi = htonl(xfrm_replay_seqhi(x, seq));
|
||||
|
||||
XFRM_SKB_CB(skb)->seq.input.low = seq;
|
||||
XFRM_SKB_CB(skb)->seq.input.hi = seq_hi;
|
||||
|
||||
if (crypto_done) {
|
||||
nexthdr = x->type_offload->input_tail(x, skb);
|
||||
} else {
|
||||
if (!crypto_done) {
|
||||
spin_unlock(&x->lock);
|
||||
dev_hold(skb->dev);
|
||||
|
||||
nexthdr = x->type->input(x, skb);
|
||||
@@ -660,9 +660,9 @@ lock:
|
||||
return 0;
|
||||
|
||||
dev_put(skb->dev);
|
||||
spin_lock(&x->lock);
|
||||
}
|
||||
resume:
|
||||
spin_lock(&x->lock);
|
||||
if (nexthdr < 0) {
|
||||
if (nexthdr == -EBADMSG) {
|
||||
xfrm_audit_state_icvfail(x, skb,
|
||||
@@ -676,7 +676,7 @@ resume:
|
||||
/* only the first xfrm gets the encap type */
|
||||
encap_type = 0;
|
||||
|
||||
if (xfrm_replay_recheck(x, skb, seq)) {
|
||||
if (!crypto_done && xfrm_replay_recheck(x, skb, seq)) {
|
||||
XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR);
|
||||
goto drop_unlock;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user