Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Martin KaFai Lau says:

====================
pull-request: bpf-next 2025-07-17

We've added 13 non-merge commits during the last 20 day(s) which contain
a total of 4 files changed, 712 insertions(+), 84 deletions(-).

The main changes are:

1) Avoid skipping or repeating a sk when using a TCP bpf_iter,
   from Jordan Rife.

2) Clarify the driver requirement on using the XDP metadata,
   from Song Yoong Siang.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
  doc: xdp: Clarify driver implementation for XDP Rx metadata
  selftests/bpf: Add tests for bucket resume logic in established sockets
  selftests/bpf: Create iter_tcp_destroy test program
  selftests/bpf: Create established sockets in socket iterator tests
  selftests/bpf: Make ehash buckets configurable in socket iterator tests
  selftests/bpf: Allow for iteration over multiple states
  selftests/bpf: Allow for iteration over multiple ports
  selftests/bpf: Add tests for bucket resume logic in listening sockets
  bpf: tcp: Avoid socket skips and repeats during iteration
  bpf: tcp: Use bpf_tcp_iter_batch_item for bpf_tcp_iter_state batch items
  bpf: tcp: Get rid of st_bucket_done
  bpf: tcp: Make sure iter->batch always contains a full bucket snapshot
  bpf: tcp: Make mem flags configurable through bpf_iter_tcp_realloc_batch
====================

Link: https://patch.msgid.link/20250717191731.4142326-1-martin.lau@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
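For readers without the series context: the change below is entirely inside the kernel's TCP seq_file/bpf_iter plumbing, while the BPF program side of a TCP socket iterator is unchanged. A minimal sketch of such a program, written in the style of the bpf_iter selftests (the program name, section name, and printed fields here are illustrative, not part of this series), looks roughly like this:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical minimal TCP socket iterator, modeled on
 * tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp(struct bpf_iter__tcp *ctx)
{
	struct sock_common *sk_common = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;

	if (!sk_common)
		return 0;

	/* Print the local port of every TCP socket the kernel hands us. */
	BPF_SEQ_PRINTF(seq, "sk=%pK lport=%u\n", sk_common, sk_common->skc_num);
	return 0;
}

Each read() on the iterator replays this program over the TCP sockets; the patches below change how the kernel batches and resumes that walk so a socket is neither skipped nor shown twice when buckets change between reads.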
@@ -58,6 +58,7 @@
 #include <linux/times.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/sock_diag.h>
 
+#include <net/aligned_data.h>
 #include <net/net_namespace.h>
@@ -3014,13 +3015,17 @@ out:
 }
 
 #ifdef CONFIG_BPF_SYSCALL
+union bpf_tcp_iter_batch_item {
+	struct sock *sk;
+	__u64 cookie;
+};
+
 struct bpf_tcp_iter_state {
 	struct tcp_iter_state state;
 	unsigned int cur_sk;
 	unsigned int end_sk;
 	unsigned int max_sk;
-	struct sock **batch;
-	bool st_bucket_done;
+	union bpf_tcp_iter_batch_item *batch;
 };
 
 struct bpf_iter__tcp {
@@ -3043,21 +3048,32 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
 
 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
 {
-	while (iter->cur_sk < iter->end_sk)
-		sock_gen_put(iter->batch[iter->cur_sk++]);
+	union bpf_tcp_iter_batch_item *item;
+	unsigned int cur_sk = iter->cur_sk;
+	__u64 cookie;
+
+	/* Remember the cookies of the sockets we haven't seen yet, so we can
+	 * pick up where we left off next time around.
+	 */
+	while (cur_sk < iter->end_sk) {
+		item = &iter->batch[cur_sk++];
+		cookie = sock_gen_cookie(item->sk);
+		sock_gen_put(item->sk);
+		item->cookie = cookie;
+	}
 }
 
 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
-				      unsigned int new_batch_sz)
+				      unsigned int new_batch_sz, gfp_t flags)
 {
-	struct sock **new_batch;
+	union bpf_tcp_iter_batch_item *new_batch;
 
 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
-			     GFP_USER | __GFP_NOWARN);
+			     flags | __GFP_NOWARN);
 	if (!new_batch)
 		return -ENOMEM;
 
-	bpf_iter_tcp_put_batch(iter);
+	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
 	kvfree(iter->batch);
 	iter->batch = new_batch;
 	iter->max_sk = new_batch_sz;
@@ -3065,112 +3081,234 @@ static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
 	return 0;
 }
 
-static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
-						 struct sock *start_sk)
+static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
+					       union bpf_tcp_iter_batch_item *cookies,
+					       int n_cookies)
 {
+	struct hlist_nulls_node *node;
+	struct sock *sk;
+	int i;
+
+	for (i = 0; i < n_cookies; i++) {
+		sk = first_sk;
+		sk_nulls_for_each_from(sk, node)
+			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+				return sk;
+	}
+
+	return NULL;
+}
+
+static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
+{
 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct bpf_tcp_iter_state *iter = seq->private;
 	struct tcp_iter_state *st = &iter->state;
+	unsigned int find_cookie = iter->cur_sk;
+	unsigned int end_cookie = iter->end_sk;
+	int resume_bucket = st->bucket;
+	struct sock *sk;
+
+	if (end_cookie && find_cookie == end_cookie)
+		++st->bucket;
+
+	sk = listening_get_first(seq);
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+
+	if (sk && st->bucket == resume_bucket && end_cookie) {
+		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+						end_cookie - find_cookie);
+		if (!sk) {
+			spin_unlock(&hinfo->lhash2[st->bucket].lock);
+			++st->bucket;
+			sk = listening_get_first(seq);
+		}
+	}
+
+	return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
+{
+	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+	struct bpf_tcp_iter_state *iter = seq->private;
+	struct tcp_iter_state *st = &iter->state;
+	unsigned int find_cookie = iter->cur_sk;
+	unsigned int end_cookie = iter->end_sk;
+	int resume_bucket = st->bucket;
+	struct sock *sk;
+
+	if (end_cookie && find_cookie == end_cookie)
+		++st->bucket;
+
+	sk = established_get_first(seq);
+	iter->cur_sk = 0;
+	iter->end_sk = 0;
+
+	if (sk && st->bucket == resume_bucket && end_cookie) {
+		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+						end_cookie - find_cookie);
+		if (!sk) {
+			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+			++st->bucket;
+			sk = established_get_first(seq);
+		}
+	}
+
+	return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
+{
+	struct bpf_tcp_iter_state *iter = seq->private;
+	struct tcp_iter_state *st = &iter->state;
+	struct sock *sk = NULL;
+
+	switch (st->state) {
+	case TCP_SEQ_STATE_LISTENING:
+		sk = bpf_iter_tcp_resume_listening(seq);
+		if (sk)
+			break;
+		st->bucket = 0;
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+		fallthrough;
+	case TCP_SEQ_STATE_ESTABLISHED:
+		sk = bpf_iter_tcp_resume_established(seq);
+		break;
+	}
+
+	return sk;
+}
+
+static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
+						 struct sock **start_sk)
+{
+	struct bpf_tcp_iter_state *iter = seq->private;
 	struct hlist_nulls_node *node;
 	unsigned int expected = 1;
 	struct sock *sk;
 
-	sock_hold(start_sk);
-	iter->batch[iter->end_sk++] = start_sk;
+	sock_hold(*start_sk);
+	iter->batch[iter->end_sk++].sk = *start_sk;
 
-	sk = sk_nulls_next(start_sk);
+	sk = sk_nulls_next(*start_sk);
+	*start_sk = NULL;
 	sk_nulls_for_each_from(sk, node) {
 		if (seq_sk_match(seq, sk)) {
 			if (iter->end_sk < iter->max_sk) {
 				sock_hold(sk);
-				iter->batch[iter->end_sk++] = sk;
+				iter->batch[iter->end_sk++].sk = sk;
+			} else if (!*start_sk) {
+				/* Remember where we left off. */
+				*start_sk = sk;
 			}
 			expected++;
 		}
 	}
-	spin_unlock(&hinfo->lhash2[st->bucket].lock);
 
 	return expected;
 }
 
 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
-						   struct sock *start_sk)
+						   struct sock **start_sk)
 {
-	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct bpf_tcp_iter_state *iter = seq->private;
-	struct tcp_iter_state *st = &iter->state;
 	struct hlist_nulls_node *node;
 	unsigned int expected = 1;
 	struct sock *sk;
 
-	sock_hold(start_sk);
-	iter->batch[iter->end_sk++] = start_sk;
+	sock_hold(*start_sk);
+	iter->batch[iter->end_sk++].sk = *start_sk;
 
-	sk = sk_nulls_next(start_sk);
+	sk = sk_nulls_next(*start_sk);
+	*start_sk = NULL;
 	sk_nulls_for_each_from(sk, node) {
 		if (seq_sk_match(seq, sk)) {
 			if (iter->end_sk < iter->max_sk) {
 				sock_hold(sk);
-				iter->batch[iter->end_sk++] = sk;
+				iter->batch[iter->end_sk++].sk = sk;
+			} else if (!*start_sk) {
+				/* Remember where we left off. */
+				*start_sk = sk;
 			}
 			expected++;
 		}
 	}
-	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
 
 	return expected;
 }
 
-static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
+static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
+					struct sock **start_sk)
 {
+	struct bpf_tcp_iter_state *iter = seq->private;
+	struct tcp_iter_state *st = &iter->state;
+
+	if (st->state == TCP_SEQ_STATE_LISTENING)
+		return bpf_iter_tcp_listening_batch(seq, start_sk);
+	else
+		return bpf_iter_tcp_established_batch(seq, start_sk);
+}
+
+static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
+{
 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct bpf_tcp_iter_state *iter = seq->private;
 	struct tcp_iter_state *st = &iter->state;
+
+	if (st->state == TCP_SEQ_STATE_LISTENING)
+		spin_unlock(&hinfo->lhash2[st->bucket].lock);
+	else
+		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+}
+
+static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
+{
+	struct bpf_tcp_iter_state *iter = seq->private;
 	unsigned int expected;
-	bool resized = false;
 	struct sock *sk;
+	int err;
 
-	/* The st->bucket is done. Directly advance to the next
-	 * bucket instead of having the tcp_seek_last_pos() to skip
-	 * one by one in the current bucket and eventually find out
-	 * it has to advance to the next bucket.
-	 */
-	if (iter->st_bucket_done) {
-		st->offset = 0;
-		st->bucket++;
-		if (st->state == TCP_SEQ_STATE_LISTENING &&
-		    st->bucket > hinfo->lhash2_mask) {
-			st->state = TCP_SEQ_STATE_ESTABLISHED;
-			st->bucket = 0;
-		}
-	}
-
-again:
-	/* Get a new batch */
-	iter->cur_sk = 0;
-	iter->end_sk = 0;
-	iter->st_bucket_done = false;
-
-	sk = tcp_seek_last_pos(seq);
+	sk = bpf_iter_tcp_resume(seq);
 	if (!sk)
 		return NULL; /* Done */
 
-	if (st->state == TCP_SEQ_STATE_LISTENING)
-		expected = bpf_iter_tcp_listening_batch(seq, sk);
-	else
-		expected = bpf_iter_tcp_established_batch(seq, sk);
+	expected = bpf_iter_fill_batch(seq, &sk);
+	if (likely(iter->end_sk == expected))
+		goto done;
 
-	if (iter->end_sk == expected) {
-		iter->st_bucket_done = true;
-		return sk;
+	/* Batch size was too small. */
+	bpf_iter_tcp_unlock_bucket(seq);
+	bpf_iter_tcp_put_batch(iter);
+	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
+					 GFP_USER);
+	if (err)
+		return ERR_PTR(err);
+
+	sk = bpf_iter_tcp_resume(seq);
+	if (!sk)
+		return NULL; /* Done */
+
+	expected = bpf_iter_fill_batch(seq, &sk);
+	if (likely(iter->end_sk == expected))
+		goto done;
+
+	/* Batch size was still too small. Hold onto the lock while we try
+	 * again with a larger batch to make sure the current bucket's size
+	 * does not change in the meantime.
+	 */
+	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
+	if (err) {
+		bpf_iter_tcp_unlock_bucket(seq);
+		return ERR_PTR(err);
 	}
 
-	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
-		resized = true;
-		goto again;
-	}
-
-	return sk;
+	expected = bpf_iter_fill_batch(seq, &sk);
+	WARN_ON_ONCE(iter->end_sk != expected);
+done:
+	bpf_iter_tcp_unlock_bucket(seq);
+	return iter->batch[0].sk;
 }
 
 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
@@ -3200,16 +3338,11 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		 * meta.seq_num is used instead.
 		 */
 		st->num++;
-		/* Move st->offset to the next sk in the bucket such that
-		 * the future start() will resume at st->offset in
-		 * st->bucket. See tcp_seek_last_pos().
-		 */
-		st->offset++;
-		sock_gen_put(iter->batch[iter->cur_sk++]);
+		sock_gen_put(iter->batch[iter->cur_sk++].sk);
 	}
 
 	if (iter->cur_sk < iter->end_sk)
-		sk = iter->batch[iter->cur_sk];
+		sk = iter->batch[iter->cur_sk].sk;
 	else
 		sk = bpf_iter_tcp_batch(seq);
 
@@ -3275,10 +3408,8 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
 		(void)tcp_prog_seq_show(prog, &meta, v, 0);
 	}
 
-	if (iter->cur_sk < iter->end_sk) {
+	if (iter->cur_sk < iter->end_sk)
 		bpf_iter_tcp_put_batch(iter);
-		iter->st_bucket_done = false;
-	}
 }
 
 static const struct seq_operations bpf_iter_tcp_seq_ops = {
@@ -3596,7 +3727,7 @@ static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
 	if (err)
 		return err;
 
-	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
+	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
 	if (err) {
 		bpf_iter_fini_seq_net(priv_data);
 		return err;
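For completeness, this is roughly how such an iterator is driven from user space with libbpf; the object and program names ("tcp_iter.bpf.o", "dump_tcp") are carried over from the sketch near the top and are assumptions, not something defined by this pull request:

/* Hypothetical libbpf loader for the iterator sketch above. Error
 * handling is trimmed to the minimum; a real tool would check and
 * report every call.
 */
#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;
	char buf[4096];
	ssize_t n;
	int iter_fd;

	obj = bpf_object__open_file("tcp_iter.bpf.o", NULL);
	if (!obj || bpf_object__load(obj))
		return 1;

	prog = bpf_object__find_program_by_name(obj, "dump_tcp");
	if (!prog)
		return 1;

	link = bpf_program__attach_iter(prog, NULL);
	if (!link)
		return 1;

	/* Each read() walks the TCP hash tables; the kernel fills one
	 * bucket-sized batch at a time and, with the changes above,
	 * resumes from per-socket cookies instead of a bucket offset.
	 */
	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0)
		return 1;

	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	close(iter_fd);
	bpf_link__destroy(link);
	bpf_object__close(obj);
	return 0;
}

Each read() drains one batch; with this series the batch always covers a full bucket, and sockets not yet shown are remembered by cookie so the next read resumes exactly where the previous one stopped.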