Merge tag 'bpf-next-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Pull bpf updates from Alexei Starovoitov:

 - Convert selftests/bpf/test_tc_edt and test_tc_tunnel from .sh to
   test_progs runner (Alexis Lothoré)

 - Convert selftests/bpf/test_xsk to test_progs runner (Bastien
   Curutchet)

 - Replace bpf memory allocator with kmalloc_nolock() in
   bpf_local_storage (Amery Hung), and in bpf streams and range tree
   (Puranjay Mohan)

 - Introduce support for indirect jumps in BPF verifier and x86 JIT
   (Anton Protopopov) and arm64 JIT (Puranjay Mohan)

 - Remove runqslower bpf tool (Hoyeon Lee)

 - Fix corner cases in the verifier to close several syzbot reports
   (Eduard Zingerman, KaFai Wan)

 - Several improvements in deadlock detection in rqspinlock (Kumar
   Kartikeya Dwivedi)

 - Implement "jmp" mode for BPF trampolines and the corresponding
   DYNAMIC_FTRACE_WITH_JMP ftrace support. It improves "fexit" program
   invocation throughput from 80M/s to 136M/s. With Steven Rostedt's
   Ack. (Menglong Dong)

 - Add ability to test non-linear skbs in BPF_PROG_TEST_RUN (Paul
   Chaignon)

 - Do not let BPF_PROG_TEST_RUN emit invalid GSO types into the
   networking stack (Daniel Borkmann)

 - Generalize the buildid file reader for reuse by the new file-backed
   bpf_dynptr (Mykyta Yatsenko)

 - Optimize bpf_map_update_elem() for map-in-map types (Ritesh
   Oedayrajsingh Varma)

 - Introduce overwrite mode for BPF ring buffer (Xu Kuohai)

* tag 'bpf-next-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (169 commits)
  bpf: optimize bpf_map_update_elem() for map-in-map types
  bpf: make kprobe_multi_link_prog_run always_inline
  selftests/bpf: do not hardcode target rate in test_tc_edt BPF program
  selftests/bpf: remove test_tc_edt.sh
  selftests/bpf: integrate test_tc_edt into test_progs
  selftests/bpf: rename test_tc_edt.bpf.c section to expose program type
  selftests/bpf: Add success stats to rqspinlock stress test
  rqspinlock: Precede non-head waiter queueing with AA check
  rqspinlock: Disable spinning for trylock fallback
  rqspinlock: Use trylock fallback when per-CPU rqnode is busy
  rqspinlock: Perform AA checks immediately
  rqspinlock: Enclose lock/unlock within lock entry acquisitions
  bpf: Remove runqslower tool
  selftests/bpf: Remove usage of lsm/file_alloc_security in selftest
  bpf: Disable file_alloc_security hook
  bpf: check for insn arrays in check_ptr_alignment
  bpf: force BPF_F_RDONLY_PROG on insn array creation
  bpf: Fix exclusive map memory leak
  selftests/bpf: Make CS length configurable for rqspinlock stress test
  selftests/bpf: Add lock wait time stats to rqspinlock stress test
  ...
Committed by Linus Torvalds on 2025-12-03 16:54:54 -08:00
157 changed files with 10859 additions and 5005 deletions


@@ -100,10 +100,26 @@ described in more detail in the footnotes.
| | | ``uretprobe.s+`` [#uprobe]_ | Yes |
+ + +----------------------------------+-----------+
| | | ``usdt+`` [#usdt]_ | |
+ + +----------------------------------+-----------+
| | | ``usdt.s+`` [#usdt]_ | Yes |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_KPROBE_MULTI`` | ``kprobe.multi+`` [#kpmulti]_ | |
+ + +----------------------------------+-----------+
| | | ``kretprobe.multi+`` [#kpmulti]_ | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_KPROBE_SESSION`` | ``kprobe.session+`` [#kpmulti]_ | |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_UPROBE_MULTI`` | ``uprobe.multi+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uprobe.multi.s+`` [#upmul]_ | Yes |
+ + +----------------------------------+-----------+
| | | ``uretprobe.multi+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uretprobe.multi.s+`` [#upmul]_ | Yes |
+ +----------------------------------------+----------------------------------+-----------+
| | ``BPF_TRACE_UPROBE_SESSION`` | ``uprobe.session+`` [#upmul]_ | |
+ + +----------------------------------+-----------+
| | | ``uprobe.session.s+`` [#upmul]_ | Yes |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
| ``BPF_PROG_TYPE_LIRC_MODE2`` | ``BPF_LIRC_MODE2`` | ``lirc_mode2`` | |
+-------------------------------------------+----------------------------------------+----------------------------------+-----------+
@@ -219,6 +235,8 @@ described in more detail in the footnotes.
non-negative integer.
.. [#ksyscall] The ``ksyscall`` attach format is ``ksyscall/<syscall>``.
.. [#uprobe] The ``uprobe`` attach format is ``uprobe[.s]/<path>:<function>[+<offset>]``.
.. [#upmul] The ``uprobe.multi`` attach format is ``uprobe.multi[.s]/<path>:<function-pattern>``
where ``function-pattern`` supports ``*`` and ``?`` wildcards.
.. [#usdt] The ``usdt`` attach format is ``usdt/<path>:<provider>:<name>``.
.. [#kpmulti] The ``kprobe.multi`` attach format is ``kprobe.multi/<pattern>`` where ``pattern``
supports ``*`` and ``?`` wildcards. Valid characters for pattern are
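
As a quick illustration of the attach formats documented above, a BPF object
could declare programs like the following (the binary path and the symbol
patterns are made-up placeholders, not taken from this series):

/* Illustrative sketch only. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* BPF_TRACE_UPROBE_MULTI, sleepable: uprobe.multi[.s]/<path>:<function-pattern> */
SEC("uprobe.multi.s//usr/bin/app:handle_*")
int trace_app_handlers(struct pt_regs *ctx)
{
	return 0;
}

/* BPF_TRACE_KPROBE_MULTI: kprobe.multi/<pattern> */
SEC("kprobe.multi/tcp_send*")
int trace_tcp_send(struct pt_regs *ctx)
{
	return 0;
}

char LICENSE[] SEC("license") = "GPL";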


@@ -15,8 +15,9 @@ of constant size. The size of the array is defined in ``max_entries`` at
creation time. All array elements are pre-allocated and zero initialized when
created. ``BPF_MAP_TYPE_PERCPU_ARRAY`` uses a different memory region for each
CPU whereas ``BPF_MAP_TYPE_ARRAY`` uses the same memory region. The value
stored can be of any size, however, all array elements are aligned to 8
bytes.
stored can be of any size for ``BPF_MAP_TYPE_ARRAY`` and not more than
``PCPU_MIN_UNIT_SIZE`` (32 kB) for ``BPF_MAP_TYPE_PERCPU_ARRAY``. All
array elements are aligned to 8 bytes.
Since kernel 5.5, memory mapping may be enabled for ``BPF_MAP_TYPE_ARRAY`` by
setting the flag ``BPF_F_MMAPABLE``. The map definition is page-aligned and
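
A minimal sketch of both array flavours as BPF-side map definitions, reflecting
the limits described above (the map names, sizes and value layout are arbitrary
examples):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 256);
	__uint(map_flags, BPF_F_MMAPABLE);	/* mmap()-able, kernel 5.5+ */
	__type(key, __u32);
	__type(value, __u64);			/* any value size is allowed */
} shared_counters SEC(".maps");

struct percpu_val {
	__u64 bytes;
	__u64 pkts;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 16);
	__type(key, __u32);
	__type(value, struct percpu_val);	/* must stay below PCPU_MIN_UNIT_SIZE (32 kB) */
} percpu_stats SEC(".maps");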


@@ -4654,6 +4654,7 @@ F: Documentation/userspace-api/ebpf/
F: arch/*/net/*
F: include/linux/bpf*
F: include/linux/btf*
F: include/linux/buildid.h
F: include/linux/filter.h
F: include/trace/events/xdp.h
F: include/uapi/linux/bpf*


@@ -1452,6 +1452,10 @@ emit_bswap_uxt:
emit(A64_ASR(is64, dst, dst, imm), ctx);
break;
/* JUMP reg */
case BPF_JMP | BPF_JA | BPF_X:
emit(A64_BR(dst), ctx);
break;
/* JUMP off */
case BPF_JMP | BPF_JA:
case BPF_JMP32 | BPF_JA:
@@ -2231,6 +2235,13 @@ skip_init_ctx:
for (i = 0; i <= prog->len; i++)
ctx.offset[i] *= AARCH64_INSN_SIZE;
bpf_prog_fill_jited_linfo(prog, ctx.offset + 1);
/*
* The bpf_prog_update_insn_ptrs function expects offsets to
* point to the first byte of the jitted instruction (unlike
* the bpf_prog_fill_jited_linfo above, which, for historical
* reasons, expects to point to the next instruction)
*/
bpf_prog_update_insn_ptrs(prog, ctx.offset, ctx.ro_image);
out_off:
if (!ro_header && priv_stack_ptr) {
free_percpu(priv_stack_ptr);
@@ -2923,8 +2934,9 @@ static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip,
* The dummy_tramp is used to prevent another CPU from jumping to unknown
* locations during the patching process, making the patching process easier.
*/
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
void *old_addr, void *new_addr)
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
int ret;
u32 old_insn;
@@ -2968,14 +2980,13 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
!poking_bpf_entry))
return -EINVAL;
if (poke_type == BPF_MOD_CALL)
branch_type = AARCH64_INSN_BRANCH_LINK;
else
branch_type = AARCH64_INSN_BRANCH_NOLINK;
branch_type = old_t == BPF_MOD_CALL ? AARCH64_INSN_BRANCH_LINK :
AARCH64_INSN_BRANCH_NOLINK;
if (gen_branch_or_nop(branch_type, ip, old_addr, plt, &old_insn) < 0)
return -EFAULT;
branch_type = new_t == BPF_MOD_CALL ? AARCH64_INSN_BRANCH_LINK :
AARCH64_INSN_BRANCH_NOLINK;
if (gen_branch_or_nop(branch_type, ip, new_addr, plt, &new_insn) < 0)
return -EFAULT;


@@ -1284,11 +1284,12 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len)
return ret ? ERR_PTR(-EINVAL) : dst;
}
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
void *old_addr, void *new_addr)
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
int ret;
bool is_call = (poke_type == BPF_MOD_CALL);
bool is_call;
u32 old_insns[LOONGARCH_LONG_JUMP_NINSNS] = {[0 ... 4] = INSN_NOP};
u32 new_insns[LOONGARCH_LONG_JUMP_NINSNS] = {[0 ... 4] = INSN_NOP};
@@ -1298,6 +1299,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
if (!is_bpf_text_address((unsigned long)ip))
return -ENOTSUPP;
is_call = old_t == BPF_MOD_CALL;
ret = emit_jump_or_nops(old_addr, ip, old_insns, is_call);
if (ret)
return ret;
@@ -1305,6 +1307,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
if (memcmp(ip, old_insns, LOONGARCH_LONG_JUMP_NBYTES))
return -EFAULT;
is_call = new_t == BPF_MOD_CALL;
ret = emit_jump_or_nops(new_addr, ip, new_insns, is_call);
if (ret)
return ret;


@@ -1107,8 +1107,9 @@ static void do_isync(void *info __maybe_unused)
* execute isync (or some CSI) so that they don't go back into the
* trampoline again.
*/
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
void *old_addr, void *new_addr)
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
unsigned long bpf_func, bpf_func_end, size, offset;
ppc_inst_t old_inst, new_inst;
@@ -1119,7 +1120,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
return -EOPNOTSUPP;
bpf_func = (unsigned long)ip;
branch_flags = poke_type == BPF_MOD_CALL ? BRANCH_SET_LINK : 0;
/* We currently only support poking bpf programs */
if (!__bpf_address_lookup(bpf_func, &size, &offset, name)) {
@@ -1132,7 +1132,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
* an unconditional branch instruction at im->ip_after_call
*/
if (offset) {
if (poke_type != BPF_MOD_JUMP) {
if (old_t == BPF_MOD_CALL || new_t == BPF_MOD_CALL) {
pr_err("%s (0x%lx): calls are not supported in bpf prog body\n", __func__,
bpf_func);
return -EOPNOTSUPP;
@@ -1166,6 +1166,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
}
old_inst = ppc_inst(PPC_RAW_NOP());
branch_flags = old_t == BPF_MOD_CALL ? BRANCH_SET_LINK : 0;
if (old_addr) {
if (is_offset_in_branch_range(ip - old_addr))
create_branch(&old_inst, ip, (unsigned long)old_addr, branch_flags);
@@ -1174,6 +1175,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
branch_flags);
}
new_inst = ppc_inst(PPC_RAW_NOP());
branch_flags = new_t == BPF_MOD_CALL ? BRANCH_SET_LINK : 0;
if (new_addr) {
if (is_offset_in_branch_range(ip - new_addr))
create_branch(&new_inst, ip, (unsigned long)new_addr, branch_flags);


@@ -852,17 +852,19 @@ static int gen_jump_or_nops(void *target, void *ip, u32 *insns, bool is_call)
return emit_jump_and_link(is_call ? RV_REG_T0 : RV_REG_ZERO, rvoff, false, &ctx);
}
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
void *old_addr, void *new_addr)
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
u32 old_insns[RV_FENTRY_NINSNS], new_insns[RV_FENTRY_NINSNS];
bool is_call = poke_type == BPF_MOD_CALL;
bool is_call;
int ret;
if (!is_kernel_text((unsigned long)ip) &&
!is_bpf_text_address((unsigned long)ip))
return -ENOTSUPP;
is_call = old_t == BPF_MOD_CALL;
ret = gen_jump_or_nops(old_addr, ip, old_insns, is_call);
if (ret)
return ret;
@@ -870,6 +872,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
if (memcmp(ip, old_insns, RV_FENTRY_NBYTES))
return -EFAULT;
is_call = new_t == BPF_MOD_CALL;
ret = gen_jump_or_nops(new_addr, ip, new_insns, is_call);
if (ret)
return ret;
@@ -1131,7 +1134,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
store_args(nr_arg_slots, args_off, ctx);
/* skip to actual body of traced function */
if (flags & BPF_TRAMP_F_SKIP_FRAME)
if (flags & BPF_TRAMP_F_ORIG_STACK)
orig_call += RV_FENTRY_NINSNS * 4;
if (flags & BPF_TRAMP_F_CALL_ORIG) {


@@ -2412,8 +2412,9 @@ bool bpf_jit_supports_far_kfunc_call(void)
return true;
}
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *old_addr, void *new_addr)
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
struct bpf_plt expected_plt, current_plt, new_plt, *plt;
struct {
@@ -2430,7 +2431,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
if (insn.opc != (0xc004 | (old_addr ? 0xf0 : 0)))
return -EINVAL;
if (t == BPF_MOD_JUMP &&
if ((new_t == BPF_MOD_JUMP || old_t == BPF_MOD_JUMP) &&
insn.disp == ((char *)new_addr - (char *)ip) >> 1) {
/*
* The branch already points to the destination,


@@ -230,6 +230,7 @@ config X86
select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64
select HAVE_FTRACE_REGS_HAVING_PT_REGS if X86_64
select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
select HAVE_DYNAMIC_FTRACE_WITH_JMP if X86_64
select HAVE_SAMPLE_FTRACE_DIRECT if X86_64
select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64
select HAVE_EBPF_JIT


@@ -74,7 +74,12 @@ static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
* No need to translate into a callthunk. The trampoline does
* the depth accounting itself.
*/
return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
if (ftrace_is_jmp(addr)) {
addr = ftrace_jmp_get(addr);
return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
} else {
return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
}
}
static int ftrace_verify_code(unsigned long ip, const char *old_code)


@@ -285,8 +285,18 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
RET
1:
testb $1, %al
jz 2f
andq $0xfffffffffffffffe, %rax
movq %rax, MCOUNT_REG_SIZE+8(%rsp)
restore_mcount_regs
/* Restore flags */
popfq
RET
/* Swap the flags with orig_rax */
1: movq MCOUNT_REG_SIZE(%rsp), %rdi
2: movq MCOUNT_REG_SIZE(%rsp), %rdi
movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
movq %rax, MCOUNT_REG_SIZE(%rsp)


@@ -597,7 +597,8 @@ static int emit_jump(u8 **pprog, void *func, void *ip)
return emit_patch(pprog, func, ip, 0xE9);
}
static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t,
void *old_addr, void *new_addr)
{
const u8 *nop_insn = x86_nops[5];
@@ -607,9 +608,9 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
int ret;
memcpy(old_insn, nop_insn, X86_PATCH_SIZE);
if (old_addr) {
if (old_t != BPF_MOD_NOP && old_addr) {
prog = old_insn;
ret = t == BPF_MOD_CALL ?
ret = old_t == BPF_MOD_CALL ?
emit_call(&prog, old_addr, ip) :
emit_jump(&prog, old_addr, ip);
if (ret)
@@ -617,9 +618,9 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
}
memcpy(new_insn, nop_insn, X86_PATCH_SIZE);
if (new_addr) {
if (new_t != BPF_MOD_NOP && new_addr) {
prog = new_insn;
ret = t == BPF_MOD_CALL ?
ret = new_t == BPF_MOD_CALL ?
emit_call(&prog, new_addr, ip) :
emit_jump(&prog, new_addr, ip);
if (ret)
@@ -640,8 +641,9 @@ out:
return ret;
}
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *old_addr, void *new_addr)
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
if (!is_kernel_text((long)ip) &&
!is_bpf_text_address((long)ip))
@@ -655,29 +657,43 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
if (is_endbr(ip))
ip += ENDBR_INSN_SIZE;
return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
return __bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr);
}
#define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8)
static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
static void __emit_indirect_jump(u8 **pprog, int reg, bool ereg)
{
u8 *prog = *pprog;
if (ereg)
EMIT1(0x41);
EMIT2(0xFF, 0xE0 + reg);
*pprog = prog;
}
static void emit_indirect_jump(u8 **pprog, int bpf_reg, u8 *ip)
{
u8 *prog = *pprog;
int reg = reg2hex[bpf_reg];
bool ereg = is_ereg(bpf_reg);
if (cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) {
OPTIMIZER_HIDE_VAR(reg);
emit_jump(&prog, its_static_thunk(reg), ip);
emit_jump(&prog, its_static_thunk(reg + 8*ereg), ip);
} else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
EMIT_LFENCE();
EMIT2(0xFF, 0xE0 + reg);
__emit_indirect_jump(&prog, reg, ereg);
} else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
OPTIMIZER_HIDE_VAR(reg);
if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg], ip);
emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg + 8*ereg], ip);
else
emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
emit_jump(&prog, &__x86_indirect_thunk_array[reg + 8*ereg], ip);
} else {
EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */
__emit_indirect_jump(&prog, reg, ereg);
if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || IS_ENABLED(CONFIG_MITIGATION_SLS))
EMIT1(0xCC); /* int3 */
}
@@ -797,7 +813,7 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
* rdi == ctx (1st arg)
* rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET
*/
emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start));
emit_indirect_jump(&prog, BPF_REG_4 /* R4 -> rcx */, ip + (prog - start));
/* out: */
ctx->tail_call_indirect_label = prog - start;
@@ -883,12 +899,13 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
target = array->ptrs[poke->tail_call.key];
if (target) {
ret = __bpf_arch_text_poke(poke->tailcall_target,
BPF_MOD_JUMP, NULL,
BPF_MOD_NOP, BPF_MOD_JUMP,
NULL,
(u8 *)target->bpf_func +
poke->adj_off);
BUG_ON(ret < 0);
ret = __bpf_arch_text_poke(poke->tailcall_bypass,
BPF_MOD_JUMP,
BPF_MOD_JUMP, BPF_MOD_NOP,
(u8 *)poke->tailcall_target +
X86_PATCH_SIZE, NULL);
BUG_ON(ret < 0);
@@ -2614,6 +2631,9 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
break;
case BPF_JMP | BPF_JA | BPF_X:
emit_indirect_jump(&prog, insn->dst_reg, image + addrs[i - 1]);
break;
case BPF_JMP | BPF_JA:
case BPF_JMP32 | BPF_JA:
if (BPF_CLASS(insn->code) == BPF_JMP) {
@@ -2830,9 +2850,10 @@ static int get_nr_used_regs(const struct btf_func_model *m)
}
static void save_args(const struct btf_func_model *m, u8 **prog,
int stack_size, bool for_call_origin)
int stack_size, bool for_call_origin, u32 flags)
{
int arg_regs, first_off = 0, nr_regs = 0, nr_stack_slots = 0;
bool use_jmp = bpf_trampoline_use_jmp(flags);
int i, j;
/* Store function arguments to stack.
@@ -2873,7 +2894,7 @@ static void save_args(const struct btf_func_model *m, u8 **prog,
*/
for (j = 0; j < arg_regs; j++) {
emit_ldx(prog, BPF_DW, BPF_REG_0, BPF_REG_FP,
nr_stack_slots * 8 + 0x18);
nr_stack_slots * 8 + 16 + (!use_jmp) * 8);
emit_stx(prog, BPF_DW, BPF_REG_FP, BPF_REG_0,
-stack_size);
@@ -3267,12 +3288,17 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
* should be 16-byte aligned. Following code depend on
* that stack_size is already 8-byte aligned.
*/
stack_size += (stack_size % 16) ? 0 : 8;
if (bpf_trampoline_use_jmp(flags)) {
/* no rip in the "jmp" case */
stack_size += (stack_size % 16) ? 8 : 0;
} else {
stack_size += (stack_size % 16) ? 0 : 8;
}
}
arg_stack_off = stack_size;
if (flags & BPF_TRAMP_F_SKIP_FRAME) {
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* skip patched call instruction and point orig_call to actual
* body of the kernel function.
*/
@@ -3327,7 +3353,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
}
save_args(m, &prog, regs_off, false);
save_args(m, &prog, regs_off, false, flags);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
@@ -3360,7 +3386,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
if (flags & BPF_TRAMP_F_CALL_ORIG) {
restore_regs(m, &prog, regs_off);
save_args(m, &prog, arg_stack_off, true);
save_args(m, &prog, arg_stack_off, true, flags);
if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
/* Before calling the original function, load the
@@ -3543,7 +3569,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image,
if (err)
return err;
emit_indirect_jump(&prog, 2 /* rdx */, image + (prog - buf));
emit_indirect_jump(&prog, BPF_REG_3 /* R3 -> rdx */, image + (prog - buf));
*pprog = prog;
return 0;
@@ -3827,6 +3853,15 @@ out_image:
jit_data->header = header;
jit_data->rw_header = rw_header;
}
/*
* The bpf_prog_update_insn_ptrs function expects addrs to
* point to the first byte of the jitted instruction (unlike
* the bpf_prog_fill_jited_linfo below, which, for historical
* reasons, expects to point to the next instruction)
*/
bpf_prog_update_insn_ptrs(prog, addrs, image);
/*
* ctx.prog_offset is used when CFI preambles put code *before*
* the function. See emit_cfi(). For FineIBT specifically this code
@@ -3953,6 +3988,7 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
struct bpf_prog *new, struct bpf_prog *old)
{
u8 *old_addr, *new_addr, *old_bypass_addr;
enum bpf_text_poke_type t;
int ret;
old_bypass_addr = old ? NULL : poke->bypass_addr;
@@ -3965,21 +4001,22 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
* the kallsyms check.
*/
if (new) {
t = old_addr ? BPF_MOD_JUMP : BPF_MOD_NOP;
ret = __bpf_arch_text_poke(poke->tailcall_target,
BPF_MOD_JUMP,
t, BPF_MOD_JUMP,
old_addr, new_addr);
BUG_ON(ret < 0);
if (!old) {
ret = __bpf_arch_text_poke(poke->tailcall_bypass,
BPF_MOD_JUMP,
BPF_MOD_JUMP, BPF_MOD_NOP,
poke->bypass_addr,
NULL);
BUG_ON(ret < 0);
}
} else {
t = old_bypass_addr ? BPF_MOD_JUMP : BPF_MOD_NOP;
ret = __bpf_arch_text_poke(poke->tailcall_bypass,
BPF_MOD_JUMP,
old_bypass_addr,
t, BPF_MOD_JUMP, old_bypass_addr,
poke->bypass_addr);
BUG_ON(ret < 0);
/* let other CPUs finish the execution of program
@@ -3988,9 +4025,9 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
*/
if (!ret)
synchronize_rcu();
t = old_addr ? BPF_MOD_JUMP : BPF_MOD_NOP;
ret = __bpf_arch_text_poke(poke->tailcall_target,
BPF_MOD_JUMP,
old_addr, NULL);
t, BPF_MOD_NOP, old_addr, NULL);
BUG_ON(ret < 0);
}
}


@@ -129,8 +129,8 @@ dec:
* <error> for lock B
* release_held_lock_entry
*
* try_cmpxchg_acquire for lock A
* grab_held_lock_entry
* try_cmpxchg_acquire for lock A
*
* Lack of any ordering means reordering may occur such that dec, inc
* are done before entry is overwritten. This permits a remote lock
@@ -139,13 +139,8 @@ dec:
* CPU holds a lock it is attempting to acquire, leading to false ABBA
* diagnosis).
*
* In case of unlock, we will always do a release on the lock word after
* releasing the entry, ensuring that other CPUs cannot hold the lock
* (and make conclusions about deadlocks) until the entry has been
* cleared on the local CPU, preventing any anomalies. Reordering is
* still possible there, but a remote CPU cannot observe a lock in our
* table which it is already holding, since visibility entails our
* release store for the said lock has not retired.
* The case of unlock is treated differently due to NMI reentrancy, see
* comments in res_spin_unlock.
*
* In theory we don't have a problem if the dec and WRITE_ONCE above get
* reordered with each other, we either notice an empty NULL entry on
@@ -175,10 +170,22 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
int val = 0;
if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
grab_held_lock_entry(lock);
/*
* Grab the deadlock detection entry before doing the cmpxchg, so that
* reentrancy due to NMIs between the succeeding cmpxchg and creation of
* held lock entry can correctly detect an acquisition attempt in the
* interrupted context.
*
* cmpxchg lock A
* <NMI>
* res_spin_lock(A) --> missed AA, leads to timeout
* </NMI>
* grab_held_lock_entry(A)
*/
grab_held_lock_entry(lock);
if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
return 0;
}
return resilient_queued_spin_lock_slowpath(lock, val);
}
@@ -192,28 +199,25 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
if (unlikely(rqh->cnt > RES_NR_HELD))
goto unlock;
WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
unlock:
/*
* Release barrier, ensures correct ordering. See release_held_lock_entry
* for details. Perform release store instead of queued_spin_unlock,
* since we use this function for test-and-set fallback as well. When we
* have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
* Release barrier, ensures correct ordering. Perform release store
* instead of queued_spin_unlock, since we use this function for the TAS
* fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear
* the full 4-byte lockword.
*
* Like release_held_lock_entry, we can do the release before the dec.
* We simply care about not seeing the 'lock' in our table from a remote
* CPU once the lock has been released, which doesn't rely on the dec.
* Perform the smp_store_release before clearing the lock entry so that
* NMIs landing in the unlock path can correctly detect AA issues. The
* opposite order shown below may lead to missed AA checks:
*
* Unlike smp_wmb(), release is not a two way fence, hence it is
* possible for a inc to move up and reorder with our clearing of the
* entry. This isn't a problem however, as for a misdiagnosis of ABBA,
* the remote CPU needs to hold this lock, which won't be released until
* the store below is done, which would ensure the entry is overwritten
* to NULL, etc.
* WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
* <NMI>
* res_spin_lock(A) --> missed AA, leads to timeout
* </NMI>
* smp_store_release(A->locked, 0)
*/
smp_store_release(&lock->locked, 0);
if (likely(rqh->cnt <= RES_NR_HELD))
WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
this_cpu_dec(rqspinlock_held_locks.cnt);
}


@@ -663,6 +663,16 @@ int map_check_no_btf(const struct bpf_map *map,
bool bpf_map_meta_equal(const struct bpf_map *meta0,
const struct bpf_map *meta1);
static inline bool bpf_map_has_internal_structs(struct bpf_map *map)
{
return btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK);
}
void bpf_map_free_internal_structs(struct bpf_map *map, void *obj);
int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags,
struct bpf_dynptr *ptr__uninit);
extern const struct bpf_map_ops bpf_map_offload_ops;
/* bpf_type_flag contains a set of flags that are applicable to the values of
@@ -785,12 +795,15 @@ enum bpf_type_flag {
/* DYNPTR points to skb_metadata_end()-skb_metadata_len() */
DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS),
/* DYNPTR points to file */
DYNPTR_TYPE_FILE = BIT(20 + BPF_BASE_TYPE_BITS),
__BPF_TYPE_FLAG_MAX,
__BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1,
};
#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
| DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META)
| DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META | DYNPTR_TYPE_FILE)
/* Max number of base types. */
#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS)
@@ -988,6 +1001,7 @@ enum bpf_reg_type {
PTR_TO_ARENA,
PTR_TO_BUF, /* reg points to a read/write buffer */
PTR_TO_FUNC, /* reg points to a bpf program function */
PTR_TO_INSN, /* reg points to a bpf program instruction */
CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */
__BPF_REG_TYPE_MAX,
@@ -1250,6 +1264,18 @@ typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start,
bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog);
bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog);
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
static inline bool bpf_trampoline_use_jmp(u64 flags)
{
return flags & BPF_TRAMP_F_CALL_ORIG && !(flags & BPF_TRAMP_F_SKIP_FRAME);
}
#else
static inline bool bpf_trampoline_use_jmp(u64 flags)
{
return false;
}
#endif
struct bpf_ksym {
unsigned long start;
unsigned long end;
@@ -1378,21 +1404,23 @@ enum bpf_dynptr_type {
BPF_DYNPTR_TYPE_XDP,
/* Points to skb_metadata_end()-skb_metadata_len() */
BPF_DYNPTR_TYPE_SKB_META,
/* Underlying data is a file */
BPF_DYNPTR_TYPE_FILE,
};
int bpf_dynptr_check_size(u32 size);
u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len);
void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len);
int bpf_dynptr_check_size(u64 size);
u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len);
void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len);
bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr);
int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset,
void *src, u32 len, u64 flags);
void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
void *buffer__opt, u32 buffer__szk);
int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset,
void *src, u64 len, u64 flags);
void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
void *buffer__opt, u64 buffer__szk);
static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len)
{
u32 size = __bpf_dynptr_size(ptr);
u64 size = __bpf_dynptr_size(ptr);
if (len > size || offset > size - len)
return -E2BIG;
@@ -1616,6 +1644,7 @@ struct bpf_prog_aux {
u32 ctx_arg_info_size;
u32 max_rdonly_access;
u32 max_rdwr_access;
u32 subprog_start;
struct btf *attach_btf;
struct bpf_ctx_arg_aux *ctx_arg_info;
void __percpu *priv_stack_ptr;
@@ -1905,12 +1934,14 @@ struct btf_member;
* reason, if this callback is not defined, the check is skipped as
* the struct_ops map will have final verification performed in
* @reg.
* @type: BTF type.
* @value_type: Value type.
* @cfi_stubs: Pointer to a structure of stub functions for CFI. These stubs
* provide the correct Control Flow Integrity hashes for the
* trampolines generated by BPF struct_ops.
* @owner: The module that owns this struct_ops. Used for module reference
* counting to ensure the module providing the struct_ops cannot be
* unloaded while in use.
* @name: The name of the struct bpf_struct_ops object.
* @func_models: Func models
* @type_id: BTF type id.
* @value_id: BTF value id.
*/
struct bpf_struct_ops {
const struct bpf_verifier_ops *verifier_ops;
@@ -2099,6 +2130,12 @@ struct bpf_array {
};
};
/*
* The bpf_array_get_next_key() function may be used for all array-like
* maps, i.e., maps with u32 keys with range [0 ,..., max_entries)
*/
int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key);
#define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */
#define MAX_TAIL_CALL_CNT 33
@@ -2374,6 +2411,9 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array *array,
bool bpf_jit_bypass_spec_v1(void);
bool bpf_jit_bypass_spec_v4(void);
#define bpf_rcu_lock_held() \
(rcu_read_lock_held() || rcu_read_lock_trace_held() || rcu_read_lock_bh_held())
#ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active);
extern struct mutex bpf_stats_enabled_mutex;
@@ -3670,12 +3710,14 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
#endif /* CONFIG_INET */
enum bpf_text_poke_type {
BPF_MOD_NOP,
BPF_MOD_CALL,
BPF_MOD_JUMP,
};
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *addr1, void *addr2);
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr);
void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
struct bpf_prog *new, struct bpf_prog *old);
@@ -3772,4 +3814,30 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char *
const char **linep, int *nump);
struct bpf_prog *bpf_prog_find_from_stack(void);
int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog);
int bpf_insn_array_ready(struct bpf_map *map);
void bpf_insn_array_release(struct bpf_map *map);
void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len);
void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len);
#ifdef CONFIG_BPF_SYSCALL
void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image);
#else
static inline void
bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
{
}
#endif
static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags)
{
if (flags & ~allowed_flags)
return -EINVAL;
if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))
return -EINVAL;
return 0;
}
#endif /* _LINUX_BPF_H */


@@ -18,9 +18,6 @@
#define BPF_LOCAL_STORAGE_CACHE_SIZE 16
#define bpf_rcu_lock_held() \
(rcu_read_lock_held() || rcu_read_lock_trace_held() || \
rcu_read_lock_bh_held())
struct bpf_local_storage_map_bucket {
struct hlist_head list;
raw_spinlock_t lock;
@@ -56,9 +53,7 @@ struct bpf_local_storage_map {
u32 bucket_log;
u16 elem_size;
u16 cache_idx;
struct bpf_mem_alloc selem_ma;
struct bpf_mem_alloc storage_ma;
bool bpf_ma;
bool use_kmalloc_nolock;
};
struct bpf_local_storage_data {
@@ -100,6 +95,7 @@ struct bpf_local_storage {
*/
struct rcu_head rcu;
raw_spinlock_t lock; /* Protect adding/removing from the "list" */
bool use_kmalloc_nolock;
};
/* U16_MAX is much more than enough for sk local storage
@@ -133,7 +129,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
struct bpf_local_storage_cache *cache,
bool bpf_ma);
bool use_kmalloc_nolock);
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *smap,
@@ -187,10 +183,9 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
bool charge_mem, bool swap_uptrs, gfp_t gfp_flags);
bool swap_uptrs, gfp_t gfp_flags);
void bpf_selem_free(struct bpf_local_storage_elem *selem,
struct bpf_local_storage_map *smap,
bool reuse_now);
int


@@ -133,6 +133,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops)
BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)


@@ -416,7 +416,7 @@ struct bpf_verifier_state {
u32 active_irq_id;
u32 active_lock_id;
void *active_lock_ptr;
bool active_rcu_lock;
u32 active_rcu_locks;
bool speculative;
bool in_sleepable;
@@ -509,6 +509,15 @@ struct bpf_map_ptr_state {
#define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \
BPF_ALU_SANITIZE_DST)
/*
* An array of BPF instructions.
* Primary usage: return value of bpf_insn_successors.
*/
struct bpf_iarray {
int cnt;
u32 items[];
};
struct bpf_insn_aux_data {
union {
enum bpf_reg_type ptr_type; /* pointer type for load/store insns */
@@ -540,6 +549,7 @@ struct bpf_insn_aux_data {
/* remember the offset of node field within type to rewrite */
u64 insert_off;
};
struct bpf_iarray *jt; /* jump table for gotox or bpf_tailcall call instruction */
struct btf_struct_meta *kptr_struct_meta;
u64 map_key_state; /* constant (32 bit) key tracking for maps */
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
@@ -548,7 +558,7 @@ struct bpf_insn_aux_data {
bool nospec_result; /* result is unsafe under speculation, nospec must follow */
bool zext_dst; /* this insn zero extends dst reg */
bool needs_zext; /* alu op needs to clear upper bits */
bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
bool non_sleepable; /* helper/kfunc may be called from non-sleepable context */
bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
u8 alu_state; /* used in combination with alu_limit */
@@ -642,6 +652,7 @@ struct bpf_subprog_info {
u32 start; /* insn idx of function entry point */
u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
u32 postorder_start; /* The idx to the env->cfg.insn_postorder */
u32 exit_idx; /* Index of one of the BPF_EXIT instructions in this subprogram */
u16 stack_depth; /* max. stack depth used by this function */
u16 stack_extra;
/* offsets in range [stack_depth .. fastcall_stack_off)
@@ -659,9 +670,9 @@ struct bpf_subprog_info {
bool keep_fastcall_stack: 1;
bool changes_pkt_data: 1;
bool might_sleep: 1;
u8 arg_cnt:3;
enum priv_stack_mode priv_stack_mode;
u8 arg_cnt;
struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
};
@@ -745,8 +756,10 @@ struct bpf_verifier_env {
struct list_head free_list; /* list of struct bpf_verifier_state_list */
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */
struct bpf_map *insn_array_maps[MAX_USED_MAPS]; /* array of INSN_ARRAY map's to be relocated */
u32 used_map_cnt; /* number of used maps */
u32 used_btf_cnt; /* number of used BTF objects */
u32 insn_array_map_cnt; /* number of used maps of type BPF_MAP_TYPE_INSN_ARRAY */
u32 id_gen; /* used to generate unique reg IDs */
u32 hidden_subprog_cnt; /* number of hidden subprogs */
int exception_callback_subprog;
@@ -828,6 +841,8 @@ struct bpf_verifier_env {
/* array of pointers to bpf_scc_info indexed by SCC id */
struct bpf_scc_info **scc_info;
u32 scc_cnt;
struct bpf_iarray *succ;
struct bpf_iarray *gotox_tmp_buf;
};
static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
@@ -1038,6 +1053,13 @@ static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_
return !(off % BPF_REG_SIZE);
}
static inline bool insn_is_gotox(struct bpf_insn *insn)
{
return BPF_CLASS(insn->code) == BPF_JMP &&
BPF_OP(insn->code) == BPF_JA &&
BPF_SRC(insn->code) == BPF_X;
}
const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type);
const char *dynptr_type_str(enum bpf_dynptr_type type);
const char *iter_type_str(const struct btf *btf, u32 btf_id);
@@ -1050,7 +1072,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st
struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off);
int bpf_jmp_offset(struct bpf_insn *insn);
int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]);
struct bpf_iarray *bpf_insn_successors(struct bpf_verifier_env *env, u32 idx);
void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask);
bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx);


@@ -18,4 +18,29 @@ void init_vmlinux_build_id(void);
static inline void init_vmlinux_build_id(void) { }
#endif
struct freader {
void *buf;
u32 buf_sz;
int err;
union {
struct {
struct file *file;
struct folio *folio;
void *addr;
loff_t folio_off;
bool may_fault;
};
struct {
const char *data;
u64 data_sz;
};
};
};
void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
struct file *file, bool may_fault);
void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz);
const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz);
void freader_cleanup(struct freader *r);
#endif
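
The exported freader interface above mirrors the existing build-ID reader. A
rough kernel-side sketch of driving it, where the wrapper function, the 64-byte
scratch buffer and the choice of reading the ELF e_ident are illustrative
assumptions rather than code from this series:

static int peek_elf_ident(struct file *file, unsigned char *ident)
{
	char buf[64];
	struct freader r;
	const void *p;
	int err = 0;

	/* may_fault == false: only folios already in the page cache are used */
	freader_init_from_file(&r, buf, sizeof(buf), file, false);

	p = freader_fetch(&r, 0, 16 /* EI_NIDENT */);
	if (p)
		memcpy(ident, p, 16);
	else
		err = r.err;

	freader_cleanup(&r);
	return err;
}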


@@ -712,11 +712,13 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
duration = sched_clock() - start;
stats = this_cpu_ptr(prog->stats);
flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->cnt);
u64_stats_add(&stats->nsecs, duration);
u64_stats_update_end_irqrestore(&stats->syncp, flags);
if (likely(prog->stats)) {
stats = this_cpu_ptr(prog->stats);
flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->cnt);
u64_stats_add(&stats->nsecs, duration);
u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
} else {
ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
}


@@ -359,6 +359,7 @@ enum {
FTRACE_OPS_FL_DIRECT = BIT(17),
FTRACE_OPS_FL_SUBOP = BIT(18),
FTRACE_OPS_FL_GRAPH = BIT(19),
FTRACE_OPS_FL_JMP = BIT(20),
};
#ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
@@ -577,6 +578,38 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs,
unsigned long addr) { }
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
static inline bool ftrace_is_jmp(unsigned long addr)
{
return addr & 1;
}
static inline unsigned long ftrace_jmp_set(unsigned long addr)
{
return addr | 1UL;
}
static inline unsigned long ftrace_jmp_get(unsigned long addr)
{
return addr & ~1UL;
}
#else
static inline bool ftrace_is_jmp(unsigned long addr)
{
return false;
}
static inline unsigned long ftrace_jmp_set(unsigned long addr)
{
return addr;
}
static inline unsigned long ftrace_jmp_get(unsigned long addr)
{
return addr;
}
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_JMP */
#ifdef CONFIG_STACK_TRACER
int stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer,


@@ -1026,6 +1026,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_USER_RINGBUF,
BPF_MAP_TYPE_CGRP_STORAGE,
BPF_MAP_TYPE_ARENA,
BPF_MAP_TYPE_INSN_ARRAY,
__MAX_BPF_MAP_TYPE
};
@@ -1430,6 +1431,9 @@ enum {
/* Do not translate kernel bpf_arena pointers to user pointers */
BPF_F_NO_USER_CONV = (1U << 18),
/* Enable BPF ringbuf overwrite mode */
BPF_F_RB_OVERWRITE = (1U << 19),
};
/* Flags for BPF_PROG_QUERY. */
@@ -5618,7 +5622,7 @@ union bpf_attr {
* Return
* *sk* if casting is valid, or **NULL** otherwise.
*
* long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr)
* long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr)
* Description
* Get a dynptr to local memory *data*.
*
@@ -5661,7 +5665,7 @@ union bpf_attr {
* Return
* Nothing. Always succeeds.
*
* long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
* long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags)
* Description
* Read *len* bytes from *src* into *dst*, starting from *offset*
* into *src*.
@@ -5671,7 +5675,7 @@ union bpf_attr {
* of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
* *flags* is not 0.
*
* long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
* long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags)
* Description
* Write *len* bytes from *src* into *dst*, starting from *offset*
* into *dst*.
@@ -5692,7 +5696,7 @@ union bpf_attr {
* is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
* other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
*
* void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
* void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len)
* Description
* Get a pointer to the underlying dynptr data.
*
@@ -6231,6 +6235,7 @@ enum {
BPF_RB_RING_SIZE = 1,
BPF_RB_CONS_POS = 2,
BPF_RB_PROD_POS = 3,
BPF_RB_OVERWRITE_POS = 4,
};
/* BPF ring buffer constants */
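
A sketch of how the two new ring buffer constants could be consumed from a BPF
program (usual vmlinux.h and bpf_helpers.h includes omitted); treating
BPF_F_RB_OVERWRITE as a map-creation flag and BPF_RB_OVERWRITE_POS as a
bpf_ringbuf_query() selector is inferred from where they are defined above,
and the map and section names are arbitrary:

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 4096);		/* page-sized power of two */
	__uint(map_flags, BPF_F_RB_OVERWRITE);	/* overwrite oldest data when full */
} events SEC(".maps");

SEC("tracepoint/sched/sched_switch")
int probe(void *ctx)
{
	__u64 pos;

	/* where overwriting has progressed, alongside BPF_RB_CONS_POS/BPF_RB_PROD_POS */
	pos = bpf_ringbuf_query(&events, BPF_RB_OVERWRITE_POS);
	bpf_printk("overwrite pos %llu", pos);
	return 0;
}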
@@ -7645,4 +7650,24 @@ enum bpf_kfunc_flags {
BPF_F_PAD_ZEROS = (1ULL << 0),
};
/*
* Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type.
*
* Before the map is used the orig_off field should point to an
* instruction inside the program being loaded. The other fields
* must be set to 0.
*
* After the program is loaded, the xlated_off will be adjusted
* by the verifier to point to the index of the original instruction
* in the xlated program. If the instruction is deleted, it will
* be set to (u32)-1. The jitted_off will be set to the corresponding
* offset in the jitted image of the program.
*/
struct bpf_insn_array_value {
__u32 orig_off;
__u32 xlated_off;
__u32 jitted_off;
__u32 :32;
};
#endif /* _UAPI__LINUX_BPF_H__ */
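
A minimal user-space sketch of how such a map could be populated before program
load, based only on the layout and checks in this series; the instruction
offsets and map name are arbitrary, and wiring the frozen map into the loaded
program's indirect jumps is not shown:

#include <unistd.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

static int make_jump_table(void)
{
	__u32 targets[] = { 5, 9 };	/* arbitrary example instruction offsets */
	__u32 n = sizeof(targets) / sizeof(targets[0]);
	__u32 i;
	int map_fd;

	map_fd = bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, "jump_table",
				sizeof(__u32),
				sizeof(struct bpf_insn_array_value), n, NULL);
	if (map_fd < 0)
		return map_fd;

	for (i = 0; i < n; i++) {
		struct bpf_insn_array_value v = { .orig_off = targets[i] };

		/* xlated_off and jitted_off must be left at zero before load */
		if (bpf_map_update_elem(map_fd, &i, &v, 0) < 0)
			goto err;
	}

	/* the verifier only accepts frozen insn array maps */
	if (bpf_map_freeze(map_fd) < 0)
		goto err;
	return map_fd;
err:
	close(map_fd);
	return -1;
}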


@@ -9,7 +9,7 @@ CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o


@@ -335,18 +335,17 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
}
/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;
if (index >= array->map.max_entries) {
if (index >= map->max_entries) {
*next = 0;
return 0;
}
if (index == array->map.max_entries - 1)
if (index == map->max_entries - 1)
return -ENOENT;
*next = index + 1;
@@ -448,19 +447,12 @@ static void array_map_free_internal_structs(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
/* We don't reset or free fields other than timer and workqueue
* on uref dropping to zero.
*/
if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
for (i = 0; i < array->map.max_entries; i++) {
if (btf_record_has_field(map->record, BPF_TIMER))
bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
if (btf_record_has_field(map->record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
if (btf_record_has_field(map->record, BPF_TASK_WORK))
bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i));
}
}
/* We only free internal structs on uref dropping to zero */
if (!bpf_map_has_internal_structs(map))
return;
for (i = 0; i < array->map.max_entries; i++)
bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i));
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -796,7 +788,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_get_next_key = bpf_array_get_next_key,
.map_release_uref = array_map_free_internal_structs,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
@@ -822,7 +814,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = percpu_array_map_lookup_elem,
.map_gen_lookup = percpu_array_map_gen_lookup,
.map_update_elem = array_map_update_elem,
@@ -1211,7 +1203,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_poke_track = prog_array_map_poke_track,
.map_poke_untrack = prog_array_map_poke_untrack,
.map_poke_run = prog_array_map_poke_run,
.map_get_next_key = array_map_get_next_key,
.map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
@@ -1315,7 +1307,7 @@ const struct bpf_map_ops perf_event_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = perf_event_fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
@@ -1351,7 +1343,7 @@ const struct bpf_map_ops cgroup_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = cgroup_fd_array_free,
.map_get_next_key = array_map_get_next_key,
.map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
@@ -1436,7 +1428,7 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
.map_get_next_key = array_map_get_next_key,
.map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = array_of_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,

kernel/bpf/bpf_insn_array.c (new file, 304 lines)

@@ -0,0 +1,304 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025 Isovalent */
#include <linux/bpf.h>
struct bpf_insn_array {
struct bpf_map map;
atomic_t used;
long *ips;
DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values);
};
#define cast_insn_array(MAP_PTR) \
container_of((MAP_PTR), struct bpf_insn_array, map)
#define INSN_DELETED ((u32)-1)
static inline u64 insn_array_alloc_size(u32 max_entries)
{
const u64 base_size = sizeof(struct bpf_insn_array);
const u64 entry_size = sizeof(struct bpf_insn_array_value);
return base_size + max_entries * (entry_size + sizeof(long));
}
static int insn_array_alloc_check(union bpf_attr *attr)
{
u32 value_size = sizeof(struct bpf_insn_array_value);
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size != value_size || attr->map_flags != 0)
return -EINVAL;
return 0;
}
static void insn_array_free(struct bpf_map *map)
{
struct bpf_insn_array *insn_array = cast_insn_array(map);
bpf_map_area_free(insn_array);
}
static struct bpf_map *insn_array_alloc(union bpf_attr *attr)
{
u64 size = insn_array_alloc_size(attr->max_entries);
struct bpf_insn_array *insn_array;
insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE);
if (!insn_array)
return ERR_PTR(-ENOMEM);
/* ips are allocated right after the insn_array->values[] array */
insn_array->ips = (void *)&insn_array->values[attr->max_entries];
bpf_map_init_from_attr(&insn_array->map, attr);
/* BPF programs aren't allowed to write to the map */
insn_array->map.map_flags |= BPF_F_RDONLY_PROG;
return &insn_array->map;
}
static void *insn_array_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_insn_array *insn_array = cast_insn_array(map);
u32 index = *(u32 *)key;
if (unlikely(index >= insn_array->map.max_entries))
return NULL;
return &insn_array->values[index];
}
static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags)
{
struct bpf_insn_array *insn_array = cast_insn_array(map);
u32 index = *(u32 *)key;
struct bpf_insn_array_value val = {};
if (unlikely(index >= insn_array->map.max_entries))
return -E2BIG;
if (unlikely(map_flags & BPF_NOEXIST))
return -EEXIST;
copy_map_value(map, &val, value);
if (val.jitted_off || val.xlated_off)
return -EINVAL;
insn_array->values[index].orig_off = val.orig_off;
return 0;
}
static long insn_array_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
static int insn_array_check_btf(const struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
if (!btf_type_is_i32(key_type))
return -EINVAL;
if (!btf_type_is_i64(value_type))
return -EINVAL;
return 0;
}
static u64 insn_array_mem_usage(const struct bpf_map *map)
{
return insn_array_alloc_size(map->max_entries);
}
static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
{
struct bpf_insn_array *insn_array = cast_insn_array(map);
if ((off % sizeof(long)) != 0 ||
(off / sizeof(long)) >= map->max_entries)
return -EINVAL;
/* from BPF's point of view, this map is a jump table */
*imm = (unsigned long)insn_array->ips + off;
return 0;
}
BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array)
const struct bpf_map_ops insn_array_map_ops = {
.map_alloc_check = insn_array_alloc_check,
.map_alloc = insn_array_alloc,
.map_free = insn_array_free,
.map_get_next_key = bpf_array_get_next_key,
.map_lookup_elem = insn_array_lookup_elem,
.map_update_elem = insn_array_update_elem,
.map_delete_elem = insn_array_delete_elem,
.map_check_btf = insn_array_check_btf,
.map_mem_usage = insn_array_mem_usage,
.map_direct_value_addr = insn_array_map_direct_value_addr,
.map_btf_id = &insn_array_btf_ids[0],
};
static inline bool is_frozen(struct bpf_map *map)
{
guard(mutex)(&map->freeze_mutex);
return map->frozen;
}
static bool is_insn_array(const struct bpf_map *map)
{
return map->map_type == BPF_MAP_TYPE_INSN_ARRAY;
}
static inline bool valid_offsets(const struct bpf_insn_array *insn_array,
const struct bpf_prog *prog)
{
u32 off;
int i;
for (i = 0; i < insn_array->map.max_entries; i++) {
off = insn_array->values[i].orig_off;
if (off >= prog->len)
return false;
if (off > 0) {
if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM))
return false;
}
}
return true;
}
int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog)
{
struct bpf_insn_array *insn_array = cast_insn_array(map);
struct bpf_insn_array_value *values = insn_array->values;
int i;
if (!is_frozen(map))
return -EINVAL;
if (!valid_offsets(insn_array, prog))
return -EINVAL;
/*
* There can be only one program using the map
*/
if (atomic_xchg(&insn_array->used, 1))
return -EBUSY;
/*
* Reset all the map indexes to the original values. This is needed,
* e.g., when a replay of verification with different log level should
* be performed.
*/
for (i = 0; i < map->max_entries; i++)
values[i].xlated_off = values[i].orig_off;
return 0;
}
int bpf_insn_array_ready(struct bpf_map *map)
{
struct bpf_insn_array *insn_array = cast_insn_array(map);
int i;
for (i = 0; i < map->max_entries; i++) {
if (insn_array->values[i].xlated_off == INSN_DELETED)
continue;
if (!insn_array->ips[i])
return -EFAULT;
}
return 0;
}
void bpf_insn_array_release(struct bpf_map *map)
{
struct bpf_insn_array *insn_array = cast_insn_array(map);
atomic_set(&insn_array->used, 0);
}
void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len)
{
struct bpf_insn_array *insn_array = cast_insn_array(map);
int i;
if (len <= 1)
return;
for (i = 0; i < map->max_entries; i++) {
if (insn_array->values[i].xlated_off <= off)
continue;
if (insn_array->values[i].xlated_off == INSN_DELETED)
continue;
insn_array->values[i].xlated_off += len - 1;
}
}
void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len)
{
struct bpf_insn_array *insn_array = cast_insn_array(map);
int i;
for (i = 0; i < map->max_entries; i++) {
if (insn_array->values[i].xlated_off < off)
continue;
if (insn_array->values[i].xlated_off == INSN_DELETED)
continue;
if (insn_array->values[i].xlated_off < off + len)
insn_array->values[i].xlated_off = INSN_DELETED;
else
insn_array->values[i].xlated_off -= len;
}
}
/*
* This function is called by JITs. The image is the real program
* image, the offsets array set up the xlated -> jitted mapping.
* The offsets[xlated] offset should point to the beginning of
* the jitted instruction.
*/
void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
{
struct bpf_insn_array *insn_array;
struct bpf_map *map;
u32 xlated_off;
int i, j;
if (!offsets || !image)
return;
for (i = 0; i < prog->aux->used_map_cnt; i++) {
map = prog->aux->used_maps[i];
if (!is_insn_array(map))
continue;
insn_array = cast_insn_array(map);
for (j = 0; j < map->max_entries; j++) {
xlated_off = insn_array->values[j].xlated_off;
if (xlated_off == INSN_DELETED)
continue;
if (xlated_off < prog->aux->subprog_start)
continue;
xlated_off -= prog->aux->subprog_start;
if (xlated_off >= prog->len)
continue;
insn_array->values[j].jitted_off = offsets[xlated_off];
insn_array->ips[j] = (long)(image + offsets[xlated_off]);
}
}
}


@@ -73,30 +73,24 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags)
void *value, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
if (charge_mem && mem_charge(smap, owner, smap->elem_size))
if (mem_charge(smap, owner, smap->elem_size))
return NULL;
if (smap->bpf_ma) {
selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
if (selem)
/* Keep the original bpf_map_kzalloc behavior
* before started using the bpf_mem_cache_alloc.
*
* No need to use zero_map_value. The bpf_selem_free()
* only does bpf_mem_cache_free when there is
* no other bpf prog is using the selem.
*/
memset(SDATA(selem)->data, 0, smap->map.value_size);
if (smap->use_kmalloc_nolock) {
selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
__GFP_ZERO, NUMA_NO_NODE);
} else {
selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
gfp_flags | __GFP_NOWARN);
}
if (selem) {
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
if (value) {
/* No need to call check_and_init_map_value as memory is zero init */
copy_map_value(&smap->map, SDATA(selem)->data, value);
@@ -106,13 +100,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return selem;
}
if (charge_mem)
mem_uncharge(smap, owner, smap->elem_size);
mem_uncharge(smap, owner, smap->elem_size);
return NULL;
}
/* rcu tasks trace callback for bpf_ma == false */
/* rcu tasks trace callback for use_kmalloc_nolock == false */
static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
@@ -127,23 +120,7 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
kfree_rcu(local_storage, rcu);
}
static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
bpf_mem_cache_raw_free(local_storage);
}
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
if (rcu_trace_implies_rcu_gp())
bpf_local_storage_free_rcu(rcu);
else
call_rcu(rcu, bpf_local_storage_free_rcu);
}
/* Handle bpf_ma == false */
/* Handle use_kmalloc_nolock == false */
static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
bool vanilla_rcu)
{
@@ -154,35 +131,43 @@ static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
__bpf_local_storage_free_trace_rcu);
}
static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
kfree_nolock(local_storage);
}
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
if (rcu_trace_implies_rcu_gp())
bpf_local_storage_free_rcu(rcu);
else
call_rcu(rcu, bpf_local_storage_free_rcu);
}
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *smap,
bool bpf_ma, bool reuse_now)
bool reuse_now)
{
if (!local_storage)
return;
if (!bpf_ma) {
if (!local_storage->use_kmalloc_nolock) {
__bpf_local_storage_free(local_storage, reuse_now);
return;
}
if (!reuse_now) {
call_rcu_tasks_trace(&local_storage->rcu,
bpf_local_storage_free_trace_rcu);
if (reuse_now) {
call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
return;
}
if (smap)
bpf_mem_cache_free(&smap->storage_ma, local_storage);
else
/* smap could be NULL if the selem that triggered
* this 'local_storage' creation had been long gone.
* In this case, directly do call_rcu().
*/
call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
call_rcu_tasks_trace(&local_storage->rcu,
bpf_local_storage_free_trace_rcu);
}
/* rcu tasks trace callback for bpf_ma == false */
/* rcu tasks trace callback for use_kmalloc_nolock == false */
static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
@@ -194,7 +179,7 @@ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
kfree_rcu(selem, rcu);
}
/* Handle bpf_ma == false */
/* Handle use_kmalloc_nolock == false */
static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
bool vanilla_rcu)
{
@@ -216,7 +201,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
migrate_disable();
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
migrate_enable();
bpf_mem_cache_raw_free(selem);
kfree_nolock(selem);
}
static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
@@ -228,14 +213,17 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
}
void bpf_selem_free(struct bpf_local_storage_elem *selem,
struct bpf_local_storage_map *smap,
bool reuse_now)
{
if (!smap->bpf_ma) {
/* Only task storage has uptrs and task storage
* has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true
* for task storage, so this bpf_obj_free_fields() won't unpin
* any uptr.
struct bpf_local_storage_map *smap;
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
if (!smap->use_kmalloc_nolock) {
/*
* No uptr will be unpinned even when reuse_now == false, since uptr
* is only supported in task local storage, where
* smap->use_kmalloc_nolock == true.
*/
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
@@ -243,18 +231,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
}
if (reuse_now) {
/* reuse_now == true only happens when the storage owner
* (e.g. task_struct) is being destructed or the map itself
* is being destructed (ie map_free). In both cases,
* no bpf prog can have a hold on the selem. It is
* safe to unpin the uptrs and free the selem now.
/*
* While it is okay to call bpf_obj_free_fields() that unpins uptr when
* reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
*/
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
/* Instead of using the vanilla call_rcu(),
* bpf_mem_cache_free will be able to reuse selem
* immediately.
*/
bpf_mem_cache_free(&smap->selem_ma, selem);
call_rcu(&selem->rcu, bpf_selem_free_rcu);
return;
}
@@ -264,7 +245,6 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage_map *smap;
struct hlist_node *n;
/* The "_safe" iteration is needed.
@@ -272,10 +252,8 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
* but bpf_selem_free will use the selem->rcu_head
* which is union-ized with the selem->free_node.
*/
hlist_for_each_entry_safe(selem, n, list, free_node) {
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
bpf_selem_free(selem, smap, reuse_now);
}
hlist_for_each_entry_safe(selem, n, list, free_node)
bpf_selem_free(selem, reuse_now);
}
/* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -284,7 +262,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
*/
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
bool uncharge_mem, struct hlist_head *free_selem_list)
struct hlist_head *free_selem_list)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -297,8 +275,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
* The owner may be freed once the last selem is unlinked
* from local_storage.
*/
if (uncharge_mem)
mem_uncharge(smap, owner, smap->elem_size);
mem_uncharge(smap, owner, smap->elem_size);
free_local_storage = hlist_is_singular_node(&selem->snode,
&local_storage->list);
@@ -336,47 +313,11 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
return free_local_storage;
}
static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *storage_smap,
struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map *selem_smap;
/* local_storage->smap may be NULL. If it is, get the bpf_ma
* from any selem in the local_storage->list. The bpf_ma of all
* local_storage and selem should have the same value
* for the same map type.
*
* If the local_storage->list is already empty, the caller will not
* care about the bpf_ma value also because the caller is not
* responsible to free the local_storage.
*/
if (storage_smap)
return storage_smap->bpf_ma;
if (!selem) {
struct hlist_node *n;
n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
bpf_rcu_lock_held());
if (!n)
return false;
selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
}
selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
return selem_smap->bpf_ma;
}
static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
bool reuse_now)
{
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage;
bool bpf_ma, free_local_storage = false;
bool free_local_storage = false;
HLIST_HEAD(selem_free_list);
unsigned long flags;
@@ -386,20 +327,17 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
storage_smap = rcu_dereference_check(local_storage->smap,
bpf_rcu_lock_held());
bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, true, &selem_free_list);
local_storage, selem, &selem_free_list);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_selem_free_list(&selem_free_list, reuse_now);
if (free_local_storage)
bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
bpf_local_storage_free(local_storage, reuse_now);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -434,7 +372,6 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
unsigned long flags;
raw_spin_lock_irqsave(&b->lock, flags);
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
hlist_add_head_rcu(&selem->map_node, &b->list);
raw_spin_unlock_irqrestore(&b->lock, flags);
}
@@ -493,8 +430,9 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
if (smap->bpf_ma)
storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
if (smap->use_kmalloc_nolock)
storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
__GFP_ZERO, NUMA_NO_NODE);
else
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
gfp_flags | __GFP_NOWARN);
@@ -507,6 +445,7 @@ int bpf_local_storage_alloc(void *owner,
INIT_HLIST_HEAD(&storage->list);
raw_spin_lock_init(&storage->lock);
storage->owner = owner;
storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
bpf_selem_link_storage_nolock(storage, first_selem);
bpf_selem_link_map(smap, first_selem);
@@ -528,22 +467,12 @@ int bpf_local_storage_alloc(void *owner,
bpf_selem_unlink_map(first_selem);
err = -EAGAIN;
goto uncharge;
/* Note that even first_selem was linked to smap's
* bucket->list, first_selem can be freed immediately
* (instead of kfree_rcu) because
* bpf_local_storage_map_free() does a
* synchronize_rcu_mult (waiting for both sleepable and
* normal programs) before walking the bucket->list.
* Hence, no one is accessing selem from the
* bucket->list under rcu_read_lock().
*/
}
return 0;
uncharge:
bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
bpf_local_storage_free(storage, true);
mem_uncharge(smap, owner, sizeof(*storage));
return err;
}
@@ -582,13 +511,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
if (err) {
bpf_selem_free(selem, smap, true);
bpf_selem_free(selem, true);
mem_uncharge(smap, owner, smap->elem_size);
return ERR_PTR(err);
}
@@ -616,7 +545,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
@@ -656,7 +585,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
true, &old_selem_free_list);
&old_selem_free_list);
}
unlock:
@@ -664,7 +593,7 @@ unlock:
bpf_selem_free_list(&old_selem_free_list, false);
if (alloc_selem) {
mem_uncharge(smap, owner, smap->elem_size);
bpf_selem_free(alloc_selem, smap, true);
bpf_selem_free(alloc_selem, true);
}
return err ? ERR_PTR(err) : SDATA(selem);
}
@@ -730,16 +659,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem;
bool bpf_ma, free_storage = false;
bool free_storage = false;
HLIST_HEAD(free_selem_list);
struct hlist_node *n;
unsigned long flags;
storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
/* Neither the bpf_prog nor the bpf_map's syscall
* could be modifying the local_storage->list now.
* Thus, no elem can be added to or deleted from the
@@ -762,14 +687,14 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
* of the loop will set the free_cgroup_storage to true.
*/
free_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, true, &free_selem_list);
local_storage, selem, &free_selem_list);
}
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_selem_free_list(&free_selem_list, true);
if (free_storage)
bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
bpf_local_storage_free(local_storage, true);
}
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -782,20 +707,10 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
return usage;
}
/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
* A deadlock free allocator is useful for storage that the bpf prog can easily
* get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
* The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
* memory immediately. To be reuse-immediate safe, the owner destruction
* code path needs to go through a rcu grace period before calling
* bpf_local_storage_destroy().
*
* When bpf_ma == false, the kmalloc and kfree are used.
*/
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
struct bpf_local_storage_cache *cache,
bool bpf_ma)
bool use_kmalloc_nolock)
{
struct bpf_local_storage_map *smap;
unsigned int i;
@@ -829,20 +744,9 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
/* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
* preemptible context. Thus, enforce all storages to use
* bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled.
* kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
*/
smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma;
if (smap->bpf_ma) {
err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
if (err)
goto free_smap;
err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
if (err) {
bpf_mem_alloc_destroy(&smap->selem_ma);
goto free_smap;
}
}
smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
return &smap->map;
@@ -912,12 +816,9 @@ void bpf_local_storage_map_free(struct bpf_map *map,
*/
synchronize_rcu();
if (smap->bpf_ma) {
if (smap->use_kmalloc_nolock) {
rcu_barrier_tasks_trace();
if (!rcu_trace_implies_rcu_gp())
rcu_barrier();
bpf_mem_alloc_destroy(&smap->selem_ma);
bpf_mem_alloc_destroy(&smap->storage_ma);
rcu_barrier();
}
kvfree(smap->buckets);
bpf_map_area_free(smap);


@@ -51,6 +51,7 @@ BTF_ID(func, bpf_lsm_key_getsecurity)
BTF_ID(func, bpf_lsm_audit_rule_match)
#endif
BTF_ID(func, bpf_lsm_ismaclabel)
BTF_ID(func, bpf_lsm_file_alloc_security)
BTF_SET_END(bpf_lsm_disabled_hooks)
/* List of LSM hooks that should operate on 'current' cgroup regardless


@@ -1450,6 +1450,23 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
bpf_prog_clone_free(fp_other);
}
static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len)
{
#ifdef CONFIG_BPF_SYSCALL
struct bpf_map *map;
int i;
if (len <= 1)
return;
for (i = 0; i < prog->aux->used_map_cnt; i++) {
map = prog->aux->used_maps[i];
if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
bpf_insn_array_adjust(map, off, len);
}
#endif
}
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
{
struct bpf_insn insn_buff[16], aux[2];
@@ -1505,6 +1522,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
clone = tmp;
insn_delta = rewritten - 1;
/* Instructions arrays must be updated using absolute xlated offsets */
adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten);
/* Walk new program and skip insns we just inserted. */
insn = clone->insnsi + i + insn_delta;
insn_cnt += insn_delta;
@@ -1688,6 +1708,7 @@ bool bpf_opcode_in_insntable(u8 code)
[BPF_LD | BPF_IND | BPF_B] = true,
[BPF_LD | BPF_IND | BPF_H] = true,
[BPF_LD | BPF_IND | BPF_W] = true,
[BPF_JMP | BPF_JA | BPF_X] = true,
[BPF_JMP | BPF_JCOND] = true,
};
#undef BPF_INSN_3_TBL
@@ -3129,8 +3150,9 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
return -EFAULT;
}
int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *addr1, void *addr2)
int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
enum bpf_text_poke_type new_t, void *old_addr,
void *new_addr)
{
return -ENOTSUPP;
}


@@ -358,6 +358,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose(cbs->private_data, "(%02x) goto pc%+d\n",
insn->code, insn->off);
} else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) {
verbose(cbs->private_data, "(%02x) gotox r%d\n",
insn->code, insn->dst_reg);
} else if (insn->code == (BPF_JMP | BPF_JCOND) &&
insn->src_reg == BPF_MAY_GOTO) {
verbose(cbs->private_data, "(%02x) may_goto pc%+d\n",


@@ -215,19 +215,6 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem)
{
if (btf_record_has_field(htab->map.record, BPF_TIMER))
bpf_obj_free_timer(htab->map.record,
htab_elem_value(elem, htab->map.key_size));
if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(htab->map.record,
htab_elem_value(elem, htab->map.key_size));
if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
bpf_obj_free_task_work(htab->map.record,
htab_elem_value(elem, htab->map.key_size));
}
static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
@@ -240,7 +227,8 @@ static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
htab_free_internal_structs(htab, elem);
bpf_map_free_internal_structs(&htab->map,
htab_elem_value(elem, htab->map.key_size));
cond_resched();
}
}
@@ -669,8 +657,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
u32 hash, key_size;
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -947,15 +934,21 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
{
void *ptr;
if (!onallcpus) {
/* copy true value_size bytes */
copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
ptr = this_cpu_ptr(pptr);
copy_map_value(&htab->map, ptr, value);
bpf_obj_free_fields(htab->map.record, ptr);
} else {
u32 size = round_up(htab->map.value_size, 8);
int off = 0, cpu;
for_each_possible_cpu(cpu) {
copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
ptr = per_cpu_ptr(pptr, cpu);
copy_map_value_long(&htab->map, ptr, value + off);
bpf_obj_free_fields(htab->map.record, ptr);
off += size;
}
}
@@ -1098,8 +1091,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1206,8 +1198,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
/* unknown flags */
return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1275,8 +1266,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1338,8 +1328,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
/* unknown flags */
return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1416,8 +1405,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret;
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1452,8 +1440,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret;
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
@@ -1509,8 +1496,9 @@ static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
struct htab_elem *l;
hlist_nulls_for_each_entry(l, n, head, hash_node) {
/* We only free timer on uref dropping to zero */
htab_free_internal_structs(htab, l);
/* We only free internal structs on uref dropping to zero */
bpf_map_free_internal_structs(&htab->map,
htab_elem_value(l, htab->map.key_size));
}
cond_resched_rcu();
}
@@ -1521,13 +1509,14 @@ static void htab_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
/* We only free timer and workqueue on uref dropping to zero */
if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
if (!htab_is_prealloc(htab))
htab_free_malloced_internal_structs(htab);
else
htab_free_prealloced_internal_structs(htab);
}
/* We only free internal structs on uref dropping to zero */
if (!bpf_map_has_internal_structs(map))
return;
if (htab_is_prealloc(htab))
htab_free_prealloced_internal_structs(htab);
else
htab_free_malloced_internal_structs(htab);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */

View File

@@ -28,6 +28,7 @@
#include <linux/verification.h>
#include <linux/task_work.h>
#include <linux/irq_work.h>
#include <linux/buildid.h>
#include "../../lib/kstrtox.h"
@@ -42,8 +43,7 @@
*/
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
WARN_ON_ONCE(!bpf_rcu_lock_held());
return (unsigned long) map->ops->map_lookup_elem(map, key);
}
@@ -59,8 +59,7 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
void *, value, u64, flags)
{
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
WARN_ON_ONCE(!bpf_rcu_lock_held());
return map->ops->map_update_elem(map, key, value, flags);
}
@@ -77,8 +76,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
WARN_ON_ONCE(!bpf_rcu_lock_held());
return map->ops->map_delete_elem(map, key);
}
@@ -134,8 +132,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
{
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
!rcu_read_lock_bh_held());
WARN_ON_ONCE(!bpf_rcu_lock_held());
return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
}
@@ -777,9 +774,11 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
int nest_level;
preempt_disable();
nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
this_cpu_dec(bpf_bprintf_nest_level);
preempt_enable();
return -EBUSY;
}
*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
@@ -792,6 +791,7 @@ void bpf_put_buffers(void)
if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
return;
this_cpu_dec(bpf_bprintf_nest_level);
preempt_enable();
}
void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
@@ -1660,6 +1660,13 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
.arg2_btf_id = BPF_PTR_POISON,
};
struct bpf_dynptr_file_impl {
struct freader freader;
/* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */
u64 offset;
u64 size;
};
/* Since the upper 8 bits of dynptr->size is reserved, the
* maximum supported size is 2^24 - 1.
*/
@@ -1688,23 +1695,65 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt
return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
}
u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
{
if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
struct bpf_dynptr_file_impl *df = ptr->data;
return df->size;
}
return ptr->size & DYNPTR_SIZE_MASK;
}
static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size)
static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off)
{
if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
struct bpf_dynptr_file_impl *df = ptr->data;
df->offset += off;
return;
}
ptr->offset += off;
}
static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size)
{
u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
ptr->size = new_size | metadata;
if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
struct bpf_dynptr_file_impl *df = ptr->data;
df->size = new_size;
return;
}
ptr->size = (u32)new_size | metadata;
}
int bpf_dynptr_check_size(u32 size)
int bpf_dynptr_check_size(u64 size)
{
return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
}
static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len)
{
const void *ptr;
if (!buf)
return -EINVAL;
df->freader.buf = buf;
df->freader.buf_sz = len;
ptr = freader_fetch(&df->freader, offset + df->offset, len);
if (!ptr)
return df->freader.err;
if (ptr != buf) /* Force copying into the buffer */
memcpy(buf, ptr, len);
return 0;
}
void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
enum bpf_dynptr_type type, u32 offset, u32 size)
{
@@ -1719,7 +1768,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
memset(ptr, 0, sizeof(*ptr));
}
BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr)
{
int err;
@@ -1754,8 +1803,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
};
static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src,
u32 offset, u64 flags)
static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src,
u64 offset, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1785,14 +1834,16 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s
case BPF_DYNPTR_TYPE_SKB_META:
memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
return 0;
case BPF_DYNPTR_TYPE_FILE:
return bpf_file_fetch_bytes(src->data, offset, dst, len);
default:
WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
return -EFAULT;
}
}
BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
u32, offset, u64, flags)
BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src,
u64, offset, u64, flags)
{
return __bpf_dynptr_read(dst, len, src, offset, flags);
}
@@ -1808,8 +1859,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
.arg5_type = ARG_ANYTHING,
};
int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
u32 len, u64 flags)
int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src,
u64 len, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1852,8 +1903,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
}
}
BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
u32, len, u64, flags)
BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src,
u64, len, u64, flags)
{
return __bpf_dynptr_write(dst, offset, src, len, flags);
}
@@ -1869,7 +1920,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
.arg5_type = ARG_ANYTHING,
};
BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len)
{
enum bpf_dynptr_type type;
int err;
@@ -2684,12 +2735,12 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
void *buffer__opt, u32 buffer__szk)
__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
void *buffer__opt, u64 buffer__szk)
{
const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
enum bpf_dynptr_type type;
u32 len = buffer__szk;
u64 len = buffer__szk;
int err;
if (!ptr->data)
@@ -2723,6 +2774,9 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
}
case BPF_DYNPTR_TYPE_SKB_META:
return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
case BPF_DYNPTR_TYPE_FILE:
err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk);
return err ? NULL : buffer__opt;
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
return NULL;
@@ -2771,8 +2825,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
void *buffer__opt, u32 buffer__szk)
__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
void *buffer__opt, u64 buffer__szk)
{
const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
@@ -2804,10 +2858,10 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
}
__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end)
__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end)
{
struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
u32 size;
u64 size;
if (!ptr->data || start > end)
return -EINVAL;
@@ -2817,7 +2871,7 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end
if (start > size || end > size)
return -ERANGE;
ptr->offset += start;
bpf_dynptr_advance_offset(ptr, start);
bpf_dynptr_set_size(ptr, end - start);
return 0;
@@ -2840,7 +2894,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
return __bpf_dynptr_is_rdonly(ptr);
}
__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p)
__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p)
{
struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
@@ -2877,14 +2931,14 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
* Copies data from source dynptr to destination dynptr.
* Returns 0 on success; negative error, otherwise.
*/
__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
struct bpf_dynptr *src_ptr, u32 src_off, u32 size)
__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off,
struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
{
struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
void *src_slice, *dst_slice;
char buf[256];
u32 off;
u64 off;
src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size);
dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size);
@@ -2906,7 +2960,7 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
off = 0;
while (off < size) {
u32 chunk_sz = min_t(u32, sizeof(buf), size - off);
u64 chunk_sz = min_t(u64, sizeof(buf), size - off);
int err;
err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0);
@@ -2932,10 +2986,10 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
* at @offset with the constant byte @val.
* Returns 0 on success; negative error, otherwise.
*/
__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val)
{
__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
{
struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
u32 chunk_sz, write_off;
u64 chunk_sz, write_off;
char buf[256];
void *slice;
int err;
@@ -2954,11 +3008,11 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
return err;
/* Non-linear data under the dynptr, write from a local buffer */
chunk_sz = min_t(u32, sizeof(buf), size);
chunk_sz = min_t(u64, sizeof(buf), size);
memset(buf, val, chunk_sz);
for (write_off = 0; write_off < size; write_off += chunk_sz) {
chunk_sz = min_t(u32, sizeof(buf), size - write_off);
chunk_sz = min_t(u64, sizeof(buf), size - write_off);
err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0);
if (err)
return err;
@@ -3678,34 +3732,21 @@ err_out:
return -EFAULT;
}
/**
* bpf_strnstr - Find the first substring in a length-limited string
* @s1__ign: The string to be searched
* @s2__ign: The string to search for
* @len: the maximum number of characters to search
*
* Return:
* * >=0 - Index of the first character of the first occurrence of @s2__ign
* within the first @len characters of @s1__ign
* * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
* * %-EFAULT - Cannot read one of the strings
* * %-E2BIG - One of the strings is too large
* * %-ERANGE - One of the strings is outside of kernel address space
*/
__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len)
static int __bpf_strnstr(const char *s1, const char *s2, size_t len,
bool ignore_case)
{
char c1, c2;
int i, j;
if (!copy_from_kernel_nofault_allowed(s1__ign, 1) ||
!copy_from_kernel_nofault_allowed(s2__ign, 1)) {
if (!copy_from_kernel_nofault_allowed(s1, 1) ||
!copy_from_kernel_nofault_allowed(s2, 1)) {
return -ERANGE;
}
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
__get_kernel_nofault(&c2, s2__ign + j, char, err_out);
__get_kernel_nofault(&c2, s2 + j, char, err_out);
if (c2 == '\0')
return i;
/*
@@ -3715,7 +3756,13 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
*/
if (i + j == len)
break;
__get_kernel_nofault(&c1, s1__ign + j, char, err_out);
__get_kernel_nofault(&c1, s1 + j, char, err_out);
if (ignore_case) {
c1 = tolower(c1);
c2 = tolower(c2);
}
if (c1 == '\0')
return -ENOENT;
if (c1 != c2)
@@ -3725,7 +3772,7 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
return -E2BIG;
if (i + j == len)
return -ENOENT;
s1__ign++;
s1++;
}
return -E2BIG;
err_out:
@@ -3747,8 +3794,69 @@ err_out:
*/
__bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
{
return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false);
}
/**
* bpf_strcasestr - Find the first substring in a string, ignoring the case of
* the characters
* @s1__ign: The string to be searched
* @s2__ign: The string to search for
*
* Return:
* * >=0 - Index of the first character of the first occurrence of @s2__ign
* within @s1__ign
* * %-ENOENT - @s2__ign is not a substring of @s1__ign
* * %-EFAULT - Cannot read one of the strings
* * %-E2BIG - One of the strings is too large
* * %-ERANGE - One of the strings is outside of kernel address space
*/
__bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign)
{
return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true);
}
/**
* bpf_strnstr - Find the first substring in a length-limited string
* @s1__ign: The string to be searched
* @s2__ign: The string to search for
* @len: the maximum number of characters to search
*
* Return:
* * >=0 - Index of the first character of the first occurrence of @s2__ign
* within the first @len characters of @s1__ign
* * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
* * %-EFAULT - Cannot read one of the strings
* * %-E2BIG - One of the strings is too large
* * %-ERANGE - One of the strings is outside of kernel address space
*/
__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign,
size_t len)
{
return __bpf_strnstr(s1__ign, s2__ign, len, false);
}
/**
* bpf_strncasestr - Find the first substring in a length-limited string,
* ignoring the case of the characters
* @s1__ign: The string to be searched
* @s2__ign: The string to search for
* @len: the maximum number of characters to search
*
* Return:
* * >=0 - Index of the first character of the first occurrence of @s2__ign
* within the first @len characters of @s1__ign
* * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
* * %-EFAULT - Cannot read one of the strings
* * %-E2BIG - One of the strings is too large
* * %-ERANGE - One of the strings is outside of kernel address space
*/
__bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign,
size_t len)
{
return __bpf_strnstr(s1__ign, s2__ign, len, true);
}
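
As a hedged illustration of calling the new string kfuncs from BPF: the section, attach point and searched pattern below are assumptions, while the bpf_strncasestr() prototype mirrors the kernel signature above.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* kfunc prototype copied from the kernel signature above */
extern int bpf_strncasestr(const char *s1__ign, const char *s2__ign, size_t len) __ksym;

SEC("tp_btf/sched_process_exec")
int handle_exec(void *ctx)
{
	char comm[16] = {};

	bpf_get_current_comm(comm, sizeof(comm));
	/* case-insensitive, length-limited substring search */
	if (bpf_strncasestr(comm, "ssh", sizeof(comm)) >= 0)
		bpf_printk("ssh-like task: %s", comm);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";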
#ifdef CONFIG_KEYS
/**
* bpf_lookup_user_key - lookup a key by its serial
@@ -4206,6 +4314,54 @@ __bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task,
return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
}
static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
struct bpf_dynptr_kern *ptr)
{
struct bpf_dynptr_file_impl *state;
/* flags is currently unsupported */
if (flags) {
bpf_dynptr_set_null(ptr);
return -EINVAL;
}
state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl));
if (!state) {
bpf_dynptr_set_null(ptr);
return -ENOMEM;
}
state->offset = 0;
state->size = U64_MAX; /* Don't restrict size, as the file may change anyway */
freader_init_from_file(&state->freader, NULL, 0, file, may_sleep);
bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0);
bpf_dynptr_set_rdonly(ptr);
return 0;
}
__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
{
return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit);
}
int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
{
return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit);
}
__bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
{
struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr;
struct bpf_dynptr_file_impl *df = ptr->data;
if (!df)
return 0;
freader_cleanup(&df->freader);
bpf_mem_free(&bpf_global_ma, df);
bpf_dynptr_set_null(ptr);
return 0;
}
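
For illustration, the intended call pattern from a sleepable LSM program might look as follows; the hook, the buffer size, and whether this particular file argument satisfies KF_TRUSTED_ARGS are assumptions, while the kfunc prototypes mirror the signatures above.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

extern int bpf_dynptr_from_file(struct file *file, __u32 flags,
				struct bpf_dynptr *ptr__uninit) __ksym;
extern int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) __ksym;
extern void *bpf_dynptr_slice(const struct bpf_dynptr *p, __u64 offset,
			      void *buffer__opt, __u64 buffer__szk) __ksym;

SEC("lsm.s/file_open")
int BPF_PROG(peek_file_header, struct file *file)
{
	struct bpf_dynptr fdp;
	unsigned char buf[4];
	unsigned char *hdr;

	if (bpf_dynptr_from_file(file, 0, &fdp))
		return 0;

	/* read the first four bytes of the file through the dynptr */
	hdr = bpf_dynptr_slice(&fdp, 0, buf, sizeof(buf));
	if (hdr)
		bpf_printk("magic: %x %x", hdr[0], hdr[1]);

	bpf_dynptr_file_discard(&fdp);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";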
__bpf_kfunc_end_defs();
static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
@@ -4376,13 +4532,17 @@ BTF_ID_FLAGS(func, bpf_strnlen);
BTF_ID_FLAGS(func, bpf_strspn);
BTF_ID_FLAGS(func, bpf_strcspn);
BTF_ID_FLAGS(func, bpf_strstr);
BTF_ID_FLAGS(func, bpf_strcasestr);
BTF_ID_FLAGS(func, bpf_strnstr);
BTF_ID_FLAGS(func, bpf_strncasestr);
#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
@@ -4423,7 +4583,7 @@ late_initcall(kfunc_init);
/* Get a pointer to dynptr data up to len bytes for read only access. If
* the dynptr doesn't have continuous data up to len bytes, return NULL.
*/
const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len)
{
const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
@@ -4434,9 +4594,19 @@ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
* the dynptr doesn't have continuous data up to len bytes, or the dynptr
* is read only, return NULL.
*/
void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len)
void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len)
{
if (__bpf_dynptr_is_rdonly(ptr))
return NULL;
return (void *)__bpf_dynptr_data(ptr, len);
}
void bpf_map_free_internal_structs(struct bpf_map *map, void *val)
{
if (btf_record_has_field(map->record, BPF_TIMER))
bpf_obj_free_timer(map->record, val);
if (btf_record_has_field(map->record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(map->record, val);
if (btf_record_has_field(map->record, BPF_TASK_WORK))
bpf_obj_free_task_work(map->record, val);
}
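
As a concrete illustration of what counts as an "internal struct" here: a map value embedding a bpf_timer (or a bpf_wq / task work field) is the typical case, and these are the fields the helper releases once the last user reference to the map is dropped. The struct layout and map name below are illustrative:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

struct elem {
	__u64 hits;
	struct bpf_timer timer;	/* released via bpf_obj_free_timer() */
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 128);
	__type(key, __u32);
	__type(value, struct elem);
} timer_map SEC(".maps");

char LICENSE[] SEC("license") = "GPL";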


@@ -34,7 +34,7 @@
* - read and write marks propagation.
* - The propagation phase is a textbook live variable data flow analysis:
*
* state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)]
* state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)]
* state[cc, i].live_before =
* (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read
*
@@ -54,7 +54,7 @@
* The equation for "must_write_acc" propagation looks as follows:
*
* state[cc, i].must_write_acc =
* [state[cc, s].must_write_acc for s in insn_successors(i)]
* [state[cc, s].must_write_acc for s in bpf_insn_successors(i)]
* U state[cc, i].must_write
*
* (An intersection of all "must_write_acc" for instruction successors
@@ -447,7 +447,12 @@ int bpf_jmp_offset(struct bpf_insn *insn)
__diag_push();
__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl");
inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
/*
* Returns an array succ of successor instructions: succ->items[0], ...,
* succ->items[n-1], where n = succ->cnt.
*/
inline struct bpf_iarray *
bpf_insn_successors(struct bpf_verifier_env *env, u32 idx)
{
static const struct opcode_info {
bool can_jump;
@@ -474,19 +479,29 @@ inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
_J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}),
#undef _J
};
struct bpf_prog *prog = env->prog;
struct bpf_insn *insn = &prog->insnsi[idx];
const struct opcode_info *opcode_info;
int i = 0, insn_sz;
struct bpf_iarray *succ, *jt;
int insn_sz;
jt = env->insn_aux_data[idx].jt;
if (unlikely(jt))
return jt;
/* pre-allocated array of size up to 2; reset cnt, as it may have been used already */
succ = env->succ;
succ->cnt = 0;
opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)];
insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
if (opcode_info->can_fallthrough)
succ[i++] = idx + insn_sz;
succ->items[succ->cnt++] = idx + insn_sz;
if (opcode_info->can_jump)
succ[i++] = idx + bpf_jmp_offset(insn) + 1;
succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1;
return i;
return succ;
}
__diag_pop();
@@ -524,6 +539,8 @@ static int propagate_to_outer_instance(struct bpf_verifier_env *env,
this_subprog_start = callchain_subprog_start(callchain);
outer_instance = get_outer_instance(env, instance);
if (IS_ERR(outer_instance))
return PTR_ERR(outer_instance);
callsite = callchain->callsites[callchain->curframe - 1];
reset_stack_write_marks(env, outer_instance, callsite);
@@ -546,11 +563,12 @@ static inline bool update_insn(struct bpf_verifier_env *env,
struct bpf_insn_aux_data *aux = env->insn_aux_data;
u64 new_before, new_after, must_write_acc;
struct per_frame_masks *insn, *succ_insn;
u32 succ_num, s, succ[2];
struct bpf_iarray *succ;
u32 s;
bool changed;
succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
if (unlikely(succ_num == 0))
succ = bpf_insn_successors(env, insn_idx);
if (succ->cnt == 0)
return false;
changed = false;
@@ -562,8 +580,8 @@ static inline bool update_insn(struct bpf_verifier_env *env,
* of successors plus all "must_write" slots of instruction itself.
*/
must_write_acc = U64_MAX;
for (s = 0; s < succ_num; ++s) {
succ_insn = get_frame_masks(instance, frame, succ[s]);
for (s = 0; s < succ->cnt; ++s) {
succ_insn = get_frame_masks(instance, frame, succ->items[s]);
new_after |= succ_insn->live_before;
must_write_acc &= succ_insn->must_write_acc;
}


@@ -461,6 +461,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
[PTR_TO_ARENA] = "arena",
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
[PTR_TO_INSN] = "insn",
[PTR_TO_MAP_KEY] = "map_key",
[CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
};
@@ -500,6 +501,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type)
return "xdp";
case BPF_DYNPTR_TYPE_SKB_META:
return "skb_meta";
case BPF_DYNPTR_TYPE_FILE:
return "file";
case BPF_DYNPTR_TYPE_INVALID:
return "<invalid>";
default:


@@ -2,7 +2,6 @@
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/interval_tree_generic.h>
#include <linux/slab.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/bpf.h>
#include "range_tree.h"
@@ -21,7 +20,7 @@
* in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree").
*
* The implementation relies on external lock to protect rbtree-s.
* The alloc/free of range_node-s is done via bpf_mem_alloc.
* The alloc/free of range_node-s is done via kmalloc_nolock().
*
* bpf arena is using range_tree to represent unallocated slots.
* At init time:
@@ -150,9 +149,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
range_it_insert(rn, rt);
/* Add a range */
migrate_disable();
new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
migrate_enable();
new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
if (!new_rn)
return -ENOMEM;
new_rn->rn_start = last + 1;
@@ -172,9 +169,7 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
} else {
/* in the middle of the clearing range */
range_it_remove(rn, rt);
migrate_disable();
bpf_mem_free(&bpf_global_ma, rn);
migrate_enable();
kfree_nolock(rn);
}
}
return 0;
@@ -227,9 +222,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len)
range_it_remove(right, rt);
left->rn_last = right->rn_last;
range_it_insert(left, rt);
migrate_disable();
bpf_mem_free(&bpf_global_ma, right);
migrate_enable();
kfree_nolock(right);
} else if (left) {
/* Combine with the left range */
range_it_remove(left, rt);
@@ -241,9 +234,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len)
right->rn_start = start;
range_it_insert(right, rt);
} else {
migrate_disable();
left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
migrate_enable();
left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
if (!left)
return -ENOMEM;
left->rn_start = start;
@@ -259,7 +250,7 @@ void range_tree_destroy(struct range_tree *rt)
while ((rn = range_it_iter_first(rt, 0, -1U))) {
range_it_remove(rn, rt);
bpf_mem_free(&bpf_global_ma, rn);
kfree_nolock(rn);
}
}


@@ -13,7 +13,7 @@
#include <linux/btf_ids.h>
#include <asm/rqspinlock.h>
#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE)
/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
@@ -30,6 +30,7 @@ struct bpf_ringbuf {
u64 mask;
struct page **pages;
int nr_pages;
bool overwrite_mode;
rqspinlock_t spinlock ____cacheline_aligned_in_smp;
/* For user-space producer ring buffers, an atomic_t busy bit is used
* to synchronize access to the ring buffers in the kernel, rather than
@@ -73,6 +74,7 @@ struct bpf_ringbuf {
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
unsigned long pending_pos;
unsigned long overwrite_pos; /* position after the last overwritten record */
char data[] __aligned(PAGE_SIZE);
};
@@ -166,7 +168,7 @@ static void bpf_ringbuf_notify(struct irq_work *work)
* considering that the maximum value of data_sz is (4GB - 1), there
* will be no overflow, so just note the size limit in the comments.
*/
static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode)
{
struct bpf_ringbuf *rb;
@@ -183,17 +185,25 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
rb->consumer_pos = 0;
rb->producer_pos = 0;
rb->pending_pos = 0;
rb->overwrite_mode = overwrite_mode;
return rb;
}
static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
bool overwrite_mode = false;
struct bpf_ringbuf_map *rb_map;
if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
if (attr->map_flags & BPF_F_RB_OVERWRITE) {
if (attr->map_type != BPF_MAP_TYPE_RINGBUF)
return ERR_PTR(-EINVAL);
overwrite_mode = true;
}
if (attr->key_size || attr->value_size ||
!is_power_of_2(attr->max_entries) ||
!PAGE_ALIGNED(attr->max_entries))
@@ -205,7 +215,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
bpf_map_init_from_attr(&rb_map->map, attr);
rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode);
if (!rb_map->rb) {
bpf_map_area_free(rb_map);
return ERR_PTR(-ENOMEM);
@@ -295,13 +305,26 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma
return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
}
/*
* Return an estimate of the available data in the ring buffer.
* Note: the returned value can exceed the actual ring buffer size because the
* function is not synchronized with the producer. The producer acquires the
* ring buffer's spinlock, but this function does not.
*/
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
unsigned long cons_pos, prod_pos;
unsigned long cons_pos, prod_pos, over_pos;
cons_pos = smp_load_acquire(&rb->consumer_pos);
prod_pos = smp_load_acquire(&rb->producer_pos);
return prod_pos - cons_pos;
if (unlikely(rb->overwrite_mode)) {
over_pos = smp_load_acquire(&rb->overwrite_pos);
prod_pos = smp_load_acquire(&rb->producer_pos);
return prod_pos - max(cons_pos, over_pos);
} else {
prod_pos = smp_load_acquire(&rb->producer_pos);
return prod_pos - cons_pos;
}
}
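
A small stand-alone check of the overwrite-mode formula above: when the consumer has stalled while the producer wrapped around, available data is counted from overwrite_pos rather than consumer_pos. The positions below are made-up illustrative values:

#include <assert.h>

static unsigned long avail(unsigned long cons, unsigned long over, unsigned long prod)
{
	/* prod_pos - max(cons_pos, over_pos) */
	return prod - (cons > over ? cons : over);
}

int main(void)
{
	/* 4 KiB ring: consumer stuck at 0, producer wrapped past 12 KiB */
	assert(avail(0, 8192, 12288) == 4096);
	/* consumer has caught up past the overwritten region */
	assert(avail(9000, 8192, 12288) == 3288);
	return 0;
}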
static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
@@ -404,11 +427,43 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
return (void*)((addr & PAGE_MASK) - off);
}
static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb,
unsigned long new_prod_pos,
unsigned long cons_pos,
unsigned long pend_pos)
{
/*
* No space if the span from the oldest not-yet-committed record to the
* newest record exceeds (ringbuf_size - 1).
*/
if (new_prod_pos - pend_pos > rb->mask)
return false;
/* Ok, we have space in overwrite mode */
if (unlikely(rb->overwrite_mode))
return true;
/*
* No space if producer position advances more than (ringbuf_size - 1)
* ahead of consumer position when not in overwrite mode.
*/
if (new_prod_pos - cons_pos > rb->mask)
return false;
return true;
}
static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len)
{
hdr_len &= ~BPF_RINGBUF_DISCARD_BIT;
return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8);
}
static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags;
unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags;
struct bpf_ringbuf_hdr *hdr;
u32 len, pg_off, tmp_size, hdr_len;
u32 len, pg_off, hdr_len;
if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
return NULL;
@@ -431,24 +486,43 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
hdr_len = READ_ONCE(hdr->len);
if (hdr_len & BPF_RINGBUF_BUSY_BIT)
break;
tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT;
tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8);
pend_pos += tmp_size;
pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
}
rb->pending_pos = pend_pos;
/* check for out of ringbuf space:
* - by ensuring producer position doesn't advance more than
* (ringbuf_size - 1) ahead
* - by ensuring oldest not yet committed record until newest
* record does not span more than (ringbuf_size - 1)
*/
if (new_prod_pos - cons_pos > rb->mask ||
new_prod_pos - pend_pos > rb->mask) {
if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) {
raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
return NULL;
}
/*
* In overwrite mode, advance overwrite_pos when the ring buffer is full.
* The key points are to stay on record boundaries and consume enough records
* to fit the new one.
*/
if (unlikely(rb->overwrite_mode)) {
over_pos = rb->overwrite_pos;
while (new_prod_pos - over_pos > rb->mask) {
hdr = (void *)rb->data + (over_pos & rb->mask);
hdr_len = READ_ONCE(hdr->len);
/*
* The bpf_ringbuf_has_space() check above ensures we won't
* step over a record currently being worked on by another
* producer.
*/
over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
}
/*
* smp_store_release(&rb->producer_pos, new_prod_pos) at
* the end of the function ensures that when consumer sees
* the updated rb->producer_pos, it always sees the updated
* rb->overwrite_pos, so when consumer reads overwrite_pos
* after smp_load_acquire(r->producer_pos), the overwrite_pos
* will always be valid.
*/
WRITE_ONCE(rb->overwrite_pos, over_pos);
}
hdr = (void *)rb->data + (prod_pos & rb->mask);
pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
hdr->len = size | BPF_RINGBUF_BUSY_BIT;
@@ -578,6 +652,8 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
return smp_load_acquire(&rb->consumer_pos);
case BPF_RB_PROD_POS:
return smp_load_acquire(&rb->producer_pos);
case BPF_RB_OVERWRITE_POS:
return smp_load_acquire(&rb->overwrite_pos);
default:
return 0;
}
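
A BPF-side sketch of producing into an overwrite-mode ring buffer and reading back the new position; the attach point and record layout are assumptions, while BPF_F_RB_OVERWRITE and BPF_RB_OVERWRITE_POS are the flag and query added above (a uapi/vmlinux.h new enough to carry them is assumed):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 4096);		/* power-of-2, page-aligned */
	__uint(map_flags, BPF_F_RB_OVERWRITE);	/* oldest records are overwritten when full */
} events SEC(".maps");

SEC("tp_btf/sched_process_exec")
int record_exec(void *ctx)
{
	__u64 *slot;

	slot = bpf_ringbuf_reserve(&events, sizeof(*slot), 0);
	if (!slot)
		return 0;
	*slot = bpf_ktime_get_ns();
	bpf_ringbuf_submit(slot, 0);

	/* position right after the last overwritten record */
	bpf_printk("overwrite_pos=%llu", bpf_ringbuf_query(&events, BPF_RB_OVERWRITE_POS));
	return 0;
}

char LICENSE[] SEC("license") = "GPL";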


@@ -89,15 +89,14 @@ struct rqspinlock_timeout {
DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
EXPORT_SYMBOL_GPL(rqspinlock_held_locks);
static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts)
static bool is_lock_released(rqspinlock_t *lock, u32 mask)
{
if (!(atomic_read_acquire(&lock->val) & (mask)))
return true;
return false;
}
static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
struct rqspinlock_timeout *ts)
static noinline int check_deadlock_AA(rqspinlock_t *lock)
{
struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
int cnt = min(RES_NR_HELD, rqh->cnt);
@@ -118,8 +117,7 @@ static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
* more locks, which reduce to ABBA). This is not exhaustive, and we rely on
* timeouts as the final line of defense.
*/
static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
struct rqspinlock_timeout *ts)
static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
{
struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
@@ -142,7 +140,7 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
* Let's ensure to break out of this loop if the lock is available for
* us to potentially acquire.
*/
if (is_lock_released(lock, mask, ts))
if (is_lock_released(lock, mask))
return 0;
/*
@@ -198,33 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
return 0;
}
static noinline int check_deadlock(rqspinlock_t *lock, u32 mask,
struct rqspinlock_timeout *ts)
{
int ret;
ret = check_deadlock_AA(lock, mask, ts);
if (ret)
return ret;
ret = check_deadlock_ABBA(lock, mask, ts);
if (ret)
return ret;
return 0;
}
static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
struct rqspinlock_timeout *ts)
{
u64 time = ktime_get_mono_fast_ns();
u64 prev = ts->cur;
u64 time;
if (!ts->timeout_end) {
ts->cur = time;
ts->timeout_end = time + ts->duration;
if (check_deadlock_AA(lock))
return -EDEADLK;
ts->cur = ktime_get_mono_fast_ns();
ts->timeout_end = ts->cur + ts->duration;
return 0;
}
time = ktime_get_mono_fast_ns();
if (time > ts->timeout_end)
return -ETIMEDOUT;
@@ -234,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
*/
if (prev + NSEC_PER_MSEC < time) {
ts->cur = time;
return check_deadlock(lock, mask, ts);
return check_deadlock_ABBA(lock, mask);
}
return 0;
@@ -278,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
int val, ret = 0;
RES_INIT_TIMEOUT(ts);
/*
* The fast path is not invoked for the TAS fallback, so we must grab
* the deadlock detection entry here.
*/
grab_held_lock_entry(lock);
/*
@@ -400,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
goto queue;
}
/*
* Grab an entry in the held locks array, to enable deadlock detection.
*/
grab_held_lock_entry(lock);
/* Deadlock detection entry already held after failing fast path. */
/*
* We're pending, wait for the owner to go away.
@@ -450,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
* queuing.
*/
queue:
lockevent_inc(lock_slowpath);
/*
* Grab deadlock detection entry for the queue path.
* Do not queue if we're a waiter and someone is attempting this lock on
* the same CPU. In case of NMIs, this prevents long timeouts where we
* interrupt the pending waiter, and the owner, that will eventually
* signal the head of our queue, both of which are logically but not
* physically part of the queue, hence outside the scope of the idx > 0
* check above for the trylock fallback.
*/
grab_held_lock_entry(lock);
if (check_deadlock_AA(lock)) {
ret = -EDEADLK;
goto err_release_entry;
}
lockevent_inc(lock_slowpath);
/* Deadlock detection entry already held after failing fast path. */
node = this_cpu_ptr(&rqnodes[0].mcs);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
@@ -467,19 +463,17 @@ queue:
* not be nested NMIs taking spinlocks. That may not be true in
* some architectures even though the chance of needing more than
* 4 nodes will still be extremely unlikely. When that happens,
* we fall back to spinning on the lock directly without using
* any MCS node. This is not the most elegant solution, but is
* simple enough.
* we fall back to attempting a trylock operation without using
* any MCS node. Unlike qspinlock which cannot fail, we have the
* option of failing the slow path, and under contention, such a
* trylock spinning will likely be treated unfairly due to lack of
* queueing, hence do not spin.
*/
if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
lockevent_inc(lock_no_node);
RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
while (!queued_spin_trylock(lock)) {
if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
lockevent_inc(rqspinlock_lock_timeout);
goto err_release_node;
}
cpu_relax();
if (!queued_spin_trylock(lock)) {
ret = -EDEADLK;
goto err_release_node;
}
goto release;
}
@@ -540,7 +534,7 @@ queue:
val = arch_mcs_spin_lock_contended(&node->locked);
if (val == RES_TIMEOUT_VAL) {
ret = -EDEADLK;
ret = -ETIMEDOUT;
goto waitq_timeout;
}
@@ -575,6 +569,14 @@ queue:
val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
/* Disable queue destruction when we detect deadlocks. */
if (ret == -EDEADLK) {
if (!next)
next = smp_cond_load_relaxed(&node->next, (VAL));
arch_mcs_spin_unlock_contended(&next->locked);
goto err_release_node;
}
waitq_timeout:
if (ret) {
/*

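Because rqspinlock acquisitions can fail with -EDEADLK (AA/ABBA detection) or -ETIMEDOUT instead of spinning forever, every call site has to check the return value. A minimal caller-side sketch of that contract (the function is invented for illustration and assumes the rqspinlock header is available):

#include <asm-generic/rqspinlock.h>

static int demo_critical_section(rqspinlock_t *lock)
{
        unsigned long flags;
        int ret;

        ret = raw_res_spin_lock_irqsave(lock, flags);
        if (ret)
                return ret;     /* -EDEADLK or -ETIMEDOUT: back off instead of hanging */

        /* ... work under the lock ... */

        raw_res_spin_unlock_irqrestore(lock, flags);
        return 0;
}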

@@ -42,6 +42,28 @@ static inline int stack_map_data_size(struct bpf_map *map)
sizeof(struct bpf_stack_build_id) : sizeof(u64);
}
/**
* stack_map_calculate_max_depth - Calculate maximum allowed stack trace depth
* @size: Size of the buffer/map value in bytes
* @elem_size: Size of each stack trace element
* @flags: BPF stack trace flags (BPF_F_USER_STACK, BPF_F_USER_BUILD_ID, ...)
*
* Return: Maximum number of stack trace entries that can be safely stored
*/
static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags)
{
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 max_depth;
u32 curr_sysctl_max_stack = READ_ONCE(sysctl_perf_event_max_stack);
max_depth = size / elem_size;
max_depth += skip;
if (max_depth > curr_sysctl_max_stack)
return curr_sysctl_max_stack;
return max_depth;
}
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
u64 elem_size = sizeof(struct stack_map_bucket) +
@@ -229,8 +251,8 @@ static long __bpf_get_stackid(struct bpf_map *map,
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 hash, id, trace_nr, trace_len, i, max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len, i;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
@@ -239,7 +261,8 @@ static long __bpf_get_stackid(struct bpf_map *map,
/* skipping more than usable stack trace */
return -EFAULT;
trace_nr = trace->nr - skip;
max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags);
trace_nr = min_t(u32, trace->nr - skip, max_depth - skip);
trace_len = trace_nr * sizeof(u64);
ips = trace->ip + skip;
hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
@@ -300,20 +323,17 @@ static long __bpf_get_stackid(struct bpf_map *map,
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
u32 max_depth = map->value_size / stack_map_data_size(map);
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 elem_size = stack_map_data_size(map);
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
bool kernel = !user;
u32 max_depth;
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
return -EINVAL;
max_depth += skip;
if (max_depth > sysctl_perf_event_max_stack)
max_depth = sysctl_perf_event_max_stack;
max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags);
trace = get_perf_callchain(regs, kernel, user, max_depth,
false, false, 0);
@@ -371,15 +391,11 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
return -EFAULT;
nr_kernel = count_kernel_ip(trace);
__u64 nr = trace->nr; /* save original */
if (kernel) {
__u64 nr = trace->nr;
trace->nr = nr_kernel;
ret = __bpf_get_stackid(map, trace, flags);
/* restore nr */
trace->nr = nr;
} else { /* user */
u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -390,6 +406,10 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
ret = __bpf_get_stackid(map, trace, flags);
}
/* restore nr */
trace->nr = nr;
return ret;
}
@@ -406,7 +426,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
void *buf, u32 size, u64 flags, bool may_fault)
{
u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
u32 trace_nr, copy_len, elem_size, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
bool crosstask = task && task != current;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -438,21 +458,20 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
goto clear;
}
num_elem = size / elem_size;
max_depth = num_elem + skip;
if (sysctl_perf_event_max_stack < max_depth)
max_depth = sysctl_perf_event_max_stack;
max_depth = stack_map_calculate_max_depth(size, elem_size, flags);
if (may_fault)
rcu_read_lock(); /* need RCU for perf's callchain below */
if (trace_in)
if (trace_in) {
trace = trace_in;
else if (kernel && task)
trace->nr = min_t(u32, trace->nr, max_depth);
} else if (kernel && task) {
trace = get_callchain_entry_for_task(task, max_depth);
else
} else {
trace = get_perf_callchain(regs, kernel, user, max_depth,
crosstask, false, 0);
}
if (unlikely(!trace) || trace->nr < skip) {
if (may_fault)
@@ -461,7 +480,6 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
}
trace_nr = trace->nr - skip;
trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
ips = trace->ip + skip;

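To make the clamp in stack_map_calculate_max_depth() above concrete, here is a small standalone sketch (not from the patch) with illustrative numbers: a 1016-byte map value of 8-byte entries, a skip of 3, and perf_event_max_stack at its usual default of 127.

static u32 example_max_depth(void)
{
        u32 size = 1016, elem_size = 8, skip = 3;
        u32 sysctl_cap = 127;                   /* sysctl_perf_event_max_stack */
        u32 max_depth = size / elem_size + skip;        /* 127 + 3 = 130 */

        if (max_depth > sysctl_cap)
                max_depth = sysctl_cap;         /* clamped back to 127 */

        /* get_perf_callchain() is then asked for 127 frames; after dropping
         * the 3 skipped ones, at most 124 entries land in the map value.
         */
        return max_depth;
}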

@@ -4,111 +4,10 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/percpu.h>
#include <linux/refcount.h>
#include <linux/gfp.h>
#include <linux/memory.h>
#include <linux/local_lock.h>
#include <linux/mutex.h>
/*
* Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe
* try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and
* stash it in a local per-CPU variable, and bump allocate from the page
* whenever items need to be printed to a stream. Each page holds a global
* atomic refcount in its first 4 bytes, and then records of variable length
* that describe the printed messages. Once the global refcount has dropped to
* zero, it is a signal to free the page back to the kernel's page allocator,
* given all the individual records in it have been consumed.
*
* It is possible the same page is used to serve allocations across different
* programs, which may be consumed at different times individually, hence
* maintaining a reference count per-page is critical for correct lifetime
* tracking.
*
* The bpf_stream_page code will be replaced to use kmalloc_nolock() once it
* lands.
*/
struct bpf_stream_page {
refcount_t ref;
u32 consumed;
char buf[];
};
/* Available room to add data to a refcounted page. */
#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed))
static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock);
static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page);
static bool bpf_stream_page_local_lock(unsigned long *flags)
{
return local_trylock_irqsave(&stream_local_lock, *flags);
}
static void bpf_stream_page_local_unlock(unsigned long *flags)
{
local_unlock_irqrestore(&stream_local_lock, *flags);
}
static void bpf_stream_page_free(struct bpf_stream_page *stream_page)
{
struct page *p;
if (!stream_page)
return;
p = virt_to_page(stream_page);
free_pages_nolock(p, 0);
}
static void bpf_stream_page_get(struct bpf_stream_page *stream_page)
{
refcount_inc(&stream_page->ref);
}
static void bpf_stream_page_put(struct bpf_stream_page *stream_page)
{
if (refcount_dec_and_test(&stream_page->ref))
bpf_stream_page_free(stream_page);
}
static void bpf_stream_page_init(struct bpf_stream_page *stream_page)
{
refcount_set(&stream_page->ref, 1);
stream_page->consumed = 0;
}
static struct bpf_stream_page *bpf_stream_page_replace(void)
{
struct bpf_stream_page *stream_page, *old_stream_page;
struct page *page;
page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0);
if (!page)
return NULL;
stream_page = page_address(page);
bpf_stream_page_init(stream_page);
old_stream_page = this_cpu_read(stream_pcpu_page);
if (old_stream_page)
bpf_stream_page_put(old_stream_page);
this_cpu_write(stream_pcpu_page, stream_page);
return stream_page;
}
static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len)
{
int min = offsetof(struct bpf_stream_elem, str[0]);
int consumed = stream_page->consumed;
int total = BPF_STREAM_PAGE_SZ;
int rem = max(0, total - consumed - min);
/* Let's give room of at least 8 bytes. */
WARN_ON_ONCE(rem % 8 != 0);
rem = rem < 8 ? 0 : rem;
return min(len, rem);
}
static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
{
init_llist_node(&elem->node);
@@ -116,54 +15,12 @@ static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
elem->consumed_len = 0;
}
static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem)
{
unsigned long addr = (unsigned long)elem;
return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr);
}
static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len)
{
u32 consumed = stream_page->consumed;
stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8);
return (struct bpf_stream_elem *)&stream_page->buf[consumed];
}
static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len)
{
struct bpf_stream_elem *elem = NULL;
struct bpf_stream_page *page;
int room = 0;
page = this_cpu_read(stream_pcpu_page);
if (!page)
page = bpf_stream_page_replace();
if (!page)
return NULL;
room = bpf_stream_page_check_room(page, len);
if (room != len)
page = bpf_stream_page_replace();
if (!page)
return NULL;
bpf_stream_page_get(page);
room = bpf_stream_page_check_room(page, len);
WARN_ON_ONCE(room != len);
elem = bpf_stream_page_push_elem(page, room);
bpf_stream_elem_init(elem, room);
return elem;
}
static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
{
const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf);
struct bpf_stream_elem *elem;
unsigned long flags;
size_t alloc_size;
BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ);
/*
* Length denotes the amount of data to be written as part of stream element,
* thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can
@@ -172,10 +29,13 @@ static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
if (len < 0 || len > max_len)
return NULL;
if (!bpf_stream_page_local_lock(&flags))
alloc_size = offsetof(struct bpf_stream_elem, str[len]);
elem = kmalloc_nolock(alloc_size, __GFP_ZERO, -1);
if (!elem)
return NULL;
elem = bpf_stream_page_reserve_elem(len);
bpf_stream_page_local_unlock(&flags);
bpf_stream_elem_init(elem, len);
return elem;
}
@@ -231,10 +91,7 @@ static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bp
static void bpf_stream_free_elem(struct bpf_stream_elem *elem)
{
struct bpf_stream_page *p;
p = bpf_stream_page_from_elem(elem);
bpf_stream_page_put(p);
kfree_nolock(elem);
}
static void bpf_stream_free_list(struct llist_node *list)

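The net effect of the hunk above is that stream elements become plain kmalloc_nolock()/kfree_nolock() objects instead of bump allocations carved out of refcounted per-CPU pages. A tiny hedged sketch of the new pairing (the helpers below are invented; the third kmalloc_nolock() argument is the NUMA node, -1 meaning any node):

static struct bpf_stream_elem *demo_alloc_elem(int len)
{
        size_t sz = offsetof(struct bpf_stream_elem, str[len]);

        /* Callable from any context, may fail; __GFP_ZERO returns zeroed memory. */
        return kmalloc_nolock(sz, __GFP_ZERO, -1);
}

/* ... and the matching release once the element has been consumed: */
static void demo_free_elem(struct bpf_stream_elem *elem)
{
        kfree_nolock(elem);
}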

@@ -158,7 +158,7 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
*/
if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
synchronize_rcu();
synchronize_rcu_expedited();
}
static void unpin_uptr_kaddr(void *kaddr)
@@ -1493,6 +1493,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_MAP_TYPE_STRUCT_OPS:
case BPF_MAP_TYPE_CPUMAP:
case BPF_MAP_TYPE_ARENA:
case BPF_MAP_TYPE_INSN_ARRAY:
if (!bpf_token_capable(token, CAP_BPF))
goto put_token;
break;
@@ -1585,7 +1586,8 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
goto free_map;
}
} else if (attr->excl_prog_hash_size) {
return -EINVAL;
err = -EINVAL;
goto free_map;
}
err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
@@ -1724,9 +1726,6 @@ static int map_lookup_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
if (attr->flags & ~BPF_F_LOCK)
return -EINVAL;
CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
@@ -1734,9 +1733,9 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
return -EPERM;
if ((attr->flags & BPF_F_LOCK) &&
!btf_record_has_field(map->record, BPF_SPIN_LOCK))
return -EINVAL;
err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
if (err)
return err;
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key))
@@ -1799,11 +1798,9 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
if ((attr->flags & BPF_F_LOCK) &&
!btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
err = bpf_map_check_op_flags(map, attr->flags, ~0);
if (err)
goto err_put;
}
key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
@@ -2007,13 +2004,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
void *key, *value;
int err = 0;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
!btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
if (err)
return err;
value_size = bpf_map_value_size(map);
@@ -2070,12 +2063,9 @@ int generic_map_lookup_batch(struct bpf_map *map,
u32 value_size, cp, max_count;
int err;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
!btf_record_has_field(map->record, BPF_SPIN_LOCK))
return -EINVAL;
err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
if (err)
return err;
value_size = bpf_map_value_size(map);
@@ -2462,6 +2452,9 @@ void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
struct bpf_prog_stats *stats;
unsigned int flags;
if (unlikely(!prog->stats))
return;
stats = this_cpu_ptr(prog->stats);
flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->misses);
@@ -2853,6 +2846,23 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
return err;
}
static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog)
{
int err;
int i;
for (i = 0; i < prog->aux->used_map_cnt; i++) {
if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY)
continue;
err = bpf_insn_array_ready(prog->aux->used_maps[i]);
if (err)
return err;
}
return 0;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD keyring_id
@@ -3082,6 +3092,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (err < 0)
goto free_used_maps;
err = bpf_prog_mark_insn_arrays_ready(prog);
if (err < 0)
goto free_used_maps;
err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;
@@ -5034,19 +5048,19 @@ static int bpf_prog_get_info_by_fd(struct file *file,
struct bpf_insn *insns_sanitized;
bool fault;
if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) {
insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
if (!insns_sanitized)
return -ENOMEM;
uinsns = u64_to_user_ptr(info.xlated_prog_insns);
ulen = min_t(u32, info.xlated_prog_len, ulen);
fault = copy_to_user(uinsns, insns_sanitized, ulen);
kfree(insns_sanitized);
if (fault)
return -EFAULT;
} else {
info.xlated_prog_insns = 0;
goto done;
}
insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
if (!insns_sanitized)
return -ENOMEM;
uinsns = u64_to_user_ptr(info.xlated_prog_insns);
ulen = min_t(u32, info.xlated_prog_len, ulen);
fault = copy_to_user(uinsns, insns_sanitized, ulen);
kfree(insns_sanitized);
if (fault)
return -EFAULT;
}
if (bpf_prog_is_offloaded(prog->aux)) {

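The BPF_F_LOCK checks folded into bpf_map_check_op_flags() here presumably keep the same user-visible rule as the open-coded versions they replace: the flag is only accepted when the map value embeds a struct bpf_spin_lock. A hedged sketch of a map layout that satisfies it (names invented for the example):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct locked_val {
        struct bpf_spin_lock lock;
        __u64 counter;
};

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 128);
        __type(key, __u32);
        __type(value, struct locked_val);
} locked_map SEC(".maps");

/* Userspace can then do locked accesses, e.g.:
 *   bpf_map_update_elem(map_fd, &key, &val, BPF_F_LOCK);
 *   bpf_map_lookup_elem_flags(map_fd, &key, &val, BPF_F_LOCK);
 */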

@@ -175,23 +175,42 @@ out:
return tr;
}
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flags,
void *old_addr, void *new_addr)
{
enum bpf_text_poke_type new_t = BPF_MOD_CALL, old_t = BPF_MOD_CALL;
void *ip = tr->func.addr;
if (!new_addr)
new_t = BPF_MOD_NOP;
else if (bpf_trampoline_use_jmp(tr->flags))
new_t = BPF_MOD_JUMP;
if (!old_addr)
old_t = BPF_MOD_NOP;
else if (bpf_trampoline_use_jmp(orig_flags))
old_t = BPF_MOD_JUMP;
return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr);
}
static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
void *old_addr)
{
int ret;
if (tr->func.ftrace_managed)
ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL);
return ret;
}
static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr,
static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
void *old_addr, void *new_addr,
bool lock_direct_mutex)
{
void *ip = tr->func.addr;
int ret;
if (tr->func.ftrace_managed) {
@@ -200,7 +219,8 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
else
ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr,
new_addr);
}
return ret;
}
@@ -220,10 +240,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
if (tr->func.ftrace_managed) {
ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
if (ret)
return ret;
ret = register_ftrace_direct(tr->fops, (long)new_addr);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr);
}
return ret;
@@ -334,8 +356,9 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
* call_rcu_tasks() is not necessary.
*/
if (im->ip_after_call) {
int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
NULL, im->ip_epilogue);
int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_NOP,
BPF_MOD_JUMP, NULL,
im->ip_epilogue);
WARN_ON(err);
if (IS_ENABLED(CONFIG_TASKS_RCU))
call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
@@ -408,7 +431,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
return PTR_ERR(tlinks);
if (total == 0) {
err = unregister_fentry(tr, tr->cur_image->image);
err = unregister_fentry(tr, orig_flags, tr->cur_image->image);
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = NULL;
goto out;
@@ -432,9 +455,20 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
again:
if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) &&
(tr->flags & BPF_TRAMP_F_CALL_ORIG))
tr->flags |= BPF_TRAMP_F_ORIG_STACK;
if (tr->flags & BPF_TRAMP_F_CALL_ORIG) {
if (tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) {
/* The BPF_TRAMP_F_SKIP_FRAME can be cleared in the
* first try, so reset it in the second try.
*/
tr->flags |= BPF_TRAMP_F_ORIG_STACK | BPF_TRAMP_F_SKIP_FRAME;
} else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_JMP)) {
/* Use "jmp" instead of "call" for the trampoline
* in the origin call case, and we don't need to
* skip the frame.
*/
tr->flags &= ~BPF_TRAMP_F_SKIP_FRAME;
}
}
#endif
size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
@@ -465,10 +499,18 @@ again:
if (err)
goto out_free;
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
if (bpf_trampoline_use_jmp(tr->flags))
tr->fops->flags |= FTRACE_OPS_FL_JMP;
else
tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
#endif
WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
/* progs already running at this address */
err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
err = modify_fentry(tr, orig_flags, tr->cur_image->image,
im->image, lock_direct_mutex);
else
/* first time registering */
err = register_fentry(tr, im->image);
@@ -491,8 +533,15 @@ again:
tr->cur_image = im;
out:
/* If any error happens, restore previous flags */
if (err)
if (err) {
tr->flags = orig_flags;
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
if (bpf_trampoline_use_jmp(tr->flags))
tr->fops->flags |= FTRACE_OPS_FL_JMP;
else
tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
#endif
}
kfree(tlinks);
return err;
@@ -568,7 +617,8 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
if (err)
return err;
tr->extension_prog = link->link.prog;
return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP,
BPF_MOD_JUMP, NULL,
link->link.prog->bpf_func);
}
if (cnt >= BPF_MAX_TRAMP_LINKS)
@@ -616,6 +666,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
if (kind == BPF_TRAMP_REPLACE) {
WARN_ON_ONCE(!tr->extension_prog);
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
BPF_MOD_NOP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
guard(mutex)(&tgt_prog->aux->ext_mutex);



@@ -80,6 +80,12 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
If the architecture generates __patchable_function_entries sections
but does not want them included in the ftrace locations.
config HAVE_DYNAMIC_FTRACE_WITH_JMP
bool
help
If the architecture supports replacing the __fentry__ call with a
"jmp" instruction.
config HAVE_SYSCALL_TRACEPOINTS
bool
help
@@ -330,6 +336,12 @@ config DYNAMIC_FTRACE_WITH_ARGS
depends on DYNAMIC_FTRACE
depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
config DYNAMIC_FTRACE_WITH_JMP
def_bool y
depends on DYNAMIC_FTRACE
depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS
depends on HAVE_DYNAMIC_FTRACE_WITH_JMP
config FPROBE
bool "Kernel Function Probe (fprobe)"
depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC


@@ -2529,7 +2529,7 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
return run_ctx->entry_ip;
}
static int
static __always_inline int
kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
unsigned long entry_ip, struct ftrace_regs *fregs,
bool is_return, void *data)
@@ -3372,13 +3372,13 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc
* direct calls into all the specific callback implementations
* (copy_user_data_sleepable, copy_user_data_nofault, and so on)
*/
static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size,
static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size,
const void *unsafe_src,
copy_fn_t str_copy_fn,
struct task_struct *tsk)
{
struct bpf_dynptr_kern *dst;
u32 chunk_sz, off;
u64 chunk_sz, off;
void *dst_slice;
int cnt, err;
char buf[256];
@@ -3392,7 +3392,7 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do
return -E2BIG;
for (off = 0; off < size; off += chunk_sz - 1) {
chunk_sz = min_t(u32, sizeof(buf), size - off);
chunk_sz = min_t(u64, sizeof(buf), size - off);
/* Expect str_copy_fn to return count of copied bytes, including
* zero terminator. Next iteration increment off by chunk_sz - 1 to
* overwrite NUL.
@@ -3409,14 +3409,14 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do
return off;
}
static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff,
u32 size, const void *unsafe_src,
static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff,
u64 size, const void *unsafe_src,
copy_fn_t copy_fn, struct task_struct *tsk)
{
struct bpf_dynptr_kern *dst;
void *dst_slice;
char buf[256];
u32 off, chunk_sz;
u64 off, chunk_sz;
int err;
dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
@@ -3428,7 +3428,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32
return -E2BIG;
for (off = 0; off < size; off += chunk_sz) {
chunk_sz = min_t(u32, sizeof(buf), size - off);
chunk_sz = min_t(u64, sizeof(buf), size - off);
err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
if (err)
return err;
@@ -3514,58 +3514,58 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
return bpf_send_signal_common(sig, type, task, value);
}
__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off,
u32 size, const void __user *unsafe_ptr__ign)
__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_data_nofault, NULL);
}
__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off,
u32 size, const void *unsafe_ptr__ign)
__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
copy_kernel_data_nofault, NULL);
}
__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
u32 size, const void __user *unsafe_ptr__ign)
__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_str_nofault, NULL);
}
__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off,
u32 size, const void *unsafe_ptr__ign)
__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
copy_kernel_str_nofault, NULL);
}
__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off,
u32 size, const void __user *unsafe_ptr__ign)
__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_data_sleepable, NULL);
}
__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
u32 size, const void __user *unsafe_ptr__ign)
__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_str_sleepable, NULL);
}
__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off,
u32 size, const void __user *unsafe_ptr__ign,
__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_data_sleepable, tsk);
}
__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off,
u32 size, const void __user *unsafe_ptr__ign,
__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,

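With the offset and size parameters widened from u32 to u64, the dynptr copy kfuncs take 64-bit ranges throughout. A minimal BPF-side sketch of how one of them is typically used to fill a ring buffer record (map/program names are invented; the kfunc prototype would normally come from a shared header such as the selftests' bpf_kfuncs.h):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

extern int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
                                          u64 size, const void *unsafe_ptr) __ksym;

struct {
        __uint(type, BPF_MAP_TYPE_RINGBUF);
        __uint(max_entries, 1 << 20);
} strings SEC(".maps");

SEC("kprobe/do_sys_openat2")
int BPF_KPROBE(dump_filename, int dfd, const char *filename)
{
        struct bpf_dynptr dptr;

        if (bpf_ringbuf_reserve_dynptr(&strings, 4096, 0, &dptr)) {
                bpf_ringbuf_discard_dynptr(&dptr, 0);
                return 0;
        }
        bpf_probe_read_user_str_dynptr(&dptr, 0, 4096, filename);
        bpf_ringbuf_submit_dynptr(&dptr, 0);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";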

@@ -5951,7 +5951,8 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
del = __ftrace_lookup_ip(direct_functions, entry->ip);
if (del && del->direct == addr) {
if (del && ftrace_jmp_get(del->direct) ==
ftrace_jmp_get(addr)) {
remove_hash_entry(direct_functions, del);
kfree(del);
}
@@ -6016,8 +6017,15 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
if (ftrace_hash_empty(hash))
return -EINVAL;
/* This is a "raw" address, and this should never happen. */
if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
return -EINVAL;
mutex_lock(&direct_mutex);
if (ops->flags & FTRACE_OPS_FL_JMP)
addr = ftrace_jmp_set(addr);
/* Make sure requested entries are not already registered.. */
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
@@ -6138,6 +6146,13 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
lockdep_assert_held_once(&direct_mutex);
/* This is a "raw" address, and this should never happen. */
if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
return -EINVAL;
if (ops->flags & FTRACE_OPS_FL_JMP)
addr = ftrace_jmp_set(addr);
/* Enable the tmp_ops to have the same functions as the direct ops */
ftrace_ops_init(&tmp_ops);
tmp_ops.func_hash = ops->func_hash;


@@ -11,27 +11,8 @@
#define MAX_PHDR_CNT 256
struct freader {
void *buf;
u32 buf_sz;
int err;
union {
struct {
struct file *file;
struct folio *folio;
void *addr;
loff_t folio_off;
bool may_fault;
};
struct {
const char *data;
u64 data_sz;
};
};
};
static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
struct file *file, bool may_fault)
void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
struct file *file, bool may_fault)
{
memset(r, 0, sizeof(*r));
r->buf = buf;
@@ -40,7 +21,7 @@ static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
r->may_fault = may_fault;
}
static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz)
void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz)
{
memset(r, 0, sizeof(*r));
r->data = data;
@@ -92,7 +73,7 @@ static int freader_get_folio(struct freader *r, loff_t file_off)
return 0;
}
static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz)
const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz)
{
size_t folio_sz;
@@ -127,18 +108,21 @@ static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz)
*/
folio_sz = folio_size(r->folio);
if (file_off + sz > r->folio_off + folio_sz) {
int part_sz = r->folio_off + folio_sz - file_off;
u64 part_sz = r->folio_off + folio_sz - file_off, off;
/* copy the part that resides in the current folio */
memcpy(r->buf, r->addr + (file_off - r->folio_off), part_sz);
memcpy(r->buf, r->addr + file_off - r->folio_off, part_sz);
off = part_sz;
/* fetch next folio */
r->err = freader_get_folio(r, r->folio_off + folio_sz);
if (r->err)
return NULL;
/* copy the rest of requested data */
memcpy(r->buf + part_sz, r->addr, sz - part_sz);
while (off < sz) {
/* fetch next folio */
r->err = freader_get_folio(r, r->folio_off + folio_sz);
if (r->err)
return NULL;
folio_sz = folio_size(r->folio);
part_sz = min_t(u64, sz - off, folio_sz);
memcpy(r->buf + off, r->addr, part_sz);
off += part_sz;
}
return r->buf;
}
@@ -147,7 +131,7 @@ static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz)
return r->addr + (file_off - r->folio_off);
}
static void freader_cleanup(struct freader *r)
void freader_cleanup(struct freader *r)
{
if (!r->buf)
return; /* non-file-backed mode */

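With the static qualifiers dropped, the freader helpers can be reused outside this file. A minimal usage sketch (the function below is invented for illustration) that reads the 16-byte ELF identification from an in-memory image:

static int read_elf_ident(const char *image, u64 image_sz, unsigned char *out)
{
        struct freader r;
        const void *p;

        freader_init_from_mem(&r, image, image_sz);
        p = freader_fetch(&r, 0, 16);           /* e_ident bytes */
        if (!p) {
                freader_cleanup(&r);
                return r.err ?: -EINVAL;        /* fall back if err was not set */
        }
        memcpy(out, p, 16);
        freader_cleanup(&r);
        return 0;
}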

@@ -436,7 +436,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
static int bpf_test_finish(const union bpf_attr *kattr,
union bpf_attr __user *uattr, const void *data,
struct skb_shared_info *sinfo, u32 size,
struct skb_shared_info *sinfo, u32 size, u32 frag_size,
u32 retval, u32 duration)
{
void __user *data_out = u64_to_user_ptr(kattr->test.data_out);
@@ -453,7 +453,7 @@ static int bpf_test_finish(const union bpf_attr *kattr,
}
if (data_out) {
int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size;
int len = sinfo ? copy_size - frag_size : copy_size;
if (len < 0) {
err = -ENOSPC;
@@ -899,6 +899,12 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
/* cb is allowed */
if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb),
offsetof(struct __sk_buff, data_end)))
return -EINVAL;
/* data_end is allowed, but not copied to skb */
if (!range_is_zero(__skb, offsetofend(struct __sk_buff, data_end),
offsetof(struct __sk_buff, tstamp)))
return -EINVAL;
@@ -939,6 +945,11 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
if (__skb->gso_segs > GSO_MAX_SEGS)
return -EINVAL;
/* Currently GSO type is zero/unset. If this gets extended with
* a small list of accepted GSO types in future, the filter for
* an unset GSO type in bpf_clone_redirect() can be lifted.
*/
skb_shinfo(skb)->gso_segs = __skb->gso_segs;
skb_shinfo(skb)->gso_size = __skb->gso_size;
skb_shinfo(skb)->hwtstamps.hwtstamp = __skb->hwtstamp;
@@ -973,46 +984,39 @@ static struct proto bpf_dummy_proto = {
int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr)
{
bool is_l2 = false, is_direct_pkt_access = false;
bool is_l2 = false, is_direct_pkt_access = false, is_lwt = false;
u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
struct net *net = current->nsproxy->net_ns;
struct net_device *dev = net->loopback_dev;
u32 size = kattr->test.data_size_in;
u32 headroom = NET_SKB_PAD + NET_IP_ALIGN;
u32 linear_sz = kattr->test.data_size_in;
u32 repeat = kattr->test.repeat;
struct __sk_buff *ctx = NULL;
struct sk_buff *skb = NULL;
struct sock *sk = NULL;
u32 retval, duration;
int hh_len = ETH_HLEN;
struct sk_buff *skb;
struct sock *sk;
void *data;
void *data = NULL;
int ret;
if ((kattr->test.flags & ~BPF_F_TEST_SKB_CHECKSUM_COMPLETE) ||
kattr->test.cpu || kattr->test.batch_size)
return -EINVAL;
if (size < ETH_HLEN)
if (kattr->test.data_size_in < ETH_HLEN)
return -EINVAL;
data = bpf_test_init(kattr, kattr->test.data_size_in,
size, NET_SKB_PAD + NET_IP_ALIGN,
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
if (IS_ERR(data))
return PTR_ERR(data);
ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff));
if (IS_ERR(ctx)) {
kfree(data);
return PTR_ERR(ctx);
}
switch (prog->type) {
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
is_direct_pkt_access = true;
is_l2 = true;
fallthrough;
break;
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
case BPF_PROG_TYPE_LWT_XMIT:
is_lwt = true;
fallthrough;
case BPF_PROG_TYPE_CGROUP_SKB:
is_direct_pkt_access = true;
break;
@@ -1020,25 +1024,88 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
break;
}
ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff));
if (IS_ERR(ctx))
return PTR_ERR(ctx);
if (ctx) {
if (ctx->data_end > kattr->test.data_size_in || ctx->data || ctx->data_meta) {
ret = -EINVAL;
goto out;
}
if (ctx->data_end) {
/* Non-linear LWT test_run is unsupported for now. */
if (is_lwt) {
ret = -EINVAL;
goto out;
}
linear_sz = max(ETH_HLEN, ctx->data_end);
}
}
linear_sz = min_t(u32, linear_sz, PAGE_SIZE - headroom - tailroom);
data = bpf_test_init(kattr, linear_sz, linear_sz, headroom, tailroom);
if (IS_ERR(data)) {
ret = PTR_ERR(data);
data = NULL;
goto out;
}
sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1);
if (!sk) {
kfree(data);
kfree(ctx);
return -ENOMEM;
ret = -ENOMEM;
goto out;
}
sock_init_data(NULL, sk);
skb = slab_build_skb(data);
if (!skb) {
kfree(data);
kfree(ctx);
sk_free(sk);
return -ENOMEM;
ret = -ENOMEM;
goto out;
}
skb->sk = sk;
data = NULL; /* data released via kfree_skb */
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
__skb_put(skb, size);
__skb_put(skb, linear_sz);
if (unlikely(kattr->test.data_size_in > linear_sz)) {
void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
struct skb_shared_info *sinfo = skb_shinfo(skb);
u32 copied = linear_sz;
while (copied < kattr->test.data_size_in) {
struct page *page;
u32 data_len;
if (sinfo->nr_frags == MAX_SKB_FRAGS) {
ret = -ENOMEM;
goto out;
}
page = alloc_page(GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
goto out;
}
data_len = min_t(u32, kattr->test.data_size_in - copied,
PAGE_SIZE);
skb_fill_page_desc(skb, sinfo->nr_frags, page, 0, data_len);
if (copy_from_user(page_address(page), data_in + copied,
data_len)) {
ret = -EFAULT;
goto out;
}
skb->data_len += data_len;
skb->truesize += PAGE_SIZE;
skb->len += data_len;
copied += data_len;
}
}
if (ctx && ctx->ifindex > 1) {
dev = dev_get_by_index(net, ctx->ifindex);
@@ -1118,12 +1185,11 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
convert_skb_to___skb(skb, ctx);
size = skb->len;
/* bpf program can never convert linear skb to non-linear */
if (WARN_ON_ONCE(skb_is_nonlinear(skb)))
size = skb_headlen(skb);
ret = bpf_test_finish(kattr, uattr, skb->data, NULL, size, retval,
duration);
if (skb_is_nonlinear(skb))
/* bpf program can never convert linear skb to non-linear */
WARN_ON_ONCE(linear_sz == kattr->test.data_size_in);
ret = bpf_test_finish(kattr, uattr, skb->data, skb_shinfo(skb), skb->len,
skb->data_len, retval, duration);
if (!ret)
ret = bpf_ctx_finish(kattr, uattr, ctx,
sizeof(struct __sk_buff));
@@ -1131,7 +1197,9 @@ out:
if (dev && dev != net->loopback_dev)
dev_put(dev);
kfree_skb(skb);
sk_free(sk);
kfree(data);
if (sk)
sk_free(sk);
kfree(ctx);
return ret;
}
@@ -1329,7 +1397,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
goto out;
size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size;
ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size,
ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, sinfo->xdp_frags_size,
retval, duration);
if (!ret)
ret = bpf_ctx_finish(kattr, uattr, ctx,
@@ -1420,7 +1488,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
goto out;
ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL,
sizeof(flow_keys), retval, duration);
sizeof(flow_keys), 0, retval, duration);
if (!ret)
ret = bpf_ctx_finish(kattr, uattr, user_ctx,
sizeof(struct bpf_flow_keys));
@@ -1521,7 +1589,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat
user_ctx->cookie = sock_gen_cookie(ctx.selected_sk);
}
ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration);
ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, retval, duration);
if (!ret)
ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx));
@@ -1721,7 +1789,7 @@ int bpf_prog_test_run_nf(struct bpf_prog *prog,
if (ret)
goto out;
ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration);
ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, retval, duration);
out:
kfree(user_ctx);

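From userspace, the new non-linear path is reached simply by handing BPF_PROG_TEST_RUN more input than the requested linear area: ctx.data_end bounds the linear part and the remainder is copied into page fragments. A hedged libbpf sketch (prog_fd and the sizes are placeholders):

#include <bpf/bpf.h>
#include <linux/bpf.h>

static int run_nonlinear_skb(int prog_fd)
{
        static char pkt[8192];                          /* zeroed test payload */
        struct __sk_buff skb_ctx = { .data_end = 256 }; /* linear part: 256 bytes */

        LIBBPF_OPTS(bpf_test_run_opts, opts,
                .data_in = pkt,
                .data_size_in = sizeof(pkt),    /* the rest becomes skb frags */
                .ctx_in = &skb_ctx,
                .ctx_size_in = sizeof(skb_ctx),
        );

        return bpf_prog_test_run_opts(prog_fd, &opts);
}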

@@ -50,16 +50,14 @@ void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_local_storage *sk_storage;
migrate_disable();
rcu_read_lock();
rcu_read_lock_dont_migrate();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage)
goto out;
bpf_local_storage_destroy(sk_storage);
out:
rcu_read_unlock();
migrate_enable();
rcu_read_unlock_migrate();
}
static void bpf_sk_storage_map_free(struct bpf_map *map)
@@ -138,7 +136,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
{
struct bpf_local_storage_elem *copy_selem;
copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC);
copy_selem = bpf_selem_alloc(smap, newsk, NULL, false, GFP_ATOMIC);
if (!copy_selem)
return NULL;
@@ -161,8 +159,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
migrate_disable();
rcu_read_lock();
rcu_read_lock_dont_migrate();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage || hlist_empty(&sk_storage->list))
@@ -199,7 +196,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
} else {
ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
if (ret) {
bpf_selem_free(copy_selem, smap, true);
bpf_selem_free(copy_selem, true);
atomic_sub(smap->elem_size,
&newsk->sk_omem_alloc);
bpf_map_put(map);
@@ -213,8 +210,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
}
out:
rcu_read_unlock();
migrate_enable();
rcu_read_unlock_migrate();
/* In case of an error, don't free anything explicitly here, the
* caller is responsible to call bpf_sk_storage_free.


@@ -2458,6 +2458,13 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return -EINVAL;
/* BPF test infra's convert___skb_to_skb() can create type-less
* GSO packets. gso_features_check() will detect this as a bad
* offload. However, lets not leak them out in the first place.
*/
if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type))
return -EBADMSG;
dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
if (unlikely(!dev))
return -EINVAL;
@@ -6422,9 +6429,12 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
*/
if (skb_is_gso(skb)) {
ret = BPF_MTU_CHK_RET_SUCCESS;
if (flags & BPF_MTU_CHK_SEGS &&
!skb_gso_validate_network_len(skb, mtu))
ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
if (flags & BPF_MTU_CHK_SEGS) {
if (!skb_transport_header_was_set(skb))
return -EINVAL;
if (!skb_gso_validate_network_len(skb, mtu))
ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
}
}
out:
*mtu_len = mtu;

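For GSO packets, BPF_MTU_CHK_SEGS now fails with -EINVAL when no transport header has been set rather than silently succeeding, so callers should treat negative returns as "could not check". A small hedged sketch of the caller-side pattern in a tc program (names are illustrative):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int mtu_guard(struct __sk_buff *skb)
{
        __u32 mtu_len = 0;
        int ret = bpf_check_mtu(skb, 0, &mtu_len, 0, BPF_MTU_CHK_SEGS);

        if (ret < 0)                            /* e.g. -EINVAL: no transport header set */
                return TC_ACT_OK;
        if (ret == BPF_MTU_CHK_RET_SEGS_TOOBIG)
                return TC_ACT_SHOT;
        return TC_ACT_OK;
}

char LICENSE[] SEC("license") = "GPL";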

@@ -112,7 +112,7 @@ function start_hbm () {
processArgs () {
for i in $args ; do
case $i in
# Support for upcomming ingress rate limiting
# Support for upcoming ingress rate limiting
#in) # support for upcoming ingress rate limiting
# dir="-i"
# dir_name="in"


@@ -5,7 +5,7 @@
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* Example program for Host Bandwidth Managment
* Example program for Host Bandwidth Management
*
* This program loads a cgroup skb BPF program to enforce cgroup output
* (egress) or input (ingress) bandwidth limits.
@@ -24,7 +24,7 @@
* beyond the rate limit specified while there is available
* bandwidth. Current implementation assumes there is only
* NIC (eth0), but can be extended to support multiple NICs.
* Currrently only supported for egress.
* Currently only supported for egress.
* -h Print this info
* prog BPF program file name. Name defaults to hbm_out_kern.o
*/


@@ -5,7 +5,7 @@
* License as published by the Free Software Foundation.
*
* BPF program to set congestion control to dctcp when both hosts are
* in the same datacenter (as deteremined by IPv6 prefix).
* in the same datacenter (as determined by IPv6 prefix).
*
* Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
*/


@@ -20,7 +20,7 @@ SEC("kprobe.multi/__netif_receive_skb_core*")
int bpf_prog1(struct pt_regs *ctx)
{
/* attaches to kprobe __netif_receive_skb_core,
* looks for packets on loobpack device and prints them
* looks for packets on loopback device and prints them
* (wildcard is used for avoiding symbol mismatch due to optimization)
*/
char devname[IFNAMSIZ];


@@ -32,7 +32,7 @@ FEATURE_TESTS = libbfd disassembler-four-args disassembler-init-styled
FEATURE_DISPLAY = libbfd
check_feat := 1
NON_CHECK_FEAT_TARGETS := clean bpftool_clean runqslower_clean resolve_btfids_clean
NON_CHECK_FEAT_TARGETS := clean bpftool_clean resolve_btfids_clean
ifdef MAKECMDGOALS
ifeq ($(filter-out $(NON_CHECK_FEAT_TARGETS),$(MAKECMDGOALS)),)
check_feat := 0
@@ -70,7 +70,7 @@ $(OUTPUT)%.lex.o: $(OUTPUT)%.lex.c
PROGS = $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm
all: $(PROGS) bpftool runqslower
all: $(PROGS) bpftool
$(OUTPUT)bpf_jit_disasm: CFLAGS += -DPACKAGE='bpf_jit_disasm'
$(OUTPUT)bpf_jit_disasm: $(OUTPUT)bpf_jit_disasm.o
@@ -86,7 +86,7 @@ $(OUTPUT)bpf_exp.lex.c: $(OUTPUT)bpf_exp.yacc.c
$(OUTPUT)bpf_exp.yacc.o: $(OUTPUT)bpf_exp.yacc.c
$(OUTPUT)bpf_exp.lex.o: $(OUTPUT)bpf_exp.lex.c
clean: bpftool_clean runqslower_clean resolve_btfids_clean
clean: bpftool_clean resolve_btfids_clean
$(call QUIET_CLEAN, bpf-progs)
$(Q)$(RM) -r -- $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \
$(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.*
@@ -112,12 +112,6 @@ bpftool_install:
bpftool_clean:
$(call descend,bpftool,clean)
runqslower:
$(call descend,runqslower)
runqslower_clean:
$(call descend,runqslower,clean)
resolve_btfids:
$(call descend,resolve_btfids)
@@ -125,5 +119,4 @@ resolve_btfids_clean:
$(call descend,resolve_btfids,clean)
.PHONY: all install clean bpftool bpftool_install bpftool_clean \
runqslower runqslower_clean \
resolve_btfids resolve_btfids_clean


@@ -55,7 +55,8 @@ MAP COMMANDS
| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage**
| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** }
| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena**
| | **insn_array** }
DESCRIPTION
===========


@@ -590,7 +590,7 @@ static int btf_dumper_do_type(const struct btf_dumper *d, __u32 type_id,
case BTF_KIND_DATASEC:
return btf_dumper_datasec(d, type_id, data);
default:
jsonw_printf(d->jw, "(unsupported-kind");
jsonw_printf(d->jw, "(unsupported-kind)");
return -EINVAL;
}
}


@@ -1477,7 +1477,8 @@ static int do_help(int argc, char **argv)
" devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
" cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
" queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n"
" task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena }\n"
" task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena |\n"
" insn_array }\n"
" " HELP_SPEC_OPTIONS " |\n"
" {-f|--bpffs} | {-n|--nomount} }\n"
"",


@@ -28,6 +28,12 @@
#define OPEN_SSL_ERR_BUF_LEN 256
/* For OpenSSL < 3, use ERR_get_error_line_data (deprecated in 3.0) */
#if !defined(OPENSSL_VERSION_MAJOR) || (OPENSSL_VERSION_MAJOR < 3)
#define ERR_get_error_all(file, line, func, data, flags) \
ERR_get_error_line_data(file, line, data, flags)
#endif
static void display_openssl_errors(int l)
{
char buf[OPEN_SSL_ERR_BUF_LEN];


@@ -1,2 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-only
/.output


@@ -1,91 +0,0 @@
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
include ../../scripts/Makefile.include
OUTPUT ?= $(abspath .output)/
BPFTOOL_OUTPUT := $(OUTPUT)bpftool/
DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)bootstrap/bpftool
BPFTOOL ?= $(DEFAULT_BPFTOOL)
BPF_TARGET_ENDIAN ?= --target=bpf
LIBBPF_SRC := $(abspath ../../lib/bpf)
BPFOBJ_OUTPUT := $(OUTPUT)libbpf/
BPFOBJ := $(BPFOBJ_OUTPUT)libbpf.a
BPF_DESTDIR := $(BPFOBJ_OUTPUT)
BPF_INCLUDE := $(BPF_DESTDIR)/include
INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../include/uapi)
CFLAGS := -g -Wall $(CLANG_CROSS_FLAGS)
CFLAGS += $(EXTRA_CFLAGS)
LDFLAGS += $(EXTRA_LDFLAGS)
LDLIBS += -lelf -lz
# Try to detect best kernel BTF source
KERNEL_REL := $(shell uname -r)
VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux) \
$(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \
../../../vmlinux /sys/kernel/btf/vmlinux \
/boot/vmlinux-$(KERNEL_REL)
VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \
$(wildcard $(VMLINUX_BTF_PATHS))))
ifneq ($(V),1)
MAKEFLAGS += --no-print-directory
submake_extras := feature_display=0
endif
.DELETE_ON_ERROR:
.PHONY: all clean runqslower libbpf_hdrs
all: runqslower
runqslower: $(OUTPUT)/runqslower
clean:
$(call QUIET_CLEAN, runqslower)
$(Q)$(RM) -r $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT)
$(Q)$(RM) $(OUTPUT)*.o $(OUTPUT)*.d
$(Q)$(RM) $(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h
$(Q)$(RM) $(OUTPUT)runqslower
$(Q)$(RM) -r .output
libbpf_hdrs: $(BPFOBJ)
$(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ)
$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@
$(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \
$(OUTPUT)/runqslower.bpf.o | libbpf_hdrs
$(OUTPUT)/runqslower.bpf.o: $(OUTPUT)/vmlinux.h runqslower.h | libbpf_hdrs
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(BPFTOOL)
$(QUIET_GEN)$(BPFTOOL) gen skeleton $< > $@
$(OUTPUT)/%.bpf.o: %.bpf.c $(BPFOBJ) | $(OUTPUT)
$(QUIET_GEN)$(CLANG) -g -O2 $(BPF_TARGET_ENDIAN) $(INCLUDES) \
-c $(filter %.c,$^) -o $@ && \
$(LLVM_STRIP) -g $@
$(OUTPUT)/%.o: %.c | $(OUTPUT)
$(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
$(OUTPUT) $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT):
$(QUIET_MKDIR)mkdir -p $@
$(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL)
ifeq ($(VMLINUX_H),)
$(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \
echo "Couldn't find kernel BTF; set VMLINUX_BTF to" \
"specify its location." >&2; \
exit 1;\
fi
$(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@
else
$(Q)cp "$(VMLINUX_H)" $@
endif
$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OUTPUT)
$(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) \
DESTDIR=$(BPFOBJ_OUTPUT) prefix= $(abspath $@) install_headers
$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT)
$(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT) bootstrap


@@ -1,106 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Facebook
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "runqslower.h"
#define TASK_RUNNING 0
#define BPF_F_CURRENT_CPU 0xffffffffULL
const volatile __u64 min_us = 0;
const volatile pid_t targ_pid = 0;
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, u64);
} start SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u32));
} events SEC(".maps");
/* record enqueue timestamp */
__always_inline
static int trace_enqueue(struct task_struct *t)
{
u32 pid = t->pid;
u64 *ptr;
if (!pid || (targ_pid && targ_pid != pid))
return 0;
ptr = bpf_task_storage_get(&start, t, 0,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (!ptr)
return 0;
*ptr = bpf_ktime_get_ns();
return 0;
}
SEC("tp_btf/sched_wakeup")
int handle__sched_wakeup(u64 *ctx)
{
/* TP_PROTO(struct task_struct *p) */
struct task_struct *p = (void *)ctx[0];
return trace_enqueue(p);
}
SEC("tp_btf/sched_wakeup_new")
int handle__sched_wakeup_new(u64 *ctx)
{
/* TP_PROTO(struct task_struct *p) */
struct task_struct *p = (void *)ctx[0];
return trace_enqueue(p);
}
SEC("tp_btf/sched_switch")
int handle__sched_switch(u64 *ctx)
{
/* TP_PROTO(bool preempt, struct task_struct *prev,
* struct task_struct *next)
*/
struct task_struct *prev = (struct task_struct *)ctx[1];
struct task_struct *next = (struct task_struct *)ctx[2];
struct runq_event event = {};
u64 *tsp, delta_us;
u32 pid;
/* ivcsw: treat like an enqueue event and store timestamp */
if (prev->__state == TASK_RUNNING)
trace_enqueue(prev);
pid = next->pid;
/* For pid mismatch, save a bpf_task_storage_get */
if (!pid || (targ_pid && targ_pid != pid))
return 0;
/* fetch timestamp and calculate delta */
tsp = bpf_task_storage_get(&start, next, 0, 0);
if (!tsp)
return 0; /* missed enqueue */
delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
if (min_us && delta_us <= min_us)
return 0;
event.pid = pid;
event.delta_us = delta_us;
bpf_get_current_comm(&event.task, sizeof(event.task));
/* output */
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
&event, sizeof(event));
bpf_task_storage_delete(&start, next);
return 0;
}
char LICENSE[] SEC("license") = "GPL";


@@ -1,171 +0,0 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
// Copyright (c) 2019 Facebook
#include <argp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <bpf/libbpf.h>
#include <bpf/bpf.h>
#include "runqslower.h"
#include "runqslower.skel.h"
struct env {
pid_t pid;
__u64 min_us;
bool verbose;
} env = {
.min_us = 10000,
};
const char *argp_program_version = "runqslower 0.1";
const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
const char argp_program_doc[] =
"runqslower Trace long process scheduling delays.\n"
" For Linux, uses eBPF, BPF CO-RE, libbpf, BTF.\n"
"\n"
"This script traces high scheduling delays between tasks being\n"
"ready to run and them running on CPU after that.\n"
"\n"
"USAGE: runqslower [-p PID] [min_us]\n"
"\n"
"EXAMPLES:\n"
" runqslower # trace run queue latency higher than 10000 us (default)\n"
" runqslower 1000 # trace run queue latency higher than 1000 us\n"
" runqslower -p 123 # trace pid 123 only\n";
static const struct argp_option opts[] = {
{ "pid", 'p', "PID", 0, "Process PID to trace"},
{ "verbose", 'v', NULL, 0, "Verbose debug output" },
{},
};
static error_t parse_arg(int key, char *arg, struct argp_state *state)
{
static int pos_args;
int pid;
long long min_us;
switch (key) {
case 'v':
env.verbose = true;
break;
case 'p':
errno = 0;
pid = strtol(arg, NULL, 10);
if (errno || pid <= 0) {
fprintf(stderr, "Invalid PID: %s\n", arg);
argp_usage(state);
}
env.pid = pid;
break;
case ARGP_KEY_ARG:
if (pos_args++) {
fprintf(stderr,
"Unrecognized positional argument: %s\n", arg);
argp_usage(state);
}
errno = 0;
min_us = strtoll(arg, NULL, 10);
if (errno || min_us <= 0) {
fprintf(stderr, "Invalid delay (in us): %s\n", arg);
argp_usage(state);
}
env.min_us = min_us;
break;
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}
int libbpf_print_fn(enum libbpf_print_level level,
const char *format, va_list args)
{
if (level == LIBBPF_DEBUG && !env.verbose)
return 0;
return vfprintf(stderr, format, args);
}
void handle_event(void *ctx, int cpu, void *data, __u32 data_sz)
{
const struct runq_event *e = data;
struct tm *tm;
char ts[32];
time_t t;
time(&t);
tm = localtime(&t);
strftime(ts, sizeof(ts), "%H:%M:%S", tm);
printf("%-8s %-16s %-6d %14llu\n", ts, e->task, e->pid, e->delta_us);
}
void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt)
{
printf("Lost %llu events on CPU #%d!\n", lost_cnt, cpu);
}
int main(int argc, char **argv)
{
static const struct argp argp = {
.options = opts,
.parser = parse_arg,
.doc = argp_program_doc,
};
struct perf_buffer *pb = NULL;
struct runqslower_bpf *obj;
int err;
err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
if (err)
return err;
libbpf_set_print(libbpf_print_fn);
/* Use libbpf 1.0 API mode */
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
obj = runqslower_bpf__open();
if (!obj) {
fprintf(stderr, "failed to open and/or load BPF object\n");
return 1;
}
/* initialize global data (filtering options) */
obj->rodata->targ_pid = env.pid;
obj->rodata->min_us = env.min_us;
err = runqslower_bpf__load(obj);
if (err) {
fprintf(stderr, "failed to load BPF object: %d\n", err);
goto cleanup;
}
err = runqslower_bpf__attach(obj);
if (err) {
fprintf(stderr, "failed to attach BPF programs\n");
goto cleanup;
}
printf("Tracing run queue latency higher than %llu us\n", env.min_us);
printf("%-8s %-16s %-6s %14s\n", "TIME", "COMM", "PID", "LAT(us)");
pb = perf_buffer__new(bpf_map__fd(obj->maps.events), 64,
handle_event, handle_lost_events, NULL, NULL);
err = libbpf_get_error(pb);
if (err) {
pb = NULL;
fprintf(stderr, "failed to open perf buffer: %d\n", err);
goto cleanup;
}
while ((err = perf_buffer__poll(pb, 100)) >= 0)
;
printf("Error polling perf buffer: %d\n", err);
cleanup:
perf_buffer__free(pb);
runqslower_bpf__destroy(obj);
return err != 0;
}

View File

@@ -1,13 +0,0 @@
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
#ifndef __RUNQSLOWER_H
#define __RUNQSLOWER_H
#define TASK_COMM_LEN 16
struct runq_event {
char task[TASK_COMM_LEN];
__u64 delta_us;
pid_t pid;
};
#endif /* __RUNQSLOWER_H */

View File

@@ -1026,6 +1026,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_USER_RINGBUF,
BPF_MAP_TYPE_CGRP_STORAGE,
BPF_MAP_TYPE_ARENA,
BPF_MAP_TYPE_INSN_ARRAY,
__MAX_BPF_MAP_TYPE
};
@@ -1430,6 +1431,9 @@ enum {
/* Do not translate kernel bpf_arena pointers to user pointers */
BPF_F_NO_USER_CONV = (1U << 18),
/* Enable BPF ringbuf overwrite mode */
BPF_F_RB_OVERWRITE = (1U << 19),
};
/* Flags for BPF_PROG_QUERY. */
@@ -5618,7 +5622,7 @@ union bpf_attr {
* Return
* *sk* if casting is valid, or **NULL** otherwise.
*
* long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr)
* long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr)
* Description
* Get a dynptr to local memory *data*.
*
@@ -5661,7 +5665,7 @@ union bpf_attr {
* Return
* Nothing. Always succeeds.
*
* long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
* long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags)
* Description
* Read *len* bytes from *src* into *dst*, starting from *offset*
* into *src*.
@@ -5671,7 +5675,7 @@ union bpf_attr {
* of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
* *flags* is not 0.
*
* long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
* long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags)
* Description
* Write *len* bytes from *src* into *dst*, starting from *offset*
* into *dst*.
@@ -5692,7 +5696,7 @@ union bpf_attr {
* is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
* other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
*
* void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
* void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len)
* Description
* Get a pointer to the underlying dynptr data.
*
@@ -6231,6 +6235,7 @@ enum {
BPF_RB_RING_SIZE = 1,
BPF_RB_CONS_POS = 2,
BPF_RB_PROD_POS = 3,
BPF_RB_OVERWRITE_POS = 4,
};
/* BPF ring buffer constants */
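Not part of the patch: a minimal BPF-side sketch of how the new position is expected to be read, assuming BPF_RB_OVERWRITE_POS is accepted by bpf_ringbuf_query() like the existing BPF_RB_* query flags, that an overwrite-mode ring buffer can be declared with BPF_F_RB_OVERWRITE in map_flags, and that the tracepoint used for triggering is arbitrary.

/* Sketch only: query positions of an overwrite-mode ring buffer. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(map_flags, BPF_F_RB_OVERWRITE);
	__uint(max_entries, 512 * 1024);
} rb SEC(".maps");

__u64 overwrite_pos, prod_pos;

SEC("tp/syscalls/sys_enter_getpgid")
int probe_positions(void *ctx)
{
	/* same query interface as BPF_RB_CONS_POS / BPF_RB_PROD_POS */
	overwrite_pos = bpf_ringbuf_query(&rb, BPF_RB_OVERWRITE_POS);
	prod_pos = bpf_ringbuf_query(&rb, BPF_RB_PROD_POS);
	return 0;
}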
@@ -7645,4 +7650,24 @@ enum bpf_kfunc_flags {
BPF_F_PAD_ZEROS = (1ULL << 0),
};
/*
* Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type.
*
* Before the map is used the orig_off field should point to an
* instruction inside the program being loaded. The other fields
* must be set to 0.
*
* After the program is loaded, the xlated_off will be adjusted
* by the verifier to point to the index of the original instruction
* in the xlated program. If the instruction is deleted, it will
* be set to (u32)-1. The jitted_off will be set to the corresponding
* offset in the jitted image of the program.
*/
struct bpf_insn_array_value {
__u32 orig_off;
__u32 xlated_off;
__u32 jitted_off;
__u32 :32;
};
#endif /* _UAPI__LINUX_BPF_H__ */
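Not part of the header: a minimal userspace sketch of the lifecycle described in the comment above, mirroring the insn_array selftests further down. Each entry's orig_off is set before load, the map is frozen, and its fd is passed via fd_array at BPF_PROG_LOAD time so the verifier can fill in xlated_off/jitted_off. Error handling is kept minimal.

#include <unistd.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

/* Sketch only: build a frozen insn_array map for 'n' instruction offsets. */
static int make_insn_array(const __u32 *orig_offs, __u32 n)
{
	struct bpf_insn_array_value val = {};
	int map_fd;
	__u32 i;

	map_fd = bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, "insn_array",
				sizeof(__u32), sizeof(val), n, NULL);
	if (map_fd < 0)
		return map_fd;

	for (i = 0; i < n; i++) {
		val.orig_off = orig_offs[i];	/* index into the original program */
		if (bpf_map_update_elem(map_fd, &i, &val, 0))
			goto err;
	}
	if (bpf_map_freeze(map_fd))	/* map must be read-only before load */
		goto err;
	return map_fd;	/* pass via bpf_prog_load_opts.fd_array */
err:
	close(map_fd);
	return -1;
}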

View File

@@ -154,7 +154,7 @@ int bump_rlimit_memlock(void)
memlock_bumped = true;
/* zero memlock_rlim_max disables auto-bumping RLIMIT_MEMLOCK */
/* zero memlock_rlim disables auto-bumping RLIMIT_MEMLOCK */
if (memlock_rlim == 0)
return 0;

View File

@@ -1061,7 +1061,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b
if (base_btf) {
btf->base_btf = base_btf;
btf->start_id = btf__type_cnt(base_btf);
btf->start_str_off = base_btf->hdr->str_len;
btf->start_str_off = base_btf->hdr->str_len + base_btf->start_str_off;
}
if (is_mmap) {
@@ -3901,6 +3901,20 @@ err_out:
return err;
}
/*
* Calculate type signature hash of TYPEDEF, ignoring referenced type IDs,
* as referenced type IDs equivalence is established separately during type
* graph equivalence check algorithm.
*/
static long btf_hash_typedef(struct btf_type *t)
{
long h;
h = hash_combine(0, t->name_off);
h = hash_combine(h, t->info);
return h;
}
static long btf_hash_common(struct btf_type *t)
{
long h;
@@ -3918,6 +3932,13 @@ static bool btf_equal_common(struct btf_type *t1, struct btf_type *t2)
t1->size == t2->size;
}
/* Check structural compatibility of two TYPEDEF. */
static bool btf_equal_typedef(struct btf_type *t1, struct btf_type *t2)
{
return t1->name_off == t2->name_off &&
t1->info == t2->info;
}
/* Calculate type signature hash of INT or TAG. */
static long btf_hash_int_decl_tag(struct btf_type *t)
{
@@ -4844,13 +4865,30 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d)
}
}
static inline long btf_hash_by_kind(struct btf_type *t, __u16 kind)
{
if (kind == BTF_KIND_TYPEDEF)
return btf_hash_typedef(t);
else
return btf_hash_struct(t);
}
static inline bool btf_equal_by_kind(struct btf_type *t1, struct btf_type *t2, __u16 kind)
{
if (kind == BTF_KIND_TYPEDEF)
return btf_equal_typedef(t1, t2);
else
return btf_shallow_equal_struct(t1, t2);
}
/*
* Deduplicate struct/union types.
* Deduplicate struct/union and typedef types.
*
* For each struct/union type its type signature hash is calculated, taking
* into account type's name, size, number, order and names of fields, but
* ignoring type ID's referenced from fields, because they might not be deduped
* completely until after reference types deduplication phase. This type hash
* completely until after reference types deduplication phase. For each typedef
* type, the hash is computed based on the types name and size. This type hash
* is used to iterate over all potential canonical types, sharing same hash.
* For each canonical candidate we check whether type graphs that they form
* (through referenced types in fields and so on) are equivalent using algorithm
@@ -4882,18 +4920,20 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id)
t = btf_type_by_id(d->btf, type_id);
kind = btf_kind(t);
if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION)
if (kind != BTF_KIND_STRUCT &&
kind != BTF_KIND_UNION &&
kind != BTF_KIND_TYPEDEF)
return 0;
h = btf_hash_struct(t);
h = btf_hash_by_kind(t, kind);
for_each_dedup_cand(d, hash_entry, h) {
__u32 cand_id = hash_entry->value;
int eq;
/*
* Even though btf_dedup_is_equiv() checks for
* btf_shallow_equal_struct() internally when checking two
* structs (unions) for equivalence, we need to guard here
* btf_equal_by_kind() internally when checking two
* structs (unions) or typedefs for equivalence, we need to guard here
* from picking matching FWD type as a dedup candidate.
* This can happen due to hash collision. In such case just
* relying on btf_dedup_is_equiv() would lead to potentially
@@ -4901,7 +4941,7 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id)
* FWD and compatible STRUCT/UNION are considered equivalent.
*/
cand_type = btf_type_by_id(d->btf, cand_id);
if (!btf_shallow_equal_struct(t, cand_type))
if (!btf_equal_by_kind(t, cand_type, kind))
continue;
btf_dedup_clear_hypot_map(d);
@@ -4939,18 +4979,18 @@ static int btf_dedup_struct_types(struct btf_dedup *d)
/*
* Deduplicate reference type.
*
* Once all primitive and struct/union types got deduplicated, we can easily
* Once all primitive, struct/union and typedef types got deduplicated, we can easily
* deduplicate all other (reference) BTF types. This is done in two steps:
*
* 1. Resolve all referenced type IDs into their canonical type IDs. This
* resolution can be done either immediately for primitive or struct/union types
* (because they were deduped in previous two phases) or recursively for
* resolution can be done either immediately for primitive, struct/union, and typedef
* types (because they were deduped in previous two phases) or recursively for
* reference types. Recursion will always terminate at either primitive or
* struct/union type, at which point we can "unwind" chain of reference types
* one by one. There is no danger of encountering cycles because in C type
* system the only way to form type cycle is through struct/union, so any chain
* of reference types, even those taking part in a type cycle, will inevitably
* reach struct/union at some point.
* struct/union and typedef types, at which point we can "unwind" chain of reference
* types one by one. There is no danger of encountering cycles in C, as the only way to
* form a type cycle is through struct or union types. Go can form such cycles through
* typedef. Thus, any chain of reference types, even those taking part in a type cycle,
* will inevitably reach a struct/union or typedef type at some point.
*
* 2. Once all referenced type IDs are resolved into canonical ones, BTF type
* becomes "stable", in the sense that no further deduplication will cause
@@ -4982,7 +5022,6 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id)
case BTF_KIND_VOLATILE:
case BTF_KIND_RESTRICT:
case BTF_KIND_PTR:
case BTF_KIND_TYPEDEF:
case BTF_KIND_FUNC:
case BTF_KIND_TYPE_TAG:
ref_type_id = btf_dedup_ref_type(d, t->type);
@@ -5818,7 +5857,7 @@ void btf_set_base_btf(struct btf *btf, const struct btf *base_btf)
{
btf->base_btf = (struct btf *)base_btf;
btf->start_id = btf__type_cnt(base_btf);
btf->start_str_off = base_btf->hdr->str_len;
btf->start_str_off = base_btf->hdr->str_len + base_btf->start_str_off;
}
int btf__relocate(struct btf *btf, const struct btf *base_btf)

View File

@@ -94,6 +94,7 @@ LIBBPF_API struct btf *btf__new_empty(void);
* @brief **btf__new_empty_split()** creates an unpopulated BTF object from an
* ELF BTF section except with a base BTF on top of which split BTF should be
* based
* @param base_btf base BTF object
* @return new BTF object instance which has to be eventually freed with
* **btf__free()**
*
@@ -115,6 +116,10 @@ LIBBPF_API struct btf *btf__new_empty_split(struct btf *base_btf);
* When that split BTF is loaded against a (possibly changed) base, this
* distilled base BTF will help update references to that (possibly changed)
* base BTF.
* @param src_btf source split BTF object
* @param new_base_btf pointer to where the new base BTF object pointer will be stored
* @param new_split_btf pointer to where the new split BTF object pointer will be stored
* @return 0 on success; negative error code, otherwise
*
* Both the new split and its associated new base BTF must be freed by
* the caller.
@@ -264,6 +269,9 @@ LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts);
* to base BTF kinds, and verify those references are compatible with
* *base_btf*; if they are, *btf* is adjusted such that is re-parented to
* *base_btf* and type ids and strings are adjusted to accommodate this.
* @param btf split BTF object to relocate
* @param base_btf base BTF object
* @return 0 on success; negative error code, otherwise
*
* If successful, 0 is returned and **btf** now has **base_btf** as its
* base.

View File

@@ -190,6 +190,7 @@ static const char * const map_type_name[] = {
[BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf",
[BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage",
[BPF_MAP_TYPE_ARENA] = "arena",
[BPF_MAP_TYPE_INSN_ARRAY] = "insn_array",
};
static const char * const prog_type_name[] = {
@@ -369,6 +370,7 @@ enum reloc_type {
RELO_EXTERN_CALL,
RELO_SUBPROG_ADDR,
RELO_CORE,
RELO_INSN_ARRAY,
};
struct reloc_desc {
@@ -379,7 +381,16 @@ struct reloc_desc {
struct {
int map_idx;
int sym_off;
int ext_idx;
/*
* The following two fields can be unionized, as the
* ext_idx field is used for extern symbols, and the
* sym_size is used for jump tables, which are never
* extern
*/
union {
int ext_idx;
int sym_size;
};
};
};
};
@@ -421,6 +432,11 @@ struct bpf_sec_def {
libbpf_prog_attach_fn_t prog_attach_fn;
};
struct bpf_light_subprog {
__u32 sec_insn_off;
__u32 sub_insn_off;
};
/*
* bpf_prog should be a better name but it has been used in
* linux/filter.h.
@@ -494,6 +510,9 @@ struct bpf_program {
__u32 line_info_cnt;
__u32 prog_flags;
__u8 hash[SHA256_DIGEST_LENGTH];
struct bpf_light_subprog *subprogs;
__u32 subprog_cnt;
};
struct bpf_struct_ops {
@@ -667,6 +686,7 @@ struct elf_state {
int symbols_shndx;
bool has_st_ops;
int arena_data_shndx;
int jumptables_data_shndx;
};
struct usdt_manager;
@@ -738,6 +758,16 @@ struct bpf_object {
void *arena_data;
size_t arena_data_sz;
void *jumptables_data;
size_t jumptables_data_sz;
struct {
struct bpf_program *prog;
int sym_off;
int fd;
} *jumptable_maps;
size_t jumptable_map_cnt;
struct kern_feature_cache *feat_cache;
char *token_path;
int token_fd;
@@ -764,6 +794,7 @@ void bpf_program__unload(struct bpf_program *prog)
zfree(&prog->func_info);
zfree(&prog->line_info);
zfree(&prog->subprogs);
}
static void bpf_program__exit(struct bpf_program *prog)
@@ -2996,7 +3027,7 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict,
scn = elf_sec_by_idx(obj, obj->efile.btf_maps_shndx);
data = elf_sec_data(obj, scn);
if (!scn || !data) {
if (!data) {
pr_warn("elf: failed to get %s map definitions for %s\n",
MAPS_ELF_SEC, obj->path);
return -EINVAL;
@@ -3942,6 +3973,13 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
} else if (strcmp(name, ARENA_SEC) == 0) {
obj->efile.arena_data = data;
obj->efile.arena_data_shndx = idx;
} else if (strcmp(name, JUMPTABLES_SEC) == 0) {
obj->jumptables_data = malloc(data->d_size);
if (!obj->jumptables_data)
return -ENOMEM;
memcpy(obj->jumptables_data, data->d_buf, data->d_size);
obj->jumptables_data_sz = data->d_size;
obj->efile.jumptables_data_shndx = idx;
} else {
pr_info("elf: skipping unrecognized data section(%d) %s\n",
idx, name);
@@ -4634,6 +4672,16 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
return 0;
}
/* jump table data relocation */
if (shdr_idx == obj->efile.jumptables_data_shndx) {
reloc_desc->type = RELO_INSN_ARRAY;
reloc_desc->insn_idx = insn_idx;
reloc_desc->map_idx = -1;
reloc_desc->sym_off = sym->st_value;
reloc_desc->sym_size = sym->st_size;
return 0;
}
/* generic map reference relocation */
if (type == LIBBPF_MAP_UNSPEC) {
if (!bpf_object__shndx_is_maps(obj, shdr_idx)) {
@@ -6144,6 +6192,157 @@ static void poison_kfunc_call(struct bpf_program *prog, int relo_idx,
insn->imm = POISON_CALL_KFUNC_BASE + ext_idx;
}
static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off)
{
size_t i;
for (i = 0; i < obj->jumptable_map_cnt; i++) {
/*
* It might happen that the same offset is used by two different
* programs (as jump tables can be identical). However, different
* programs must get different maps.
*/
if (obj->jumptable_maps[i].sym_off == sym_off &&
obj->jumptable_maps[i].prog == prog)
return obj->jumptable_maps[i].fd;
}
return -ENOENT;
}
static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off, int map_fd)
{
size_t cnt = obj->jumptable_map_cnt;
size_t size = sizeof(obj->jumptable_maps[0]);
void *tmp;
tmp = libbpf_reallocarray(obj->jumptable_maps, cnt + 1, size);
if (!tmp)
return -ENOMEM;
obj->jumptable_maps = tmp;
obj->jumptable_maps[cnt].prog = prog;
obj->jumptable_maps[cnt].sym_off = sym_off;
obj->jumptable_maps[cnt].fd = map_fd;
obj->jumptable_map_cnt++;
return 0;
}
static int find_subprog_idx(struct bpf_program *prog, int insn_idx)
{
int i;
for (i = prog->subprog_cnt - 1; i >= 0; i--) {
if (insn_idx >= prog->subprogs[i].sub_insn_off)
return i;
}
return -1;
}
static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struct reloc_desc *relo)
{
const __u32 jt_entry_size = 8;
int sym_off = relo->sym_off;
int jt_size = relo->sym_size;
__u32 max_entries = jt_size / jt_entry_size;
__u32 value_size = sizeof(struct bpf_insn_array_value);
struct bpf_insn_array_value val = {};
int subprog_idx;
int map_fd, err;
__u64 insn_off;
__u64 *jt;
__u32 i;
map_fd = find_jt_map(obj, prog, sym_off);
if (map_fd >= 0)
return map_fd;
if (sym_off % jt_entry_size) {
pr_warn("map '.jumptables': jumptable start %d should be multiple of %u\n",
sym_off, jt_entry_size);
return -EINVAL;
}
if (jt_size % jt_entry_size) {
pr_warn("map '.jumptables': jumptable size %d should be multiple of %u\n",
jt_size, jt_entry_size);
return -EINVAL;
}
map_fd = bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, ".jumptables",
4, value_size, max_entries, NULL);
if (map_fd < 0)
return map_fd;
if (!obj->jumptables_data) {
pr_warn("map '.jumptables': ELF file is missing jump table data\n");
err = -EINVAL;
goto err_close;
}
if (sym_off + jt_size > obj->jumptables_data_sz) {
pr_warn("map '.jumptables': jumptables_data size is %zd, trying to access %d\n",
obj->jumptables_data_sz, sym_off + jt_size);
err = -EINVAL;
goto err_close;
}
subprog_idx = -1; /* main program */
if (relo->insn_idx < 0 || relo->insn_idx >= prog->insns_cnt) {
pr_warn("map '.jumptables': invalid instruction index %d\n", relo->insn_idx);
err = -EINVAL;
goto err_close;
}
if (prog->subprogs)
subprog_idx = find_subprog_idx(prog, relo->insn_idx);
jt = (__u64 *)(obj->jumptables_data + sym_off);
for (i = 0; i < max_entries; i++) {
/*
* The offset should be made to be relative to the beginning of
* the main function, not the subfunction.
*/
insn_off = jt[i]/sizeof(struct bpf_insn);
if (subprog_idx >= 0) {
insn_off -= prog->subprogs[subprog_idx].sec_insn_off;
insn_off += prog->subprogs[subprog_idx].sub_insn_off;
} else {
insn_off -= prog->sec_insn_off;
}
/*
* LLVM-generated jump tables contain u64 records; however, they
* should contain values that fit in u32.
*/
if (insn_off > UINT32_MAX) {
pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %d\n",
(long long)jt[i], sym_off + i * jt_entry_size);
err = -EINVAL;
goto err_close;
}
val.orig_off = insn_off;
err = bpf_map_update_elem(map_fd, &i, &val, 0);
if (err)
goto err_close;
}
err = bpf_map_freeze(map_fd);
if (err)
goto err_close;
err = add_jt_map(obj, prog, sym_off, map_fd);
if (err)
goto err_close;
return map_fd;
err_close:
close(map_fd);
return err;
}
/* Relocate data references within program code:
* - map references;
* - global variable references;
@@ -6235,6 +6434,20 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
case RELO_CORE:
/* will be handled by bpf_program_record_relos() */
break;
case RELO_INSN_ARRAY: {
int map_fd;
map_fd = create_jt_map(obj, prog, relo);
if (map_fd < 0) {
pr_warn("prog '%s': relo #%d: can't create jump table: sym_off %u\n",
prog->name, i, relo->sym_off);
return map_fd;
}
insn[0].src_reg = BPF_PSEUDO_MAP_VALUE;
insn->imm = map_fd;
insn->off = 0;
}
break;
default:
pr_warn("prog '%s': relo #%d: bad relo type %d\n",
prog->name, i, relo->type);
@@ -6432,36 +6645,62 @@ static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_progra
return 0;
}
static int save_subprog_offsets(struct bpf_program *main_prog, struct bpf_program *subprog)
{
size_t size = sizeof(main_prog->subprogs[0]);
int cnt = main_prog->subprog_cnt;
void *tmp;
tmp = libbpf_reallocarray(main_prog->subprogs, cnt + 1, size);
if (!tmp)
return -ENOMEM;
main_prog->subprogs = tmp;
main_prog->subprogs[cnt].sec_insn_off = subprog->sec_insn_off;
main_prog->subprogs[cnt].sub_insn_off = subprog->sub_insn_off;
main_prog->subprog_cnt++;
return 0;
}
static int
bpf_object__append_subprog_code(struct bpf_object *obj, struct bpf_program *main_prog,
struct bpf_program *subprog)
{
struct bpf_insn *insns;
size_t new_cnt;
int err;
struct bpf_insn *insns;
size_t new_cnt;
int err;
subprog->sub_insn_off = main_prog->insns_cnt;
subprog->sub_insn_off = main_prog->insns_cnt;
new_cnt = main_prog->insns_cnt + subprog->insns_cnt;
insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns));
if (!insns) {
pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name);
return -ENOMEM;
}
main_prog->insns = insns;
main_prog->insns_cnt = new_cnt;
new_cnt = main_prog->insns_cnt + subprog->insns_cnt;
insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns));
if (!insns) {
pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name);
return -ENOMEM;
}
main_prog->insns = insns;
main_prog->insns_cnt = new_cnt;
memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns,
subprog->insns_cnt * sizeof(*insns));
memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns,
subprog->insns_cnt * sizeof(*insns));
pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n",
main_prog->name, subprog->insns_cnt, subprog->name);
pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n",
main_prog->name, subprog->insns_cnt, subprog->name);
/* The subprog insns are now appended. Append its relos too. */
err = append_subprog_relos(main_prog, subprog);
if (err)
return err;
return 0;
/* The subprog insns are now appended. Append its relos too. */
err = append_subprog_relos(main_prog, subprog);
if (err)
return err;
err = save_subprog_offsets(main_prog, subprog);
if (err) {
pr_warn("prog '%s': failed to add subprog offsets: %s\n",
main_prog->name, errstr(err));
return err;
}
return 0;
}
static int
@@ -9228,6 +9467,13 @@ void bpf_object__close(struct bpf_object *obj)
zfree(&obj->arena_data);
zfree(&obj->jumptables_data);
obj->jumptables_data_sz = 0;
for (i = 0; i < obj->jumptable_map_cnt; i++)
close(obj->jumptable_maps[i].fd);
zfree(&obj->jumptable_maps);
free(obj);
}
@@ -13854,8 +14100,8 @@ int bpf_program__set_attach_target(struct bpf_program *prog,
return libbpf_err(-EINVAL);
if (attach_prog_fd && !attach_func_name) {
/* remember attach_prog_fd and let bpf_program__load() find
* BTF ID during the program load
/* Store attach_prog_fd. The BTF ID will be resolved later during
* the normal object/program load phase.
*/
prog->attach_prog_fd = attach_prog_fd;
return 0;

View File

@@ -448,7 +448,7 @@ LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path);
/**
* @brief **bpf_program__unpin()** unpins the BPF program from a file
* in the BPFFS specified by a path. This decrements the programs
* in the BPFFS specified by a path. This decrements program's in-kernel
* reference count.
*
* The file pinning the BPF program can also be unlinked by a different
@@ -481,14 +481,12 @@ LIBBPF_API int bpf_link__pin(struct bpf_link *link, const char *path);
/**
* @brief **bpf_link__unpin()** unpins the BPF link from a file
* in the BPFFS specified by a path. This decrements the links
* reference count.
* in the BPFFS. This decrements link's in-kernel reference count.
*
* The file pinning the BPF link can also be unlinked by a different
* process in which case this function will return an error.
*
* @param prog BPF program to unpin
* @param path file path to the pin in a BPF file system
* @param link BPF link to unpin
* @return 0, on success; negative error code, otherwise
*/
LIBBPF_API int bpf_link__unpin(struct bpf_link *link);
@@ -995,8 +993,13 @@ LIBBPF_API __u32 bpf_program__line_info_cnt(const struct bpf_program *prog);
* - fentry/fexit/fmod_ret;
* - lsm;
* - freplace.
* @param prog BPF program to set the attach type for
* @param type attach type to set the BPF map to have
* @param prog BPF program to configure; must not yet be loaded.
* @param attach_prog_fd FD of target BPF program (for freplace/extension).
* If >0 and func name omitted, defers BTF ID resolution.
* @param attach_func_name Target function name. Used either with
* attach_prog_fd to find destination BTF type ID in that BPF program, or
* alone (no attach_prog_fd) to resolve kernel (vmlinux/module) BTF ID.
* Must be provided if attach_prog_fd is 0.
* @return error code; or 0 if no error occurred.
*/
LIBBPF_API int
@@ -1098,6 +1101,7 @@ LIBBPF_API __u32 bpf_map__value_size(const struct bpf_map *map);
/**
* @brief **bpf_map__set_value_size()** sets map value size.
* @param map the BPF map instance
* @param size the new value size
* @return 0, on success; negative error, otherwise
*
* There is a special case for maps with associated memory-mapped regions, like
@@ -1202,7 +1206,7 @@ LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map);
* per-CPU values value size has to be aligned up to closest 8 bytes for
* alignment reasons, so expected size is: `round_up(value_size, 8)
* * libbpf_num_possible_cpus()`.
* @flags extra flags passed to kernel for this operation
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__lookup_elem()** is high-level equivalent of
@@ -1226,7 +1230,7 @@ LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map,
* per-CPU values value size has to be aligned up to closest 8 bytes for
* alignment reasons, so expected size is: `round_up(value_size, 8)
* * libbpf_num_possible_cpus()`.
* @flags extra flags passed to kernel for this operation
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__update_elem()** is high-level equivalent of
@@ -1242,7 +1246,7 @@ LIBBPF_API int bpf_map__update_elem(const struct bpf_map *map,
* @param map BPF map to delete element from
* @param key pointer to memory containing bytes of the key
* @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size**
* @flags extra flags passed to kernel for this operation
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__delete_elem()** is high-level equivalent of
@@ -1265,7 +1269,7 @@ LIBBPF_API int bpf_map__delete_elem(const struct bpf_map *map,
* per-CPU values value size has to be aligned up to closest 8 bytes for
* alignment reasons, so expected size is: `round_up(value_size, 8)
* * libbpf_num_possible_cpus()`.
* @flags extra flags passed to kernel for this operation
* @param flags extra flags passed to kernel for this operation
* @return 0, on success; negative error, otherwise
*
* **bpf_map__lookup_and_delete_elem()** is high-level equivalent of
@@ -1637,6 +1641,7 @@ struct perf_buffer_opts {
* @param sample_cb function called on each received data record
* @param lost_cb function called when record loss has occurred
* @param ctx user-provided extra context passed into *sample_cb* and *lost_cb*
* @param opts optional parameters for the perf buffer, can be null
* @return a new instance of struct perf_buffer on success, NULL on error with
* *errno* containing an error code
*/

View File

@@ -74,6 +74,8 @@
#define ELF64_ST_VISIBILITY(o) ((o) & 0x03)
#endif
#define JUMPTABLES_SEC ".jumptables"
#define BTF_INFO_ENC(kind, kind_flag, vlen) \
((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
#define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type)

View File

@@ -364,6 +364,10 @@ static int probe_map_create(enum bpf_map_type map_type)
case BPF_MAP_TYPE_SOCKHASH:
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
break;
case BPF_MAP_TYPE_INSN_ARRAY:
key_size = sizeof(__u32);
value_size = sizeof(struct bpf_insn_array_value);
break;
case BPF_MAP_TYPE_UNSPEC:
default:
return -EOPNOTSUPP;

View File

@@ -2025,6 +2025,9 @@ static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj,
obj->sym_map[src_sym_idx] = dst_sec->sec_sym_idx;
return 0;
}
if (strcmp(src_sec->sec_name, JUMPTABLES_SEC) == 0)
goto add_sym;
}
if (sym_bind == STB_LOCAL)

View File

@@ -23,6 +23,7 @@ test_tcpnotify_user
test_libbpf
xdping
test_cpp
test_progs_verification_cert
*.d
*.subskel.h
*.skel.h
@@ -32,7 +33,6 @@ test_cpp
/cpuv4
/host-tools
/tools
/runqslower
/bench
/veristat
/sign-file

View File

@@ -46,6 +46,7 @@ endif
CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11 \
-Wall -Werror -fno-omit-frame-pointer \
-Wno-unused-but-set-variable \
$(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \
-I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \
-I$(TOOLSINCDIR) -I$(TOOLSARCHINCDIR) -I$(APIDIR) -I$(OUTPUT)
@@ -98,14 +99,11 @@ TEST_GEN_PROGS += test_progs-cpuv4
TEST_INST_SUBDIRS += cpuv4
endif
TEST_GEN_FILES = test_tc_edt.bpf.o
TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c)
# Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \
test_lirc_mode2.sh \
test_tc_tunnel.sh \
test_tc_edt.sh \
test_xdping.sh \
test_bpftool_build.sh \
test_bpftool.sh \
@@ -127,7 +125,6 @@ TEST_KMOD_TARGETS = $(addprefix $(OUTPUT)/,$(TEST_KMODS))
TEST_GEN_PROGS_EXTENDED = \
bench \
flow_dissector_load \
runqslower \
test_cpp \
test_lirc_mode2_user \
veristat \
@@ -209,8 +206,6 @@ HOST_INCLUDE_DIR := $(INCLUDE_DIR)
endif
HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a
RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids
RUNQSLOWER_OUTPUT := $(BUILD_DIR)/runqslower/
VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \
$(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \
../../../../vmlinux \
@@ -232,7 +227,7 @@ $(notdir $(TEST_GEN_PROGS) $(TEST_KMODS) \
MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \
$(BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/bpftool \
$(HOST_BUILD_DIR)/resolve_btfids \
$(RUNQSLOWER_OUTPUT) $(INCLUDE_DIR))
$(INCLUDE_DIR))
$(MAKE_DIRS):
$(call msg,MKDIR,,$@)
$(Q)mkdir -p $@
@@ -304,17 +299,6 @@ TRUNNER_BPFTOOL := $(DEFAULT_BPFTOOL)
USE_BOOTSTRAP := "bootstrap/"
endif
$(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT)
$(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \
OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF) \
BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \
BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf/ \
BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) \
BPF_TARGET_ENDIAN=$(BPF_TARGET_ENDIAN) \
EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS) $(EXTRA_CFLAGS)' \
EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)' && \
cp $(RUNQSLOWER_OUTPUT)runqslower $@
TEST_GEN_PROGS_EXTENDED += $(TRUNNER_BPFTOOL)
$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(BPFOBJ)
@@ -453,7 +437,9 @@ BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \
-I$(abspath $(OUTPUT)/../usr/include) \
-std=gnu11 \
-fno-strict-aliasing \
-Wno-compare-distinct-pointer-types
-Wno-compare-distinct-pointer-types \
-Wno-initializer-overrides \
#
# TODO: enable me -Wsign-compare
CLANG_CFLAGS = $(CLANG_SYS_INCLUDES)
@@ -498,7 +484,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \
LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \
core_kern.c core_kern_overflow.c test_ringbuf.c \
test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c \
test_ringbuf_overwrite.c
LSKELS_SIGNED := fentry_test.c fexit_test.c atomics.c
@@ -543,6 +530,8 @@ TRUNNER_TEST_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.test.o, \
$$(notdir $$(wildcard $(TRUNNER_TESTS_DIR)/*.c)))
TRUNNER_EXTRA_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \
$$(filter %.c,$(TRUNNER_EXTRA_SOURCES)))
TRUNNER_LIB_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \
$$(filter %.c,$(TRUNNER_LIB_SOURCES)))
TRUNNER_EXTRA_HDRS := $$(filter %.h,$(TRUNNER_EXTRA_SOURCES))
TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h
TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c))
@@ -686,6 +675,10 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \
$$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@)
$(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@
$(TRUNNER_LIB_OBJS): $(TRUNNER_OUTPUT)/%.o:$(TOOLSDIR)/lib/%.c
$$(call msg,LIB-OBJ,$(TRUNNER_BINARY),$$@)
$(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@
# non-flavored in-srctree builds receive special treatment, in particular, we
# do not need to copy extra resources (see e.g. test_btf_dump_case())
$(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT)
@@ -699,6 +692,7 @@ $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS)
$(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \
$(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \
$(TRUNNER_LIB_OBJS) \
$(RESOLVE_BTFIDS) \
$(TRUNNER_BPFTOOL) \
$(OUTPUT)/veristat \
@@ -721,7 +715,8 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP)
$(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR)
$(VERIFY_SIG_HDR): $(VERIFICATION_CERT)
$(Q)xxd -i -n test_progs_verification_cert $< > $@
$(Q)ln -fs $< test_progs_verification_cert && \
xxd -i test_progs_verification_cert > $@
# Define test_progs test runner.
TRUNNER_TESTS_DIR := prog_tests
@@ -745,6 +740,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \
$(VERIFY_SIG_HDR) \
flow_dissector_load.h \
ip_check_defrag_frags.h
TRUNNER_LIB_SOURCES := find_bit.c
TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \
$(OUTPUT)/liburandom_read.so \
$(OUTPUT)/xdp_synproxy \
@@ -782,6 +778,7 @@ endif
TRUNNER_TESTS_DIR := map_tests
TRUNNER_BPF_PROGS_DIR := progs
TRUNNER_EXTRA_SOURCES := test_maps.c
TRUNNER_LIB_SOURCES :=
TRUNNER_EXTRA_FILES :=
TRUNNER_BPF_BUILD_RULE := $$(error no BPF objects should be built)
TRUNNER_BPF_CFLAGS :=
@@ -803,7 +800,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT)
$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
# Include find_bit.c to compile xskxceiver.
EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c
EXTRA_SRC := $(TOOLSDIR)/lib/find_bit.c prog_tests/test_xsk.c prog_tests/test_xsk.h
$(OUTPUT)/xskxceiver: $(EXTRA_SRC) xskxceiver.c xskxceiver.h $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xsk_xdp_progs.skel.h $(BPFOBJ) | $(OUTPUT)
$(call msg,BINARY,,$@)
$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
@@ -893,7 +890,8 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \
$(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \
no_alu32 cpuv4 bpf_gcc \
liburandom_read.so) \
$(OUTPUT)/FEATURE-DUMP.selftests
$(OUTPUT)/FEATURE-DUMP.selftests \
test_progs_verification_cert
.PHONY: docs docs-clean

View File

@@ -19,6 +19,8 @@ static struct {
int ringbuf_sz; /* per-ringbuf, in bytes */
bool ringbuf_use_output; /* use slower output API */
int perfbuf_sz; /* per-CPU size, in pages */
bool overwrite;
bool bench_producer;
} args = {
.back2back = false,
.batch_cnt = 500,
@@ -27,6 +29,8 @@ static struct {
.ringbuf_sz = 512 * 1024,
.ringbuf_use_output = false,
.perfbuf_sz = 128,
.overwrite = false,
.bench_producer = false,
};
enum {
@@ -35,6 +39,8 @@ enum {
ARG_RB_BATCH_CNT = 2002,
ARG_RB_SAMPLED = 2003,
ARG_RB_SAMPLE_RATE = 2004,
ARG_RB_OVERWRITE = 2005,
ARG_RB_BENCH_PRODUCER = 2006,
};
static const struct argp_option opts[] = {
@@ -43,6 +49,8 @@ static const struct argp_option opts[] = {
{ "rb-batch-cnt", ARG_RB_BATCH_CNT, "CNT", 0, "Set BPF-side record batch count"},
{ "rb-sampled", ARG_RB_SAMPLED, NULL, 0, "Notification sampling"},
{ "rb-sample-rate", ARG_RB_SAMPLE_RATE, "RATE", 0, "Notification sample rate"},
{ "rb-overwrite", ARG_RB_OVERWRITE, NULL, 0, "Overwrite mode"},
{ "rb-bench-producer", ARG_RB_BENCH_PRODUCER, NULL, 0, "Benchmark producer"},
{},
};
@@ -72,6 +80,12 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
argp_usage(state);
}
break;
case ARG_RB_OVERWRITE:
args.overwrite = true;
break;
case ARG_RB_BENCH_PRODUCER:
args.bench_producer = true;
break;
default:
return ARGP_ERR_UNKNOWN;
}
@@ -95,8 +109,33 @@ static inline void bufs_trigger_batch(void)
static void bufs_validate(void)
{
if (env.consumer_cnt != 1) {
fprintf(stderr, "rb-libbpf benchmark needs one consumer!\n");
if (args.bench_producer && strcmp(env.bench_name, "rb-libbpf")) {
fprintf(stderr, "--rb-bench-producer only works with rb-libbpf!\n");
exit(1);
}
if (args.overwrite && !args.bench_producer) {
fprintf(stderr, "overwrite mode only works with --rb-bench-producer for now!\n");
exit(1);
}
if (args.bench_producer && env.consumer_cnt != 0) {
fprintf(stderr, "no consumer is needed for --rb-bench-producer!\n");
exit(1);
}
if (args.bench_producer && args.back2back) {
fprintf(stderr, "back-to-back mode makes no sense for --rb-bench-producer!\n");
exit(1);
}
if (args.bench_producer && args.sampled) {
fprintf(stderr, "sampling mode makes no sense for --rb-bench-producer!\n");
exit(1);
}
if (!args.bench_producer && env.consumer_cnt != 1) {
fprintf(stderr, "benchmarks without --rb-bench-producer require exactly one consumer!\n");
exit(1);
}
@@ -128,12 +167,17 @@ static void ringbuf_libbpf_measure(struct bench_res *res)
{
struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx;
res->hits = atomic_swap(&buf_hits.value, 0);
if (args.bench_producer)
res->hits = atomic_swap(&ctx->skel->bss->hits, 0);
else
res->hits = atomic_swap(&buf_hits.value, 0);
res->drops = atomic_swap(&ctx->skel->bss->dropped, 0);
}
static struct ringbuf_bench *ringbuf_setup_skeleton(void)
{
__u32 flags;
struct bpf_map *ringbuf;
struct ringbuf_bench *skel;
setup_libbpf();
@@ -146,12 +190,19 @@ static struct ringbuf_bench *ringbuf_setup_skeleton(void)
skel->rodata->batch_cnt = args.batch_cnt;
skel->rodata->use_output = args.ringbuf_use_output ? 1 : 0;
skel->rodata->bench_producer = args.bench_producer;
if (args.sampled)
/* record data + header take 16 bytes */
skel->rodata->wakeup_data_size = args.sample_rate * 16;
bpf_map__set_max_entries(skel->maps.ringbuf, args.ringbuf_sz);
ringbuf = skel->maps.ringbuf;
if (args.overwrite) {
flags = bpf_map__map_flags(ringbuf) | BPF_F_RB_OVERWRITE;
bpf_map__set_map_flags(ringbuf, flags);
}
bpf_map__set_max_entries(ringbuf, args.ringbuf_sz);
if (ringbuf_bench__load(skel)) {
fprintf(stderr, "failed to load skeleton\n");
@@ -171,10 +222,12 @@ static void ringbuf_libbpf_setup(void)
{
struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx;
struct bpf_link *link;
int map_fd;
ctx->skel = ringbuf_setup_skeleton();
ctx->ringbuf = ring_buffer__new(bpf_map__fd(ctx->skel->maps.ringbuf),
buf_process_sample, NULL, NULL);
map_fd = bpf_map__fd(ctx->skel->maps.ringbuf);
ctx->ringbuf = ring_buffer__new(map_fd, buf_process_sample, NULL, NULL);
if (!ctx->ringbuf) {
fprintf(stderr, "failed to create ringbuf\n");
exit(1);

View File

@@ -180,10 +180,10 @@ static void trigger_kernel_count_setup(void)
{
setup_ctx();
bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
bpf_program__set_autoload(ctx.skel->progs.trigger_count, true);
bpf_program__set_autoload(ctx.skel->progs.trigger_kernel_count, true);
load_ctx();
/* override driver program */
ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count);
ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_kernel_count);
}
static void trigger_kprobe_setup(void)

View File

@@ -49,3 +49,7 @@ for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
summarize "rb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)"
done
header "Ringbuf, multi-producer contention in overwrite mode, no consumer"
for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
summarize "rb-prod nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 --rb-overwrite --rb-bench-producer rb-libbpf)"
done

View File

@@ -64,14 +64,12 @@ static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h)
static inline void __list_del(arena_list_node_t *n)
{
arena_list_node_t *next = n->next, *tmp;
arena_list_node_t *next = n->next;
arena_list_node_t * __arena *pprev = n->pprev;
cast_user(next);
cast_kern(pprev);
tmp = *pprev;
cast_kern(tmp);
WRITE_ONCE(tmp, next);
WRITE_ONCE(*pprev, next);
if (next) {
cast_user(pprev);
cast_kern(next);

View File

@@ -0,0 +1,128 @@
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#pragma once
#include "bpf_arena_common.h"
__noinline int bpf_arena_strlen(const char __arena *s __arg_arena)
{
const char __arena *sc;
for (sc = s; *sc != '\0'; ++sc)
cond_break;
return sc - s;
}
/**
* glob_match - Shell-style pattern matching, like !fnmatch(pat, str, 0)
* @pat: Shell-style pattern to match, e.g. "*.[ch]".
* @str: String to match. The pattern must match the entire string.
*
* Perform shell-style glob matching, returning true (1) if the match
* succeeds, or false (0) if it fails. Equivalent to !fnmatch(@pat, @str, 0).
*
* Pattern metacharacters are ?, *, [ and \.
* (And, inside character classes, !, - and ].)
*
* This is a small and simple implementation intended for device blacklists
* where a string is matched against a number of patterns. Thus, it
* does not preprocess the patterns. It is non-recursive, and run-time
* is at most quadratic: strlen(@str)*strlen(@pat).
*
* An example of the worst case is glob_match("*aaaaa", "aaaaaaaaaa");
* it takes 6 passes over the pattern before matching the string.
*
* Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT
* treat / or leading . specially; it isn't actually used for pathnames.
*
* Note that according to glob(7) (and unlike bash), character classes
* are complemented by a leading !; this does not support the regex-style
* [^a-z] syntax.
*
* An opening bracket without a matching close is matched literally.
*/
__noinline bool glob_match(char const __arena *pat __arg_arena, char const __arena *str __arg_arena)
{
/*
* Backtrack to previous * on mismatch and retry starting one
* character later in the string. Because * matches all characters
* (no exception for /), it can be easily proved that there's
* never a need to backtrack multiple levels.
*/
char const __arena *back_pat = NULL, *back_str;
/*
* Loop over each token (character or class) in pat, matching
* it against the remaining unmatched tail of str. Return false
* on mismatch, or true after matching the trailing nul bytes.
*/
for (;;) {
unsigned char c = *str++;
unsigned char d = *pat++;
switch (d) {
case '?': /* Wildcard: anything but nul */
if (c == '\0')
return false;
break;
case '*': /* Any-length wildcard */
if (*pat == '\0') /* Optimize trailing * case */
return true;
back_pat = pat;
back_str = --str; /* Allow zero-length match */
break;
case '[': { /* Character class */
bool match = false, inverted = (*pat == '!');
char const __arena *class = pat + inverted;
unsigned char a = *class++;
/*
* Iterate over each span in the character class.
* A span is either a single character a, or a
* range a-b. The first span may begin with ']'.
*/
do {
unsigned char b = a;
if (a == '\0') /* Malformed */
goto literal;
if (class[0] == '-' && class[1] != ']') {
b = class[1];
if (b == '\0')
goto literal;
class += 2;
/* Any special action if a > b? */
}
match |= (a <= c && c <= b);
cond_break;
} while ((a = *class++) != ']');
if (match == inverted)
goto backtrack;
pat = class;
}
break;
case '\\':
d = *pat++;
__attribute__((__fallthrough__));
default: /* Literal character */
literal:
if (c == d) {
if (d == '\0')
return true;
break;
}
backtrack:
if (c == '\0' || !back_pat)
return false; /* No point continuing */
/* Try again from last *, one character later in str. */
pat = back_pat;
str = ++back_str;
break;
}
cond_break;
}
return false;
}

View File

@@ -28,8 +28,8 @@ extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
* Either a direct pointer to the dynptr data or a pointer to the user-provided
* buffer if unable to obtain a direct pointer
*/
extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset,
void *buffer, __u32 buffer__szk) __ksym __weak;
extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u64 offset,
void *buffer, __u64 buffer__szk) __ksym __weak;
/* Description
* Obtain a read-write pointer to the dynptr's data
@@ -37,13 +37,13 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset,
* Either a direct pointer to the dynptr data or a pointer to the user-provided
* buffer if unable to obtain a direct pointer
*/
extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u32 offset,
void *buffer, __u32 buffer__szk) __ksym __weak;
extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u64 offset, void *buffer,
__u64 buffer__szk) __ksym __weak;
extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym __weak;
extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak;
extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym __weak;
extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym __weak;
extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak;
extern __u64 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak;
extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym __weak;
/* Description

View File

@@ -97,7 +97,7 @@ int settimeo(int fd, int timeout_ms)
int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen,
const struct network_helper_opts *opts)
{
int fd;
int on = 1, fd;
if (!opts)
opts = &default_opts;
@@ -111,6 +111,12 @@ int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t a
if (settimeo(fd, opts->timeout_ms))
goto error_close;
if (type == SOCK_STREAM &&
setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on))) {
log_err("Failed to enable SO_REUSEADDR");
goto error_close;
}
if (opts->post_socket_cb &&
opts->post_socket_cb(fd, opts->cb_opts)) {
log_err("Failed to call post_socket_cb");
@@ -766,6 +772,50 @@ int send_recv_data(int lfd, int fd, uint32_t total_bytes)
return err;
}
int tc_prog_attach(const char *dev, int ingress_fd, int egress_fd)
{
int ifindex, ret;
if (!ASSERT_TRUE(ingress_fd >= 0 || egress_fd >= 0,
"at least one program fd is valid"))
return -1;
ifindex = if_nametoindex(dev);
if (!ASSERT_NEQ(ifindex, 0, "get ifindex"))
return -1;
DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex,
.attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS);
DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1,
.priority = 1, .prog_fd = ingress_fd);
DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts2, .handle = 1,
.priority = 1, .prog_fd = egress_fd);
ret = bpf_tc_hook_create(&hook);
if (!ASSERT_OK(ret, "create tc hook"))
return ret;
if (ingress_fd >= 0) {
hook.attach_point = BPF_TC_INGRESS;
ret = bpf_tc_attach(&hook, &opts1);
if (!ASSERT_OK(ret, "bpf_tc_attach")) {
bpf_tc_hook_destroy(&hook);
return ret;
}
}
if (egress_fd >= 0) {
hook.attach_point = BPF_TC_EGRESS;
ret = bpf_tc_attach(&hook, &opts2);
if (!ASSERT_OK(ret, "bpf_tc_attach")) {
bpf_tc_hook_destroy(&hook);
return ret;
}
}
return 0;
}
#ifdef TRAFFIC_MONITOR
struct tmonitor_ctx {
pcap_t *pcap;

View File

@@ -255,6 +255,22 @@ struct tmonitor_ctx;
typedef int (*tm_print_fn_t)(const char *format, va_list args);
/**
* tc_prog_attach - attach BPF program(s) to an interface
*
* Takes file descriptors pointing to at least one, and at most two, BPF
* programs, and attaches those programs to an interface's ingress, egress,
* or both.
*
* @dev: string containing the interface name
* @ingress_fd: file descriptor of the program to attach to interface ingress
* @egress_fd: file descriptor of the program to attach to interface egress
*
* Returns 0 on success, -1 if no valid file descriptor has been found, if
* the interface name is invalid, or if an error occurred during attach.
*/
int tc_prog_attach(const char *dev, int ingress_fd, int egress_fd);
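A brief usage sketch (not part of the patch; "veth_test" and the surrounding helper are hypothetical): attach the same program to both directions, or pass -1 for a direction to skip it, since the implementation above only attaches fds that are >= 0.

#include <bpf/libbpf.h>
#include "network_helpers.h"

/* Sketch only: attach 'prog' to both ingress and egress of veth_test. */
static int attach_both_ways(struct bpf_program *prog)
{
	int fd = bpf_program__fd(prog);

	return tc_prog_attach("veth_test", fd, fd);
}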
#ifdef TRAFFIC_MONITOR
struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name,
const char *subtest_name);

View File

@@ -0,0 +1,30 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#include <test_progs.h>
#include "arena_strsearch.skel.h"
static void test_arena_str(void)
{
LIBBPF_OPTS(bpf_test_run_opts, opts);
struct arena_strsearch *skel;
int ret;
skel = arena_strsearch__open_and_load();
if (!ASSERT_OK_PTR(skel, "arena_strsearch__open_and_load"))
return;
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_strsearch), &opts);
ASSERT_OK(ret, "ret_add");
ASSERT_OK(opts.retval, "retval");
if (skel->bss->skip) {
printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__);
test__skip();
}
arena_strsearch__destroy(skel);
}
void test_arena_strsearch(void)
{
if (test__start_subtest("arena_strsearch"))
test_arena_str();
}

View File

@@ -0,0 +1,292 @@
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/in6.h>
#include <linux/udp.h>
#include <linux/tcp.h>
#include <sys/syscall.h>
#include <bpf/bpf.h>
#include "bpf_gotox.skel.h"
static void __test_run(struct bpf_program *prog, void *ctx_in, size_t ctx_size_in)
{
LIBBPF_OPTS(bpf_test_run_opts, topts,
.ctx_in = ctx_in,
.ctx_size_in = ctx_size_in,
);
int err, prog_fd;
prog_fd = bpf_program__fd(prog);
err = bpf_prog_test_run_opts(prog_fd, &topts);
ASSERT_OK(err, "test_run_opts err");
}
static void __subtest(struct bpf_gotox *skel, void (*check)(struct bpf_gotox *))
{
if (skel->data->skip)
test__skip();
else
check(skel);
}
static void check_simple(struct bpf_gotox *skel,
struct bpf_program *prog,
__u64 ctx_in,
__u64 expected)
{
skel->bss->ret_user = 0;
__test_run(prog, &ctx_in, sizeof(ctx_in));
if (!ASSERT_EQ(skel->bss->ret_user, expected, "skel->bss->ret_user"))
return;
}
static void check_simple_fentry(struct bpf_gotox *skel,
struct bpf_program *prog,
__u64 ctx_in,
__u64 expected)
{
skel->bss->in_user = ctx_in;
skel->bss->ret_user = 0;
/* trigger */
usleep(1);
if (!ASSERT_EQ(skel->bss->ret_user, expected, "skel->bss->ret_user"))
return;
}
/* validate that for two loads of the same jump table libbpf generates only one map */
static void check_one_map_two_jumps(struct bpf_gotox *skel)
{
struct bpf_prog_info prog_info;
struct bpf_map_info map_info;
__u32 len;
__u32 map_ids[16];
int prog_fd, map_fd;
int ret;
int i;
bool seen = false;
memset(&prog_info, 0, sizeof(prog_info));
prog_info.map_ids = (long)map_ids;
prog_info.nr_map_ids = ARRAY_SIZE(map_ids);
prog_fd = bpf_program__fd(skel->progs.one_map_two_jumps);
if (!ASSERT_GE(prog_fd, 0, "bpf_program__fd(one_map_two_jumps)"))
return;
len = sizeof(prog_info);
ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &len);
if (!ASSERT_OK(ret, "bpf_obj_get_info_by_fd(prog_fd)"))
return;
for (i = 0; i < prog_info.nr_map_ids; i++) {
map_fd = bpf_map_get_fd_by_id(map_ids[i]);
if (!ASSERT_GE(map_fd, 0, "bpf_map_get_fd_by_id"))
return;
len = sizeof(map_info);
memset(&map_info, 0, len);
ret = bpf_obj_get_info_by_fd(map_fd, &map_info, &len);
if (!ASSERT_OK(ret, "bpf_obj_get_info_by_fd(map_fd)")) {
close(map_fd);
return;
}
if (map_info.type == BPF_MAP_TYPE_INSN_ARRAY) {
if (!ASSERT_EQ(seen, false, "more than one INSN_ARRAY map")) {
close(map_fd);
return;
}
seen = true;
}
close(map_fd);
}
ASSERT_EQ(seen, true, "no INSN_ARRAY map");
}
static void check_one_switch(struct bpf_gotox *skel)
{
__u64 in[] = {0, 1, 2, 3, 4, 5, 77};
__u64 out[] = {2, 3, 4, 5, 7, 19, 19};
int i;
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple(skel, skel->progs.one_switch, in[i], out[i]);
}
static void check_one_switch_non_zero_sec_off(struct bpf_gotox *skel)
{
__u64 in[] = {0, 1, 2, 3, 4, 5, 77};
__u64 out[] = {2, 3, 4, 5, 7, 19, 19};
int i;
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple(skel, skel->progs.one_switch_non_zero_sec_off, in[i], out[i]);
}
static void check_two_switches(struct bpf_gotox *skel)
{
__u64 in[] = {0, 1, 2, 3, 4, 5, 77};
__u64 out[] = {103, 104, 107, 205, 115, 1019, 1019};
int i;
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple(skel, skel->progs.two_switches, in[i], out[i]);
}
static void check_big_jump_table(struct bpf_gotox *skel)
{
__u64 in[] = {0, 11, 27, 31, 22, 45, 99};
__u64 out[] = {2, 3, 4, 5, 19, 19, 19};
int i;
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple(skel, skel->progs.big_jump_table, in[i], out[i]);
}
static void check_one_jump_two_maps(struct bpf_gotox *skel)
{
__u64 in[] = {0, 1, 2, 3, 4, 5, 77};
__u64 out[] = {12, 15, 7, 15, 12, 15, 15};
int i;
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple(skel, skel->progs.one_jump_two_maps, in[i], out[i]);
}
static void check_static_global(struct bpf_gotox *skel)
{
__u64 in[] = {0, 1, 2, 3, 4, 5, 77};
__u64 out[] = {2, 3, 4, 5, 7, 19, 19};
int i;
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple(skel, skel->progs.use_static_global1, in[i], out[i]);
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple(skel, skel->progs.use_static_global2, in[i], out[i]);
}
static void check_nonstatic_global(struct bpf_gotox *skel)
{
__u64 in[] = {0, 1, 2, 3, 4, 5, 77};
__u64 out[] = {2, 3, 4, 5, 7, 19, 19};
int i;
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple(skel, skel->progs.use_nonstatic_global1, in[i], out[i]);
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple(skel, skel->progs.use_nonstatic_global2, in[i], out[i]);
}
static void check_other_sec(struct bpf_gotox *skel)
{
struct bpf_link *link;
__u64 in[] = {0, 1, 2, 3, 4, 5, 77};
__u64 out[] = {2, 3, 4, 5, 7, 19, 19};
int i;
link = bpf_program__attach(skel->progs.simple_test_other_sec);
if (!ASSERT_OK_PTR(link, "link"))
return;
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple_fentry(skel, skel->progs.simple_test_other_sec, in[i], out[i]);
bpf_link__destroy(link);
}
static void check_static_global_other_sec(struct bpf_gotox *skel)
{
struct bpf_link *link;
__u64 in[] = {0, 1, 2, 3, 4, 5, 77};
__u64 out[] = {2, 3, 4, 5, 7, 19, 19};
int i;
link = bpf_program__attach(skel->progs.use_static_global_other_sec);
if (!ASSERT_OK_PTR(link, "link"))
return;
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple_fentry(skel, skel->progs.use_static_global_other_sec, in[i], out[i]);
bpf_link__destroy(link);
}
static void check_nonstatic_global_other_sec(struct bpf_gotox *skel)
{
struct bpf_link *link;
__u64 in[] = {0, 1, 2, 3, 4, 5, 77};
__u64 out[] = {2, 3, 4, 5, 7, 19, 19};
int i;
link = bpf_program__attach(skel->progs.use_nonstatic_global_other_sec);
if (!ASSERT_OK_PTR(link, "link"))
return;
for (i = 0; i < ARRAY_SIZE(in); i++)
check_simple_fentry(skel, skel->progs.use_nonstatic_global_other_sec, in[i], out[i]);
bpf_link__destroy(link);
}
void test_bpf_gotox(void)
{
struct bpf_gotox *skel;
int ret;
skel = bpf_gotox__open();
if (!ASSERT_NEQ(skel, NULL, "bpf_gotox__open"))
return;
ret = bpf_gotox__load(skel);
if (!ASSERT_OK(ret, "bpf_gotox__load"))
return;
skel->bss->pid = getpid();
if (test__start_subtest("one-switch"))
__subtest(skel, check_one_switch);
if (test__start_subtest("one-switch-non-zero-sec-offset"))
__subtest(skel, check_one_switch_non_zero_sec_off);
if (test__start_subtest("two-switches"))
__subtest(skel, check_two_switches);
if (test__start_subtest("big-jump-table"))
__subtest(skel, check_big_jump_table);
if (test__start_subtest("static-global"))
__subtest(skel, check_static_global);
if (test__start_subtest("nonstatic-global"))
__subtest(skel, check_nonstatic_global);
if (test__start_subtest("other-sec"))
__subtest(skel, check_other_sec);
if (test__start_subtest("static-global-other-sec"))
__subtest(skel, check_static_global_other_sec);
if (test__start_subtest("nonstatic-global-other-sec"))
__subtest(skel, check_nonstatic_global_other_sec);
if (test__start_subtest("one-jump-two-maps"))
__subtest(skel, check_one_jump_two_maps);
if (test__start_subtest("one-map-two-jumps"))
__subtest(skel, check_one_map_two_jumps);
bpf_gotox__destroy(skel);
}

View File

@@ -0,0 +1,504 @@
// SPDX-License-Identifier: GPL-2.0
#include <bpf/bpf.h>
#include <test_progs.h>
#ifdef __x86_64__
static int map_create(__u32 map_type, __u32 max_entries)
{
const char *map_name = "insn_array";
__u32 key_size = 4;
__u32 value_size = sizeof(struct bpf_insn_array_value);
return bpf_map_create(map_type, map_name, key_size, value_size, max_entries, NULL);
}
static int prog_load(struct bpf_insn *insns, __u32 insn_cnt, int *fd_array, __u32 fd_array_cnt)
{
LIBBPF_OPTS(bpf_prog_load_opts, opts);
opts.fd_array = fd_array;
opts.fd_array_cnt = fd_array_cnt;
return bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, &opts);
}
static void __check_success(struct bpf_insn *insns, __u32 insn_cnt, __u32 *map_in, __u32 *map_out)
{
struct bpf_insn_array_value val = {};
int prog_fd = -1, map_fd, i;
map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, insn_cnt);
if (!ASSERT_GE(map_fd, 0, "map_create"))
return;
for (i = 0; i < insn_cnt; i++) {
val.orig_off = map_in[i];
if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem"))
goto cleanup;
}
if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
goto cleanup;
prog_fd = prog_load(insns, insn_cnt, &map_fd, 1);
if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)"))
goto cleanup;
for (i = 0; i < insn_cnt; i++) {
char buf[64];
if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem"))
goto cleanup;
snprintf(buf, sizeof(buf), "val.xlated_off should be equal map_out[%d]", i);
ASSERT_EQ(val.xlated_off, map_out[i], buf);
}
cleanup:
close(prog_fd);
close(map_fd);
}
/*
* Load a program which will not be mangled in any way by the verifier. Add an
* insn_array map pointing to every instruction. Check that it hasn't changed
* after the program load.
*/
static void check_one_to_one_mapping(void)
{
struct bpf_insn insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 4),
BPF_MOV64_IMM(BPF_REG_0, 3),
BPF_MOV64_IMM(BPF_REG_0, 2),
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
__u32 map_in[] = {0, 1, 2, 3, 4, 5};
__u32 map_out[] = {0, 1, 2, 3, 4, 5};
__check_success(insns, ARRAY_SIZE(insns), map_in, map_out);
}
/*
* Load a program with two patched calls (bpf_jiffies64, for simplicity). Add
* an insn_array map pointing to every instruction. Check how the map has
* changed after the program load.
*/
static void check_simple(void)
{
struct bpf_insn insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 2),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
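/* Each bpf_jiffies64() call is expected to be patched by the verifier into a
 * three-instruction sequence, shifting every subsequent xlated offset by two.
 */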
__u32 map_in[] = {0, 1, 2, 3, 4, 5};
__u32 map_out[] = {0, 1, 4, 5, 8, 9};
__check_success(insns, ARRAY_SIZE(insns), map_in, map_out);
}
/*
* The verifier can delete code in two cases: nops and dead code. From the
* insn array's point of view, the two cases are the same, so test using
* the simplest method: loading some nops.
*/
static void check_deletions(void)
{
struct bpf_insn insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 2),
BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
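/* map_out entries of -1 are expected for the instructions the verifier
 * removed (the nops above); the remaining instructions shift up accordingly.
 */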
__u32 map_in[] = {0, 1, 2, 3, 4, 5};
__u32 map_out[] = {0, -1, 1, -1, 2, 3};
__check_success(insns, ARRAY_SIZE(insns), map_in, map_out);
}
/*
* Same test as check_deletions, but with additional code that makes the
* verifier add instructions.
*/
static void check_deletions_with_functions(void)
{
struct bpf_insn insns[] = {
BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN(),
BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* nop */
BPF_MOV64_IMM(BPF_REG_0, 2),
BPF_EXIT_INSN(),
};
__u32 map_in[] = { 0, 1, 2, 3, 4, 5, /* func */ 6, 7, 8, 9, 10};
__u32 map_out[] = {-1, 0, -1, 3, 4, 5, /* func */ -1, 6, -1, 9, 10};
__check_success(insns, ARRAY_SIZE(insns), map_in, map_out);
}
/*
* Try to load a program with a map which points outside of the program
*/
static void check_out_of_bounds_index(void)
{
struct bpf_insn insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 4),
BPF_MOV64_IMM(BPF_REG_0, 3),
BPF_MOV64_IMM(BPF_REG_0, 2),
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
int prog_fd, map_fd;
struct bpf_insn_array_value val = {};
int key;
map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1);
if (!ASSERT_GE(map_fd, 0, "map_create"))
return;
key = 0;
val.orig_off = ARRAY_SIZE(insns); /* too big */
if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &key, &val, 0), 0, "bpf_map_update_elem"))
goto cleanup;
if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
goto cleanup;
prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) {
close(prog_fd);
goto cleanup;
}
cleanup:
close(map_fd);
}
/*
* Try to load a program with a map which points to the middle of a 16-byte instruction
*/
static void check_mid_insn_index(void)
{
struct bpf_insn insns[] = {
BPF_LD_IMM64(BPF_REG_0, 0), /* 2 x 8 */
BPF_EXIT_INSN(),
};
int prog_fd, map_fd;
struct bpf_insn_array_value val = {};
int key;
map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1);
if (!ASSERT_GE(map_fd, 0, "map_create"))
return;
key = 0;
val.orig_off = 1; /* middle of 16-byte instruction */
if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &key, &val, 0), 0, "bpf_map_update_elem"))
goto cleanup;
if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
goto cleanup;
prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)")) {
close(prog_fd);
goto cleanup;
}
cleanup:
close(map_fd);
}
static void check_incorrect_index(void)
{
check_out_of_bounds_index();
check_mid_insn_index();
}
static int set_bpf_jit_harden(char *level)
{
char old_level;
int err = -1;
int fd = -1;
fd = open("/proc/sys/net/core/bpf_jit_harden", O_RDWR | O_NONBLOCK);
if (fd < 0) {
ASSERT_FAIL("open .../bpf_jit_harden returned %d (errno=%d)", fd, errno);
return -1;
}
err = read(fd, &old_level, 1);
if (err != 1) {
ASSERT_FAIL("read from .../bpf_jit_harden returned %d (errno=%d)", err, errno);
err = -1;
goto end;
}
lseek(fd, 0, SEEK_SET);
err = write(fd, level, 1);
if (err != 1) {
ASSERT_FAIL("write to .../bpf_jit_harden returned %d (errno=%d)", err, errno);
err = -1;
goto end;
}
err = 0;
*level = old_level;
end:
if (fd >= 0)
close(fd);
return err;
}
static void check_blindness(void)
{
struct bpf_insn insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 4),
BPF_MOV64_IMM(BPF_REG_0, 3),
BPF_MOV64_IMM(BPF_REG_0, 2),
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN(),
};
int prog_fd = -1, map_fd;
struct bpf_insn_array_value val = {};
char bpf_jit_harden = '@'; /* non-existing value */
int i;
map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns));
if (!ASSERT_GE(map_fd, 0, "map_create"))
return;
for (i = 0; i < ARRAY_SIZE(insns); i++) {
val.orig_off = i;
if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem"))
goto cleanup;
}
if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
goto cleanup;
bpf_jit_harden = '2';
if (set_bpf_jit_harden(&bpf_jit_harden)) {
bpf_jit_harden = '@'; /* open, read or write failed => no write was done */
goto cleanup;
}
prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)"))
goto cleanup;
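/* With bpf_jit_harden set to 2, each BPF_MOV64_IMM is expected to be
 * blinded into three instructions, so every original instruction lands
 * at xlated offset 3 * i.
 */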
for (i = 0; i < ARRAY_SIZE(insns); i++) {
char fmt[32];
if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem"))
goto cleanup;
snprintf(fmt, sizeof(fmt), "val should be equal to 3*%d", i);
ASSERT_EQ(val.xlated_off, i * 3, fmt);
}
cleanup:
/* restore the old one */
if (bpf_jit_harden != '@')
set_bpf_jit_harden(&bpf_jit_harden);
close(prog_fd);
close(map_fd);
}
/* Once the map has been initialized, it must be frozen before program load */
static void check_load_unfrozen_map(void)
{
struct bpf_insn insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
int prog_fd = -1, map_fd;
struct bpf_insn_array_value val = {};
int i;
map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns));
if (!ASSERT_GE(map_fd, 0, "map_create"))
return;
for (i = 0; i < ARRAY_SIZE(insns); i++) {
val.orig_off = i;
if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem"))
goto cleanup;
}
prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)"))
goto cleanup;
/* correctness: now freeze the map, the program should load fine */
if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
goto cleanup;
prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)"))
goto cleanup;
for (i = 0; i < ARRAY_SIZE(insns); i++) {
if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem"))
goto cleanup;
ASSERT_EQ(val.xlated_off, i, "val should be equal to i");
}
cleanup:
close(prog_fd);
close(map_fd);
}
/* Map can be used only by one BPF program */
static void check_no_map_reuse(void)
{
struct bpf_insn insns[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
};
int prog_fd = -1, map_fd, extra_fd = -1;
struct bpf_insn_array_value val = {};
int i;
map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, ARRAY_SIZE(insns));
if (!ASSERT_GE(map_fd, 0, "map_create"))
return;
for (i = 0; i < ARRAY_SIZE(insns); i++) {
val.orig_off = i;
if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, "bpf_map_update_elem"))
goto cleanup;
}
if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
goto cleanup;
prog_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)"))
goto cleanup;
for (i = 0; i < ARRAY_SIZE(insns); i++) {
if (!ASSERT_EQ(bpf_map_lookup_elem(map_fd, &i, &val), 0, "bpf_map_lookup_elem"))
goto cleanup;
ASSERT_EQ(val.xlated_off, i, "val should be equal to i");
}
extra_fd = prog_load(insns, ARRAY_SIZE(insns), &map_fd, 1);
if (!ASSERT_EQ(extra_fd, -EBUSY, "program should have been rejected (extra_fd != -EBUSY)"))
goto cleanup;
/* correctness: check that prog is still loadable without fd_array */
extra_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0);
if (!ASSERT_GE(extra_fd, 0, "bpf(BPF_PROG_LOAD): expected no error"))
goto cleanup;
cleanup:
close(extra_fd);
close(prog_fd);
close(map_fd);
}
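/* Lookups on an insn_array map from within a BPF program are expected to be
 * rejected, hence the -EINVAL below; a regular array map works fine.
 */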
static void check_bpf_no_lookup(void)
{
struct bpf_insn insns[] = {
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
};
int prog_fd = -1, map_fd;
map_fd = map_create(BPF_MAP_TYPE_INSN_ARRAY, 1);
if (!ASSERT_GE(map_fd, 0, "map_create"))
return;
insns[0].imm = map_fd;
if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze"))
goto cleanup;
prog_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0);
if (!ASSERT_EQ(prog_fd, -EINVAL, "program should have been rejected (prog_fd != -EINVAL)"))
goto cleanup;
/* correctness: check that prog is still loadable with normal map */
close(map_fd);
map_fd = map_create(BPF_MAP_TYPE_ARRAY, 1);
insns[0].imm = map_fd;
prog_fd = prog_load(insns, ARRAY_SIZE(insns), NULL, 0);
if (!ASSERT_GE(prog_fd, 0, "bpf(BPF_PROG_LOAD)"))
goto cleanup;
cleanup:
close(prog_fd);
close(map_fd);
}
static void check_bpf_side(void)
{
check_bpf_no_lookup();
}
static void __test_bpf_insn_array(void)
{
/* Test if offsets are adjusted properly */
if (test__start_subtest("one2one"))
check_one_to_one_mapping();
if (test__start_subtest("simple"))
check_simple();
if (test__start_subtest("deletions"))
check_deletions();
if (test__start_subtest("deletions-with-functions"))
check_deletions_with_functions();
if (test__start_subtest("blindness"))
check_blindness();
/* Check all kinds of operations and related restrictions */
if (test__start_subtest("incorrect-index"))
check_incorrect_index();
if (test__start_subtest("load-unfrozen-map"))
check_load_unfrozen_map();
if (test__start_subtest("no-map-reuse"))
check_no_map_reuse();
if (test__start_subtest("bpf-side-ops"))
check_bpf_side();
}
#else
static void __test_bpf_insn_array(void)
{
test__skip();
}
#endif
void test_bpf_insn_array(void)
{
__test_bpf_insn_array();
}


@@ -7495,6 +7495,71 @@ static struct btf_dedup_test dedup_tests[] = {
BTF_STR_SEC("\0t\0m1\0m2\0tag1\0tag2\0tag3"),
},
},
{
.descr = "dedup: recursive typedef",
/*
* This test simulates a recursive typedef, which in GO is defined as such:
*
* type Foo func() Foo
*
* In BTF terms, this is represented as a TYPEDEF referencing
* a FUNC_PROTO that returns the same TYPEDEF.
*/
.input = {
.raw_types = {
/*
* [1] typedef Foo -> func() Foo
* [2] func_proto() -> Foo
* [3] typedef Foo -> func() Foo
* [4] func_proto() -> Foo
*/
BTF_TYPEDEF_ENC(NAME_NTH(1), 2), /* [1] */
BTF_FUNC_PROTO_ENC(1, 0), /* [2] */
BTF_TYPEDEF_ENC(NAME_NTH(1), 4), /* [3] */
BTF_FUNC_PROTO_ENC(3, 0), /* [4] */
BTF_END_RAW,
},
BTF_STR_SEC("\0Foo"),
},
.expect = {
.raw_types = {
BTF_TYPEDEF_ENC(NAME_NTH(1), 2), /* [1] */
BTF_FUNC_PROTO_ENC(1, 0), /* [2] */
BTF_END_RAW,
},
BTF_STR_SEC("\0Foo"),
},
},
{
.descr = "dedup: typedef",
/*
* // CU 1:
* typedef int foo;
*
* // CU 2:
* typedef int foo;
*/
.input = {
.raw_types = {
/* CU 1 */
BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
BTF_TYPEDEF_ENC(NAME_NTH(1), 1), /* [2] */
/* CU 2 */
BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [3] */
BTF_TYPEDEF_ENC(NAME_NTH(1), 3), /* [4] */
BTF_END_RAW,
},
BTF_STR_SEC("\0foo"),
},
.expect = {
.raw_types = {
BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
BTF_TYPEDEF_ENC(NAME_NTH(1), 1), /* [2] */
BTF_END_RAW,
},
BTF_STR_SEC("\0foo"),
},
},
{
.descr = "dedup: typedef tags",
.input = {


@@ -12,11 +12,45 @@ static void btf_dump_printf(void *ctx, const char *fmt, va_list args)
vfprintf(ctx, fmt, args);
}
/* Write raw BTF to file, return number of bytes written or negative errno */
static ssize_t btf_raw_write(struct btf *btf, char *file)
{
ssize_t written = 0;
const void *data;
__u32 size = 0;
int fd, ret;
fd = mkstemp(file);
if (!ASSERT_GE(fd, 0, "create_file"))
return -errno;
data = btf__raw_data(btf, &size);
if (!ASSERT_OK_PTR(data, "btf__raw_data")) {
close(fd);
return -EINVAL;
}
while (written < size) {
ret = write(fd, data + written, size - written);
if (!ASSERT_GE(ret, 0, "write succeeded")) {
close(fd);
return -errno;
}
written += ret;
}
close(fd);
return written;
}
static void __test_btf_split(bool multi)
{
char multisplit_btf_file[] = "/tmp/test_btf_multisplit.XXXXXX";
char split_btf_file[] = "/tmp/test_btf_split.XXXXXX";
char base_btf_file[] = "/tmp/test_btf_base.XXXXXX";
ssize_t multisplit_btf_sz = 0, split_btf_sz = 0, base_btf_sz = 0;
struct btf_dump *d = NULL;
const struct btf_type *t;
struct btf *btf1, *btf2, *btf3 = NULL;
const struct btf_type *t, *ot;
struct btf *btf1 = NULL, *btf2 = NULL, *btf3 = NULL;
struct btf *btf4 = NULL, *btf5 = NULL, *btf6 = NULL;
int str_off, i, err;
btf1 = btf__new_empty();
@@ -123,6 +157,45 @@ static void __test_btf_split(bool multi)
" int uf2;\n"
"};\n\n", "c_dump");
/* write base, split BTFs to files and ensure parsing succeeds */
base_btf_sz = btf_raw_write(btf1, base_btf_file);
if (base_btf_sz < 0)
goto cleanup;
split_btf_sz = btf_raw_write(btf2, split_btf_file);
if (split_btf_sz < 0)
goto cleanup;
btf4 = btf__parse(base_btf_file, NULL);
if (!ASSERT_OK_PTR(btf4, "parse_base"))
goto cleanup;
btf5 = btf__parse_split(split_btf_file, btf4);
if (!ASSERT_OK_PTR(btf5, "parse_split"))
goto cleanup;
if (multi) {
multisplit_btf_sz = btf_raw_write(btf3, multisplit_btf_file);
if (multisplit_btf_sz < 0)
goto cleanup;
btf6 = btf__parse_split(multisplit_btf_file, btf5);
if (!ASSERT_OK_PTR(btf6, "parse_multisplit"))
goto cleanup;
} else {
btf6 = btf5;
}
if (!ASSERT_EQ(btf__type_cnt(btf3), btf__type_cnt(btf6), "cmp_type_cnt"))
goto cleanup;
/* compare parsed to original BTF */
for (i = 1; i < btf__type_cnt(btf6); i++) {
t = btf__type_by_id(btf6, i);
if (!ASSERT_OK_PTR(t, "type_in_parsed_btf"))
goto cleanup;
ot = btf__type_by_id(btf3, i);
if (!ASSERT_OK_PTR(ot, "type_in_orig_btf"))
goto cleanup;
if (!ASSERT_EQ(memcmp(t, ot, sizeof(*ot)), 0, "cmp_parsed_orig_btf"))
goto cleanup;
}
cleanup:
if (dump_buf_file)
fclose(dump_buf_file);
@@ -132,6 +205,16 @@ cleanup:
btf__free(btf2);
if (btf2 != btf3)
btf__free(btf3);
btf__free(btf4);
btf__free(btf5);
if (btf5 != btf6)
btf__free(btf6);
if (base_btf_sz > 0)
unlink(base_btf_file);
if (split_btf_sz > 0)
unlink(split_btf_file);
if (multisplit_btf_sz > 0)
unlink(multisplit_btf_file);
}
void test_btf_split(void)


@@ -153,6 +153,26 @@ static void test_check_mtu_run_tc(struct test_check_mtu *skel,
ASSERT_EQ(mtu_result, mtu_expect, "MTU-compare-user");
}
static void test_chk_segs_flag(struct test_check_mtu *skel, __u32 mtu)
{
int err, prog_fd = bpf_program__fd(skel->progs.tc_chk_segs_flag);
struct __sk_buff skb = {
.gso_size = 10,
};
LIBBPF_OPTS(bpf_test_run_opts, topts,
.data_in = &pkt_v4,
.data_size_in = sizeof(pkt_v4),
.ctx_in = &skb,
.ctx_size_in = sizeof(skb),
);
/* Lower the MTU to exercise the BPF_MTU_CHK_SEGS flag */
SYS_NOFAIL("ip link set dev lo mtu 10");
err = bpf_prog_test_run_opts(prog_fd, &topts);
SYS_NOFAIL("ip link set dev lo mtu %u", mtu);
ASSERT_OK(err, "test_run");
ASSERT_EQ(topts.retval, BPF_OK, "retval");
}
static void test_check_mtu_tc(__u32 mtu, __u32 ifindex)
{
@@ -177,11 +197,12 @@ static void test_check_mtu_tc(__u32 mtu, __u32 ifindex)
test_check_mtu_run_tc(skel, skel->progs.tc_minus_delta, mtu);
test_check_mtu_run_tc(skel, skel->progs.tc_input_len, mtu);
test_check_mtu_run_tc(skel, skel->progs.tc_input_len_exceed, mtu);
test_chk_segs_flag(skel, mtu);
cleanup:
test_check_mtu__destroy(skel);
}
void serial_test_check_mtu(void)
void test_ns_check_mtu(void)
{
int mtu_lo;


@@ -22,79 +22,37 @@
static int duration = 0;
struct addr_port {
in_port_t port;
union {
struct in_addr in_addr;
struct in6_addr in6_addr;
};
};
struct tuple {
int family;
struct addr_port src;
struct addr_port dst;
};
static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap)
{
const struct sockaddr_in6 *in6;
const struct sockaddr_in *in;
switch (sa->sa_family) {
case AF_INET:
in = (const struct sockaddr_in *)sa;
ap->in_addr = in->sin_addr;
ap->port = in->sin_port;
return true;
case AF_INET6:
in6 = (const struct sockaddr_in6 *)sa;
ap->in6_addr = in6->sin6_addr;
ap->port = in6->sin6_port;
return true;
default:
return false;
}
}
static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type,
int *server, int *conn, struct tuple *tuple)
static bool set_up_conn(const struct sockaddr_storage *addr, socklen_t len, int type,
int *server, int *conn,
struct sockaddr_storage *src,
struct sockaddr_storage *dst)
{
struct sockaddr_storage ss;
socklen_t slen = sizeof(ss);
struct sockaddr *sa = (struct sockaddr *)&ss;
*server = start_server_addr(type, (struct sockaddr_storage *)addr, len, NULL);
*server = start_server_addr(type, addr, len, NULL);
if (*server < 0)
return false;
if (CHECK_FAIL(getsockname(*server, sa, &slen)))
if (CHECK_FAIL(getsockname(*server, (struct sockaddr *)&ss, &slen)))
goto close_server;
*conn = connect_to_addr(type, (struct sockaddr_storage *)sa, slen, NULL);
*conn = connect_to_addr(type, &ss, slen, NULL);
if (*conn < 0)
goto close_server;
/* We want to simulate packets arriving at conn, so we have to
* swap src and dst.
*/
slen = sizeof(ss);
if (CHECK_FAIL(getsockname(*conn, sa, &slen)))
slen = sizeof(*dst);
if (CHECK_FAIL(getsockname(*conn, (struct sockaddr *)dst, &slen)))
goto close_conn;
if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst)))
slen = sizeof(*src);
if (CHECK_FAIL(getpeername(*conn, (struct sockaddr *)src, &slen)))
goto close_conn;
slen = sizeof(ss);
if (CHECK_FAIL(getpeername(*conn, sa, &slen)))
goto close_conn;
if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src)))
goto close_conn;
tuple->family = ss.ss_family;
return true;
close_conn:
@@ -110,17 +68,16 @@ static socklen_t prepare_addr(struct sockaddr_storage *addr, int family)
{
struct sockaddr_in *addr4;
struct sockaddr_in6 *addr6;
memset(addr, 0, sizeof(*addr));
switch (family) {
case AF_INET:
addr4 = (struct sockaddr_in *)addr;
memset(addr4, 0, sizeof(*addr4));
addr4->sin_family = family;
addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
return sizeof(*addr4);
case AF_INET6:
addr6 = (struct sockaddr_in6 *)addr;
memset(addr6, 0, sizeof(*addr6));
addr6->sin6_family = family;
addr6->sin6_addr = in6addr_loopback;
return sizeof(*addr6);
@@ -242,9 +199,15 @@ static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto)
}
static size_t build_input(const struct test_cfg *test, void *const buf,
const struct tuple *tuple)
const struct sockaddr_storage *src,
const struct sockaddr_storage *dst)
{
in_port_t sport = tuple->src.port;
struct sockaddr_in6 *src_in6 = (struct sockaddr_in6 *)src;
struct sockaddr_in6 *dst_in6 = (struct sockaddr_in6 *)dst;
struct sockaddr_in *src_in = (struct sockaddr_in *)src;
struct sockaddr_in *dst_in = (struct sockaddr_in *)dst;
sa_family_t family = src->ss_family;
in_port_t sport, dport;
encap_headers_t encap;
struct iphdr ip;
struct ipv6hdr ipv6;
@@ -254,8 +217,11 @@ static size_t build_input(const struct test_cfg *test, void *const buf,
uint8_t *p = buf;
int proto;
sport = (family == AF_INET) ? src_in->sin_port : src_in6->sin6_port;
dport = (family == AF_INET) ? dst_in->sin_port : dst_in6->sin6_port;
proto = IPPROTO_IPIP;
if (tuple->family == AF_INET6)
if (family == AF_INET6)
proto = IPPROTO_IPV6;
encap_init(&encap, test->hops == ONE_HOP ? 1 : 0, proto);
@@ -270,15 +236,15 @@ static size_t build_input(const struct test_cfg *test, void *const buf,
if (test->type == UDP)
proto = IPPROTO_UDP;
switch (tuple->family) {
switch (family) {
case AF_INET:
ip = (struct iphdr){
.ihl = 5,
.version = 4,
.ttl = IPDEFTTL,
.protocol = proto,
.saddr = tuple->src.in_addr.s_addr,
.daddr = tuple->dst.in_addr.s_addr,
.saddr = src_in->sin_addr.s_addr,
.daddr = dst_in->sin_addr.s_addr,
};
p = mempcpy(p, &ip, sizeof(ip));
break;
@@ -287,8 +253,8 @@ static size_t build_input(const struct test_cfg *test, void *const buf,
.version = 6,
.hop_limit = IPDEFTTL,
.nexthdr = proto,
.saddr = tuple->src.in6_addr,
.daddr = tuple->dst.in6_addr,
.saddr = src_in6->sin6_addr,
.daddr = dst_in6->sin6_addr,
};
p = mempcpy(p, &ipv6, sizeof(ipv6));
break;
@@ -303,18 +269,16 @@ static size_t build_input(const struct test_cfg *test, void *const buf,
case TCP:
tcp = (struct tcphdr){
.source = sport,
.dest = tuple->dst.port,
.dest = dport,
.syn = (test->flags == SYN),
.ack = (test->flags == ACK),
};
if (test->flags == SYN)
tcp.syn = true;
if (test->flags == ACK)
tcp.ack = true;
p = mempcpy(p, &tcp, sizeof(tcp));
break;
case UDP:
udp = (struct udphdr){
.source = sport,
.dest = tuple->dst.port,
.dest = dport,
};
p = mempcpy(p, &udp, sizeof(udp));
break;
@@ -339,27 +303,26 @@ static void test_cls_redirect_common(struct bpf_program *prog)
LIBBPF_OPTS(bpf_test_run_opts, tattr);
int families[] = { AF_INET, AF_INET6 };
struct sockaddr_storage ss;
struct sockaddr *addr;
socklen_t slen;
int i, j, err, prog_fd;
int servers[__NR_KIND][ARRAY_SIZE(families)] = {};
int conns[__NR_KIND][ARRAY_SIZE(families)] = {};
struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)];
struct sockaddr_storage srcs[__NR_KIND][ARRAY_SIZE(families)];
struct sockaddr_storage dsts[__NR_KIND][ARRAY_SIZE(families)];
addr = (struct sockaddr *)&ss;
for (i = 0; i < ARRAY_SIZE(families); i++) {
slen = prepare_addr(&ss, families[i]);
if (CHECK_FAIL(!slen))
goto cleanup;
if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM,
if (CHECK_FAIL(!set_up_conn(&ss, slen, SOCK_DGRAM,
&servers[UDP][i], &conns[UDP][i],
&tuples[UDP][i])))
&srcs[UDP][i], &dsts[UDP][i])))
goto cleanup;
if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM,
if (CHECK_FAIL(!set_up_conn(&ss, slen, SOCK_STREAM,
&servers[TCP][i], &conns[TCP][i],
&tuples[TCP][i])))
&srcs[TCP][i], &dsts[TCP][i])))
goto cleanup;
}
@@ -368,11 +331,12 @@ static void test_cls_redirect_common(struct bpf_program *prog)
struct test_cfg *test = &tests[i];
for (j = 0; j < ARRAY_SIZE(families); j++) {
struct tuple *tuple = &tuples[test->type][j];
struct sockaddr_storage *src = &srcs[test->type][j];
struct sockaddr_storage *dst = &dsts[test->type][j];
char input[256];
char tmp[256];
test_str(tmp, sizeof(tmp), test, tuple->family);
test_str(tmp, sizeof(tmp), test, families[j]);
if (!test__start_subtest(tmp))
continue;
@@ -380,7 +344,7 @@ static void test_cls_redirect_common(struct bpf_program *prog)
tattr.data_size_out = sizeof(tmp);
tattr.data_in = input;
tattr.data_size_in = build_input(test, input, tuple);
tattr.data_size_in = build_input(test, input, src, dst);
if (CHECK_FAIL(!tattr.data_size_in))
continue;


@@ -0,0 +1,117 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#include <test_progs.h>
#include <network_helpers.h>
#include "file_reader.skel.h"
#include "file_reader_fail.skel.h"
#include <dlfcn.h>
#include <sys/mman.h>
const char *user_ptr = "hello world";
char file_contents[256000];
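/* Resolve the load address of the running executable via dladdr() so that its
 * mapped pages can later be paged out with madvise(MADV_PAGEOUT).
 */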
void *get_executable_base_addr(void)
{
Dl_info info;
if (!dladdr((void *)&get_executable_base_addr, &info)) {
fprintf(stderr, "dladdr failed\n");
return NULL;
}
return info.dli_fbase;
}
static int initialize_file_contents(void)
{
int fd, page_sz = sysconf(_SC_PAGESIZE);
ssize_t n = 0, cur, off;
void *addr;
fd = open("/proc/self/exe", O_RDONLY);
if (!ASSERT_OK_FD(fd, "Open /proc/self/exe\n"))
return 1;
do {
cur = read(fd, file_contents + n, sizeof(file_contents) - n);
if (!ASSERT_GT(cur, 0, "read success"))
break;
n += cur;
} while (n < sizeof(file_contents));
close(fd);
if (!ASSERT_EQ(n, sizeof(file_contents), "Read /proc/self/exe\n"))
return 1;
addr = get_executable_base_addr();
if (!ASSERT_NEQ(addr, NULL, "get executable address"))
return 1;
/* page-align base file address */
addr = (void *)((unsigned long)addr & ~(page_sz - 1));
/*
* Page out range 0..512K, use 0..256K for positive tests and
* 256K..512K for negative tests expecting page faults
*/
for (off = 0; off < sizeof(file_contents) * 2; off += page_sz) {
if (!ASSERT_OK(madvise(addr + off, page_sz, MADV_PAGEOUT),
"madvise pageout"))
return errno;
}
return 0;
}
static void run_test(const char *prog_name)
{
struct file_reader *skel;
struct bpf_program *prog;
int err, fd;
err = initialize_file_contents();
if (!ASSERT_OK(err, "initialize file contents"))
return;
skel = file_reader__open();
if (!ASSERT_OK_PTR(skel, "file_reader__open"))
return;
bpf_object__for_each_program(prog, skel->obj) {
bpf_program__set_autoload(prog, strcmp(bpf_program__name(prog), prog_name) == 0);
}
memcpy(skel->bss->user_buf, file_contents, sizeof(file_contents));
skel->bss->pid = getpid();
err = file_reader__load(skel);
if (!ASSERT_OK(err, "file_reader__load"))
goto cleanup;
err = file_reader__attach(skel);
if (!ASSERT_OK(err, "file_reader__attach"))
goto cleanup;
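/* Opening /proc/self/exe triggers the attached program, which is expected to
 * read the file contents and report results via err/run_success.
 */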
fd = open("/proc/self/exe", O_RDONLY);
if (fd >= 0)
close(fd);
ASSERT_EQ(skel->bss->err, 0, "err");
ASSERT_EQ(skel->bss->run_success, 1, "run_success");
cleanup:
file_reader__destroy(skel);
}
void test_file_reader(void)
{
if (test__start_subtest("on_open_expect_fault"))
run_test("on_open_expect_fault");
if (test__start_subtest("on_open_validate_file_read"))
run_test("on_open_validate_file_read");
if (test__start_subtest("negative"))
RUN_TESTS(file_reader_fail);
}


@@ -15,17 +15,17 @@ struct htab_update_ctx {
static void test_reenter_update(void)
{
struct htab_update *skel;
unsigned int key, value;
void *value = NULL;
unsigned int key, value_size;
int err;
skel = htab_update__open();
if (!ASSERT_OK_PTR(skel, "htab_update__open"))
return;
/* lookup_elem_raw() may be inlined and find_kernel_btf_id() will return -ESRCH */
bpf_program__set_autoload(skel->progs.lookup_elem_raw, true);
bpf_program__set_autoload(skel->progs.bpf_obj_free_fields, true);
err = htab_update__load(skel);
if (!ASSERT_TRUE(!err || err == -ESRCH, "htab_update__load") || err)
if (!ASSERT_TRUE(!err, "htab_update__load") || err)
goto out;
skel->bss->pid = getpid();
@@ -33,14 +33,33 @@ static void test_reenter_update(void)
if (!ASSERT_OK(err, "htab_update__attach"))
goto out;
/* Will trigger the reentrancy of bpf_map_update_elem() */
value_size = bpf_map__value_size(skel->maps.htab);
value = calloc(1, value_size);
if (!ASSERT_OK_PTR(value, "calloc value"))
goto out;
/*
* First update: plain insert. This should NOT trigger the re-entrancy
* path, because there is no old element to free yet.
*/
key = 0;
value = 0;
err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, &value, 0);
if (!ASSERT_OK(err, "add element"))
err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, value, BPF_ANY);
if (!ASSERT_OK(err, "first update (insert)"))
goto out;
ASSERT_EQ(skel->bss->update_err, -EBUSY, "no reentrancy");
/*
* Second update: replace existing element with same key and trigger
* the reentrancy of bpf_map_update_elem().
* check_and_free_fields() calls bpf_obj_free_fields() on the old
* value, which is where fentry program runs and performs a nested
* bpf_map_update_elem(), triggering -EDEADLK.
*/
memset(value, 0, value_size);
err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, value, BPF_ANY);
if (!ASSERT_OK(err, "second update (replace)"))
goto out;
ASSERT_EQ(skel->bss->update_err, -EDEADLK, "no reentrancy");
out:
htab_update__destroy(skel);
}


@@ -57,7 +57,8 @@ static void subtest_kmem_cache_iter_check_slabinfo(struct kmem_cache_iter *skel)
if (!ASSERT_OK(ret, "kmem_cache_lookup"))
break;
ASSERT_STREQ(r.name, name, "kmem_cache_name");
ASSERT_STRNEQ(r.name, name, sizeof(r.name) - 1,
"kmem_cache_name");
ASSERT_EQ(r.obj_size, objsize, "kmem_cache_objsize");
seen++;


@@ -15,6 +15,10 @@ static void check_good_sample(struct test_perf_branches *skel)
int pbe_size = sizeof(struct perf_branch_entry);
int duration = 0;
if (CHECK(!skel->bss->run_cnt, "invalid run_cnt",
"checked sample validity before prog run"))
return;
if (CHECK(!skel->bss->valid, "output not valid",
"no valid sample from prog"))
return;
@@ -45,6 +49,10 @@ static void check_bad_sample(struct test_perf_branches *skel)
int written_stack = skel->bss->written_stack_out;
int duration = 0;
if (CHECK(!skel->bss->run_cnt, "invalid run_cnt",
"checked sample validity before prog run"))
return;
if (CHECK(!skel->bss->valid, "output not valid",
"no valid sample from prog"))
return;
@@ -83,8 +91,12 @@ static void test_perf_branches_common(int perf_fd,
err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
if (CHECK(err, "set_affinity", "cpu #0, err %d\n", err))
goto out_destroy;
/* spin the loop for a while (random high number) */
for (i = 0; i < 1000000; ++i)
/* Spin the loop for a while, using a high iteration count, and stop early
 * once the run count marker has been incremented at least once by the
 * backing perf_event BPF program.
 */
for (i = 0; i < 100000000 && !*(volatile int *)&skel->bss->run_cnt; ++i)
++j;
test_perf_branches__detach(skel);
@@ -116,11 +128,11 @@ static void test_perf_branches_hw(void)
pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
/*
* Some setups don't support branch records (virtual machines, !x86),
* so skip test in this case.
* Some setups don't support LBR (virtual machines, !x86, AMD Milan / Zen 3
* which only supports BRS), so skip the test in this case.
*/
if (pfd < 0) {
if (errno == ENOENT || errno == EOPNOTSUPP) {
if (errno == ENOENT || errno == EOPNOTSUPP || errno == EINVAL) {
printf("%s:SKIP:no PERF_SAMPLE_BRANCH_STACK\n",
__func__);
test__skip();


@@ -28,6 +28,7 @@ static void test_success(void)
bpf_program__set_autoload(skel->progs.two_regions, true);
bpf_program__set_autoload(skel->progs.non_sleepable_1, true);
bpf_program__set_autoload(skel->progs.non_sleepable_2, true);
bpf_program__set_autoload(skel->progs.nested_rcu_region, true);
bpf_program__set_autoload(skel->progs.task_trusted_non_rcuptr, true);
bpf_program__set_autoload(skel->progs.rcu_read_lock_subprog, true);
bpf_program__set_autoload(skel->progs.rcu_read_lock_global_subprog, true);
@@ -78,7 +79,8 @@ static const char * const inproper_region_tests[] = {
"non_sleepable_rcu_mismatch",
"inproper_sleepable_helper",
"inproper_sleepable_kfunc",
"nested_rcu_region",
"nested_rcu_region_unbalanced_1",
"nested_rcu_region_unbalanced_2",
"rcu_read_lock_global_subprog_lock",
"rcu_read_lock_global_subprog_unlock",
"rcu_read_lock_sleepable_helper_global_subprog",


@@ -44,3 +44,59 @@ void test_refcounted_kptr_wrong_owner(void)
ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_a2 retval");
refcounted_kptr__destroy(skel);
}
void test_percpu_hash_refcounted_kptr_refcount_leak(void)
{
struct refcounted_kptr *skel;
int cpu_nr, fd, err, key = 0;
struct bpf_map *map;
size_t values_sz;
u64 *values;
LIBBPF_OPTS(bpf_test_run_opts, opts,
.data_in = &pkt_v4,
.data_size_in = sizeof(pkt_v4),
.repeat = 1,
);
cpu_nr = libbpf_num_possible_cpus();
if (!ASSERT_GT(cpu_nr, 0, "libbpf_num_possible_cpus"))
return;
values = calloc(cpu_nr, sizeof(u64));
if (!ASSERT_OK_PTR(values, "calloc values"))
return;
skel = refcounted_kptr__open_and_load();
if (!ASSERT_OK_PTR(skel, "refcounted_kptr__open_and_load")) {
free(values);
return;
}
values_sz = cpu_nr * sizeof(u64);
memset(values, 0, values_sz);
map = skel->maps.percpu_hash;
err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0);
if (!ASSERT_OK(err, "bpf_map__update_elem"))
goto out;
fd = bpf_program__fd(skel->progs.percpu_hash_refcount_leak);
err = bpf_prog_test_run_opts(fd, &opts);
if (!ASSERT_OK(err, "bpf_prog_test_run_opts"))
goto out;
if (!ASSERT_EQ(opts.retval, 2, "opts.retval"))
goto out;
err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0);
if (!ASSERT_OK(err, "bpf_map__update_elem"))
goto out;
fd = bpf_program__fd(skel->progs.check_percpu_hash_refcount);
err = bpf_prog_test_run_opts(fd, &opts);
ASSERT_OK(err, "bpf_prog_test_run_opts");
ASSERT_EQ(opts.retval, 1, "opts.retval");
out:
refcounted_kptr__destroy(skel);
free(values);
}


@@ -110,8 +110,8 @@ void serial_test_res_spin_lock_stress(void)
ASSERT_OK(load_module("bpf_test_rqspinlock.ko", false), "load module AA");
sleep(5);
unload_module("bpf_test_rqspinlock", false);
ASSERT_OK(load_module_params("bpf_test_rqspinlock.ko", "test_ab=1", false), "load module ABBA");
sleep(5);
unload_module("bpf_test_rqspinlock", false);
/*
* Insert bpf_test_rqspinlock.ko manually with test_mode=[1|2] to test
* other cases (ABBA, ABBCCA).
*/
}


@@ -17,6 +17,7 @@
#include "test_ringbuf_n.lskel.h"
#include "test_ringbuf_map_key.lskel.h"
#include "test_ringbuf_write.lskel.h"
#include "test_ringbuf_overwrite.lskel.h"
#define EDONE 7777
@@ -497,6 +498,68 @@ cleanup:
test_ringbuf_map_key_lskel__destroy(skel_map_key);
}
static void ringbuf_overwrite_mode_subtest(void)
{
unsigned long size, len1, len2, len3, len4, len5;
unsigned long expect_avail_data, expect_prod_pos, expect_over_pos;
struct test_ringbuf_overwrite_lskel *skel;
int page_size = getpagesize();
int err;
skel = test_ringbuf_overwrite_lskel__open();
if (!ASSERT_OK_PTR(skel, "skel_open"))
return;
size = page_size;
len1 = page_size / 2;
len2 = page_size / 4;
len3 = size - len1 - len2 - BPF_RINGBUF_HDR_SZ * 3;
len4 = len3 - 8;
len5 = len3; /* retry with len3 */
skel->maps.ringbuf.max_entries = size;
skel->rodata->LEN1 = len1;
skel->rodata->LEN2 = len2;
skel->rodata->LEN3 = len3;
skel->rodata->LEN4 = len4;
skel->rodata->LEN5 = len5;
skel->bss->pid = getpid();
err = test_ringbuf_overwrite_lskel__load(skel);
if (!ASSERT_OK(err, "skel_load"))
goto cleanup;
err = test_ringbuf_overwrite_lskel__attach(skel);
if (!ASSERT_OK(err, "skel_attach"))
goto cleanup;
syscall(__NR_getpgid);
ASSERT_EQ(skel->bss->reserve1_fail, 0, "reserve 1");
ASSERT_EQ(skel->bss->reserve2_fail, 0, "reserve 2");
ASSERT_EQ(skel->bss->reserve3_fail, 1, "reserve 3");
ASSERT_EQ(skel->bss->reserve4_fail, 0, "reserve 4");
ASSERT_EQ(skel->bss->reserve5_fail, 0, "reserve 5");
ASSERT_EQ(skel->bss->ring_size, size, "check_ring_size");
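/* In overwrite mode the oldest record (len1) is expected to have been
 * overwritten, so over_pos points just past it and the available data
 * covers the len2, len4 and len5 records plus their three headers.
 */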
expect_avail_data = len2 + len4 + len5 + 3 * BPF_RINGBUF_HDR_SZ;
ASSERT_EQ(skel->bss->avail_data, expect_avail_data, "check_avail_size");
ASSERT_EQ(skel->bss->cons_pos, 0, "check_cons_pos");
expect_prod_pos = len1 + len2 + len4 + len5 + 4 * BPF_RINGBUF_HDR_SZ;
ASSERT_EQ(skel->bss->prod_pos, expect_prod_pos, "check_prod_pos");
expect_over_pos = len1 + BPF_RINGBUF_HDR_SZ;
ASSERT_EQ(skel->bss->over_pos, expect_over_pos, "check_over_pos");
test_ringbuf_overwrite_lskel__detach(skel);
cleanup:
test_ringbuf_overwrite_lskel__destroy(skel);
}
void test_ringbuf(void)
{
if (test__start_subtest("ringbuf"))
@@ -507,4 +570,6 @@ void test_ringbuf(void)
ringbuf_map_key_subtest();
if (test__start_subtest("ringbuf_write"))
ringbuf_write_subtest();
if (test__start_subtest("ringbuf_overwrite_mode"))
ringbuf_overwrite_mode_subtest();
}


@@ -41,11 +41,7 @@ static struct bpf_object *obj;
static __u32 index_zero;
static int epfd;
static union sa46 {
struct sockaddr_in6 v6;
struct sockaddr_in v4;
sa_family_t family;
} srv_sa;
static struct sockaddr_storage srv_sa;
#define RET_IF(condition, tag, format...) ({ \
if (CHECK_FAIL(condition)) { \
@@ -135,24 +131,24 @@ static int prepare_bpf_obj(void)
return 0;
}
static void sa46_init_loopback(union sa46 *sa, sa_family_t family)
static void ss_init_loopback(struct sockaddr_storage *sa, sa_family_t family)
{
memset(sa, 0, sizeof(*sa));
sa->family = family;
if (sa->family == AF_INET6)
sa->v6.sin6_addr = in6addr_loopback;
sa->ss_family = family;
if (sa->ss_family == AF_INET6)
((struct sockaddr_in6 *)sa)->sin6_addr = in6addr_loopback;
else
sa->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
((struct sockaddr_in *)sa)->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
}
static void sa46_init_inany(union sa46 *sa, sa_family_t family)
static void ss_init_inany(struct sockaddr_storage *sa, sa_family_t family)
{
memset(sa, 0, sizeof(*sa));
sa->family = family;
if (sa->family == AF_INET6)
sa->v6.sin6_addr = in6addr_any;
sa->ss_family = family;
if (sa->ss_family == AF_INET6)
((struct sockaddr_in6 *)sa)->sin6_addr = in6addr_any;
else
sa->v4.sin_addr.s_addr = INADDR_ANY;
((struct sockaddr_in *)sa)->sin_addr.s_addr = INADDR_ANY;
}
static int read_int_sysctl(const char *sysctl)
@@ -228,7 +224,7 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd,
int cli_fd)
{
struct data_check expected = {}, result;
union sa46 cli_sa;
struct sockaddr_storage cli_sa;
socklen_t addrlen;
int err;
@@ -251,26 +247,32 @@ static void check_data(int type, sa_family_t family, const struct cmd *cmd,
}
if (family == AF_INET6) {
expected.eth_protocol = htons(ETH_P_IPV6);
expected.bind_inany = !srv_sa.v6.sin6_addr.s6_addr32[3] &&
!srv_sa.v6.sin6_addr.s6_addr32[2] &&
!srv_sa.v6.sin6_addr.s6_addr32[1] &&
!srv_sa.v6.sin6_addr.s6_addr32[0];
struct sockaddr_in6 *srv_v6 = (struct sockaddr_in6 *)&srv_sa;
struct sockaddr_in6 *cli_v6 = (struct sockaddr_in6 *)&cli_sa;
memcpy(&expected.skb_addrs[0], cli_sa.v6.sin6_addr.s6_addr32,
sizeof(cli_sa.v6.sin6_addr));
expected.eth_protocol = htons(ETH_P_IPV6);
expected.bind_inany = !srv_v6->sin6_addr.s6_addr32[3] &&
!srv_v6->sin6_addr.s6_addr32[2] &&
!srv_v6->sin6_addr.s6_addr32[1] &&
!srv_v6->sin6_addr.s6_addr32[0];
memcpy(&expected.skb_addrs[0], cli_v6->sin6_addr.s6_addr32,
sizeof(cli_v6->sin6_addr));
memcpy(&expected.skb_addrs[4], &in6addr_loopback,
sizeof(in6addr_loopback));
expected.skb_ports[0] = cli_sa.v6.sin6_port;
expected.skb_ports[1] = srv_sa.v6.sin6_port;
expected.skb_ports[0] = cli_v6->sin6_port;
expected.skb_ports[1] = srv_v6->sin6_port;
} else {
expected.eth_protocol = htons(ETH_P_IP);
expected.bind_inany = !srv_sa.v4.sin_addr.s_addr;
struct sockaddr_in *srv_v4 = (struct sockaddr_in *)&srv_sa;
struct sockaddr_in *cli_v4 = (struct sockaddr_in *)&cli_sa;
expected.skb_addrs[0] = cli_sa.v4.sin_addr.s_addr;
expected.eth_protocol = htons(ETH_P_IP);
expected.bind_inany = !srv_v4->sin_addr.s_addr;
expected.skb_addrs[0] = cli_v4->sin_addr.s_addr;
expected.skb_addrs[1] = htonl(INADDR_LOOPBACK);
expected.skb_ports[0] = cli_sa.v4.sin_port;
expected.skb_ports[1] = srv_sa.v4.sin_port;
expected.skb_ports[0] = cli_v4->sin_port;
expected.skb_ports[1] = srv_v4->sin_port;
}
if (memcmp(&result, &expected, offsetof(struct data_check,
@@ -364,16 +366,15 @@ static void check_results(void)
static int send_data(int type, sa_family_t family, void *data, size_t len,
enum result expected)
{
union sa46 cli_sa;
struct sockaddr_storage cli_sa;
int fd, err;
fd = socket(family, type, 0);
RET_ERR(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno);
sa46_init_loopback(&cli_sa, family);
ss_init_loopback(&cli_sa, family);
err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa));
RET_ERR(fd == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno);
err = sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa,
sizeof(srv_sa));
RET_ERR(err != len && expected >= PASS,
@@ -589,9 +590,9 @@ static void prepare_sk_fds(int type, sa_family_t family, bool inany)
socklen_t addrlen;
if (inany)
sa46_init_inany(&srv_sa, family);
ss_init_inany(&srv_sa, family);
else
sa46_init_loopback(&srv_sa, family);
ss_init_loopback(&srv_sa, family);
addrlen = sizeof(srv_sa);
/*


@@ -206,6 +206,11 @@ destroy_skel:
skel_open_load_failure:
close(pipe_c2p[0]);
close(pipe_p2c[1]);
/*
* Child is either about to exit cleanly or stuck in case of errors.
* Nudge it to exit.
*/
kill(pid, SIGKILL);
wait(NULL);
}


@@ -20,7 +20,9 @@ static const char * const test_cases[] = {
"strcspn_str",
"strcspn_reject",
"strstr",
"strcasestr",
"strnstr",
"strncasestr",
};
void run_too_long_tests(void)


@@ -139,7 +139,7 @@ static void test_lsm_tailcall(void)
if (CHECK_FAIL(!err))
goto close_prog;
prog_fd = bpf_program__fd(skel->progs.lsm_file_alloc_security_prog);
prog_fd = bpf_program__fd(skel->progs.lsm_kernfs_init_security_prog);
if (CHECK_FAIL(prog_fd < 0))
goto close_prog;


@@ -0,0 +1,145 @@
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
* BPF-based flow shaping
*
* The test brings up a veth pair across two isolated namespaces, attaches a
* flow shaping program to one end, and ensures that the maximum rate measured
* by a manual speed test matches the rate set in the BPF shaper.
*/
#include <asm-generic/socket.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <math.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <bpf/libbpf.h>
#include <pthread.h>
#include "test_progs.h"
#include "network_helpers.h"
#include "test_tc_edt.skel.h"
#define SERVER_NS "tc-edt-server-ns"
#define CLIENT_NS "tc-edt-client-ns"
#define IP4_ADDR_VETH1 "192.168.1.1"
#define IP4_ADDR_VETH2 "192.168.1.2"
#define IP4_ADDR_VETH2_HEX 0xC0A80102
#define TIMEOUT_MS 2000
#define TEST_PORT 9000
#define TARGET_RATE_MBPS 5.0
#define TX_BYTES_COUNT (1 * 1000 * 1000)
#define RATE_ERROR_PERCENT 2.0
struct connection {
int server_listen_fd;
int server_conn_fd;
int client_conn_fd;
};
static int setup(struct test_tc_edt *skel)
{
struct nstoken *nstoken_client, *nstoken_server;
int ret;
if (!ASSERT_OK(make_netns(CLIENT_NS), "create client ns"))
goto fail;
if (!ASSERT_OK(make_netns(SERVER_NS), "create server ns"))
goto fail_delete_client_ns;
nstoken_client = open_netns(CLIENT_NS);
if (!ASSERT_OK_PTR(nstoken_client, "open client ns"))
goto fail_delete_server_ns;
SYS(fail_close_client_ns, "ip link add veth1 type veth peer name %s",
"veth2 netns " SERVER_NS);
SYS(fail_close_client_ns, "ip -4 addr add " IP4_ADDR_VETH1 "/24 dev veth1");
SYS(fail_close_client_ns, "ip link set veth1 up");
nstoken_server = open_netns(SERVER_NS);
if (!ASSERT_OK_PTR(nstoken_server, "enter server ns"))
goto fail_close_client_ns;
SYS(fail_close_server_ns, "ip -4 addr add " IP4_ADDR_VETH2 "/24 dev veth2");
SYS(fail_close_server_ns, "ip link set veth2 up");
SYS(fail_close_server_ns, "tc qdisc add dev veth2 root fq");
ret = tc_prog_attach("veth2", -1, bpf_program__fd(skel->progs.tc_prog));
if (!ASSERT_OK(ret, "attach bpf prog"))
goto fail_close_server_ns;
skel->bss->target_rate = TARGET_RATE_MBPS * 1000 * 1000;
close_netns(nstoken_server);
close_netns(nstoken_client);
return 0;
fail_close_server_ns:
close_netns(nstoken_server);
fail_close_client_ns:
close_netns(nstoken_client);
fail_delete_server_ns:
remove_netns(SERVER_NS);
fail_delete_client_ns:
remove_netns(CLIENT_NS);
fail:
return -1;
}
static void cleanup(void)
{
remove_netns(CLIENT_NS);
remove_netns(SERVER_NS);
}
static void run_test(void)
{
int server_fd, client_fd, err;
double rate_mbps, rate_error;
struct nstoken *nstoken;
__u64 ts_start, ts_end;
nstoken = open_netns(SERVER_NS);
if (!ASSERT_OK_PTR(nstoken, "open server ns"))
return;
server_fd = start_server(AF_INET, SOCK_STREAM, IP4_ADDR_VETH2,
TEST_PORT, TIMEOUT_MS);
if (!ASSERT_OK_FD(server_fd, "start server"))
return;
close_netns(nstoken);
nstoken = open_netns(CLIENT_NS);
if (!ASSERT_OK_PTR(nstoken, "open client ns"))
return;
client_fd = connect_to_fd(server_fd, 0);
if (!ASSERT_OK_FD(client_fd, "connect client"))
return;
ts_start = get_time_ns();
err = send_recv_data(server_fd, client_fd, TX_BYTES_COUNT);
ts_end = get_time_ns();
close_netns(nstoken);
ASSERT_OK(err, "send_recv_data");
rate_mbps = TX_BYTES_COUNT / ((ts_end - ts_start) / 1000.0);
rate_error =
fabs((rate_mbps - TARGET_RATE_MBPS) * 100.0 / TARGET_RATE_MBPS);
ASSERT_LE(rate_error, RATE_ERROR_PERCENT,
"rate error is lower than threshold");
}
void test_tc_edt(void)
{
struct test_tc_edt *skel;
skel = test_tc_edt__open_and_load();
if (!ASSERT_OK_PTR(skel, "skel open and load"))
return;
if (!ASSERT_OK(setup(skel), "global setup"))
return;
run_test();
cleanup();
test_tc_edt__destroy(skel);
}

Some files were not shown because too many files have changed in this diff.