From 34f6e38f58db9a94718c273edc9ca3fc8b4dba5f Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 18 Aug 2023 18:01:11 +0900 Subject: [PATCH 1/9] samples/bpf: fix warning with ignored-attributes Currently, compiling the BPF programs results in the ignored-attribute warning shown below. This commit fixes the warning by adding the -fcf-protection option. In file included from ./arch/x86/include/asm/linkage.h:6: ./arch/x86/include/asm/ibt.h:77:8: warning: 'nocf_check' attribute ignored; use -fcf-protection to enable the attribute [-Wignored-attributes] extern __noendbr u64 ibt_save(bool disable); ^ ./arch/x86/include/asm/ibt.h:32:34: note: expanded from macro '__noendbr' ^ Signed-off-by: Daniel T. Lee Link: https://lore.kernel.org/r/20230818090119.477441-2-danieltimlee@gmail.com Signed-off-by: Alexei Starovoitov --- samples/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 595b98d825ce..b32cb8a62335 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -440,7 +440,7 @@ $(obj)/%.o: $(src)/%.c -Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ - -fno-asynchronous-unwind-tables \ + -fno-asynchronous-unwind-tables -fcf-protection \ -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \ -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ From e7e6c774f5d40244444f23b8c49dac2ded158d8c Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 18 Aug 2023 18:01:12 +0900 Subject: [PATCH 2/9] samples/bpf: convert to vmlinux.h with tracing programs This commit replaces the separate headers in the tracing programs with a single vmlinux.h. Thanks to that, we no longer need to define the argument structure for tracing programs directly. 
For example, the argument structure for the sched_switch tracepoint (sched_switch_args) can be replaced with the vmlinux.h-provided trace_event_raw_sched_switch. Additional defines have been added to the BPF program either directly or through the inclusion of net_shared.h. The added definitions are the PERF_MAX_STACK_DEPTH and IFNAMSIZ constants and the __stringify() macro. This change enables the BPF program to access internal structures through the BTF-generated "vmlinux.h" header. Signed-off-by: Daniel T. Lee Link: https://lore.kernel.org/r/20230818090119.477441-3-danieltimlee@gmail.com Signed-off-by: Alexei Starovoitov --- samples/bpf/net_shared.h | 2 ++ samples/bpf/offwaketime_kern.c | 21 ++++++--------------- samples/bpf/spintest_kern.c | 10 ++++++---- samples/bpf/test_overhead_tp.bpf.c | 29 ++--------------------------- samples/bpf/tracex1_kern.c | 5 ++--- samples/bpf/tracex3_kern.c | 4 +--- samples/bpf/tracex4_kern.c | 3 +-- samples/bpf/tracex5_kern.c | 9 ++++----- samples/bpf/tracex6_kern.c | 3 +-- samples/bpf/tracex7_kern.c | 3 +-- 10 files changed, 26 insertions(+), 63 deletions(-) diff --git a/samples/bpf/net_shared.h b/samples/bpf/net_shared.h index e9429af9aa44..88cc52461c98 100644 --- a/samples/bpf/net_shared.h +++ b/samples/bpf/net_shared.h @@ -17,6 +17,8 @@ #define TC_ACT_OK 0 #define TC_ACT_SHOT 2 +#define IFNAMSIZ 16 + #if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \ __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define bpf_ntohs(x) __builtin_bswap16(x) diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c index 23f12b47e9e5..8e5105811178 100644 --- a/samples/bpf/offwaketime_kern.c +++ b/samples/bpf/offwaketime_kern.c @@ -4,14 +4,15 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. 
*/ -#include -#include -#include +#include "vmlinux.h" #include -#include #include #include +#ifndef PERF_MAX_STACK_DEPTH +#define PERF_MAX_STACK_DEPTH 127 +#endif + #define _(P) \ ({ \ typeof(P) val; \ @@ -111,18 +112,8 @@ static inline int update_counts(void *ctx, u32 pid, u64 delta) #if 1 /* taken from /sys/kernel/tracing/events/sched/sched_switch/format */ -struct sched_switch_args { - unsigned long long pad; - char prev_comm[TASK_COMM_LEN]; - int prev_pid; - int prev_prio; - long long prev_state; - char next_comm[TASK_COMM_LEN]; - int next_pid; - int next_prio; -}; SEC("tracepoint/sched/sched_switch") -int oncpu(struct sched_switch_args *ctx) +int oncpu(struct trace_event_raw_sched_switch *ctx) { /* record previous thread sleep time */ u32 pid = ctx->prev_pid; diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest_kern.c index 455da77319d9..15740b16a3f7 100644 --- a/samples/bpf/spintest_kern.c +++ b/samples/bpf/spintest_kern.c @@ -4,14 +4,15 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. 
*/ -#include -#include +#include "vmlinux.h" #include -#include -#include #include #include +#ifndef PERF_MAX_STACK_DEPTH +#define PERF_MAX_STACK_DEPTH 127 +#endif + struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, long); @@ -60,6 +61,7 @@ SEC("kprobe/_raw_spin_lock_irq")PROG(p11) SEC("kprobe/_raw_spin_trylock")PROG(p12) SEC("kprobe/_raw_spin_lock")PROG(p13) SEC("kprobe/_raw_spin_lock_bh")PROG(p14) + /* and to inner bpf helpers */ SEC("kprobe/htab_map_update_elem")PROG(p15) SEC("kprobe/__htab_percpu_map_update_elem")PROG(p16) diff --git a/samples/bpf/test_overhead_tp.bpf.c b/samples/bpf/test_overhead_tp.bpf.c index 8b498328e961..5dc08b587978 100644 --- a/samples/bpf/test_overhead_tp.bpf.c +++ b/samples/bpf/test_overhead_tp.bpf.c @@ -8,40 +8,15 @@ #include /* from /sys/kernel/tracing/events/task/task_rename/format */ -struct task_rename { - __u64 pad; - __u32 pid; - char oldcomm[TASK_COMM_LEN]; - char newcomm[TASK_COMM_LEN]; - __u16 oom_score_adj; -}; SEC("tracepoint/task/task_rename") -int prog(struct task_rename *ctx) +int prog(struct trace_event_raw_task_rename *ctx) { return 0; } /* from /sys/kernel/tracing/events/fib/fib_table_lookup/format */ -struct fib_table_lookup { - __u64 pad; - __u32 tb_id; - int err; - int oif; - int iif; - __u8 proto; - __u8 tos; - __u8 scope; - __u8 flags; - __u8 src[4]; - __u8 dst[4]; - __u8 gw4[4]; - __u8 gw6[16]; - __u16 sport; - __u16 dport; - char name[16]; -}; SEC("tracepoint/fib/fib_table_lookup") -int prog2(struct fib_table_lookup *ctx) +int prog2(struct trace_event_raw_fib_table_lookup *ctx) { return 0; } diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c index ef30d2b353b0..bb78bdbffa87 100644 --- a/samples/bpf/tracex1_kern.c +++ b/samples/bpf/tracex1_kern.c @@ -4,9 +4,8 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. 
*/ -#include -#include -#include +#include "vmlinux.h" +#include "net_shared.h" #include #include #include diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c index bde6591cb20c..7cc60f10d2e5 100644 --- a/samples/bpf/tracex3_kern.c +++ b/samples/bpf/tracex3_kern.c @@ -4,10 +4,8 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include -#include +#include "vmlinux.h" #include -#include #include #include diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c index eb0f8fdd14bf..ca826750901a 100644 --- a/samples/bpf/tracex4_kern.c +++ b/samples/bpf/tracex4_kern.c @@ -4,9 +4,8 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include +#include "vmlinux.h" #include -#include #include #include diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c index 64a1f7550d7e..8cd697ee7047 100644 --- a/samples/bpf/tracex5_kern.c +++ b/samples/bpf/tracex5_kern.c @@ -4,15 +4,14 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. 
*/ -#include -#include -#include -#include -#include +#include "vmlinux.h" #include "syscall_nrs.h" +#include +#include #include #include +#define __stringify(x) #x #define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F struct { diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c index acad5712d8b4..6ad82e68f998 100644 --- a/samples/bpf/tracex6_kern.c +++ b/samples/bpf/tracex6_kern.c @@ -1,6 +1,5 @@ -#include +#include "vmlinux.h" #include -#include #include struct { diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c index c5a92df8ac31..ab8d6704a5a4 100644 --- a/samples/bpf/tracex7_kern.c +++ b/samples/bpf/tracex7_kern.c @@ -1,5 +1,4 @@ -#include -#include +#include "vmlinux.h" #include #include From 4a0ee78890699706f59cc9bdf8283ecaa4e0a141 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 18 Aug 2023 18:01:13 +0900 Subject: [PATCH 3/9] samples/bpf: unify bpf program suffix to .bpf with tracing programs Currently, BPF programs typically have a suffix of .bpf.c. However, some programs still utilize a mixture of _kern.c suffix alongside the naming convention. In order to achieve consistency in the naming of these programs, this commit unifies the inconsistency in the naming convention of BPF kernel programs. Signed-off-by: Daniel T. 
Lee Link: https://lore.kernel.org/r/20230818090119.477441-4-danieltimlee@gmail.com Signed-off-by: Alexei Starovoitov --- samples/bpf/Makefile | 18 +++++++++--------- .../{offwaketime_kern.c => offwaketime.bpf.c} | 0 samples/bpf/offwaketime_user.c | 2 +- .../bpf/{spintest_kern.c => spintest.bpf.c} | 0 samples/bpf/spintest_user.c | 2 +- samples/bpf/{tracex1_kern.c => tracex1.bpf.c} | 0 samples/bpf/tracex1_user.c | 2 +- samples/bpf/{tracex3_kern.c => tracex3.bpf.c} | 0 samples/bpf/tracex3_user.c | 2 +- samples/bpf/{tracex4_kern.c => tracex4.bpf.c} | 0 samples/bpf/tracex4_user.c | 2 +- samples/bpf/{tracex5_kern.c => tracex5.bpf.c} | 0 samples/bpf/tracex5_user.c | 2 +- samples/bpf/{tracex6_kern.c => tracex6.bpf.c} | 0 samples/bpf/tracex6_user.c | 2 +- samples/bpf/{tracex7_kern.c => tracex7.bpf.c} | 0 samples/bpf/tracex7_user.c | 2 +- 17 files changed, 17 insertions(+), 17 deletions(-) rename samples/bpf/{offwaketime_kern.c => offwaketime.bpf.c} (100%) rename samples/bpf/{spintest_kern.c => spintest.bpf.c} (100%) rename samples/bpf/{tracex1_kern.c => tracex1.bpf.c} (100%) rename samples/bpf/{tracex3_kern.c => tracex3.bpf.c} (100%) rename samples/bpf/{tracex4_kern.c => tracex4.bpf.c} (100%) rename samples/bpf/{tracex5_kern.c => tracex5.bpf.c} (100%) rename samples/bpf/{tracex6_kern.c => tracex6.bpf.c} (100%) rename samples/bpf/{tracex7_kern.c => tracex7.bpf.c} (100%) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index b32cb8a62335..f90bcd3696bd 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -124,21 +124,21 @@ always-y := $(tprogs-y) always-y += sockex1_kern.o always-y += sockex2_kern.o always-y += sockex3_kern.o -always-y += tracex1_kern.o +always-y += tracex1.bpf.o always-y += tracex2.bpf.o -always-y += tracex3_kern.o -always-y += tracex4_kern.o -always-y += tracex5_kern.o -always-y += tracex6_kern.o -always-y += tracex7_kern.o +always-y += tracex3.bpf.o +always-y += tracex4.bpf.o +always-y += tracex5.bpf.o +always-y += tracex6.bpf.o +always-y 
+= tracex7.bpf.o always-y += sock_flags.bpf.o always-y += test_probe_write_user.bpf.o always-y += trace_output.bpf.o always-y += tcbpf1_kern.o always-y += tc_l2_redirect_kern.o always-y += lathist_kern.o -always-y += offwaketime_kern.o -always-y += spintest_kern.o +always-y += offwaketime.bpf.o +always-y += spintest.bpf.o always-y += map_perf_test.bpf.o always-y += test_overhead_tp.bpf.o always-y += test_overhead_raw_tp.bpf.o @@ -333,7 +333,7 @@ $(obj)/xdp_redirect_user.o: $(obj)/xdp_redirect.skel.h $(obj)/xdp_monitor_user.o: $(obj)/xdp_monitor.skel.h $(obj)/xdp_router_ipv4_user.o: $(obj)/xdp_router_ipv4.skel.h -$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h +$(obj)/tracex5.bpf.o: $(obj)/syscall_nrs.h $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h $(obj)/hbm.o: $(src)/hbm.h $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime.bpf.c similarity index 100% rename from samples/bpf/offwaketime_kern.c rename to samples/bpf/offwaketime.bpf.c diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c index b6eedcb98fb9..5557b5393642 100644 --- a/samples/bpf/offwaketime_user.c +++ b/samples/bpf/offwaketime_user.c @@ -105,7 +105,7 @@ int main(int argc, char **argv) return 2; } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object file failed\n"); diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest.bpf.c similarity index 100% rename from samples/bpf/spintest_kern.c rename to samples/bpf/spintest.bpf.c diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c index aadac14f748a..8c77600776fb 100644 --- a/samples/bpf/spintest_user.c +++ b/samples/bpf/spintest_user.c @@ -23,7 +23,7 @@ int main(int ac, char **argv) return 2; } - snprintf(filename, sizeof(filename), "%s_kern.o", 
argv[0]); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object file failed\n"); diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1.bpf.c similarity index 100% rename from samples/bpf/tracex1_kern.c rename to samples/bpf/tracex1.bpf.c diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c index 9d4adb7fd834..8c3d9043a2b6 100644 --- a/samples/bpf/tracex1_user.c +++ b/samples/bpf/tracex1_user.c @@ -12,7 +12,7 @@ int main(int ac, char **argv) char filename[256]; FILE *f; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object file failed\n"); diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3.bpf.c similarity index 100% rename from samples/bpf/tracex3_kern.c rename to samples/bpf/tracex3.bpf.c diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index d5eebace31e6..1002eb0323b4 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -125,7 +125,7 @@ int main(int ac, char **argv) } } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object file failed\n"); diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4.bpf.c similarity index 100% rename from samples/bpf/tracex4_kern.c rename to samples/bpf/tracex4.bpf.c diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c index dee8f0a091ba..a5145ad72cbf 100644 --- a/samples/bpf/tracex4_user.c +++ b/samples/bpf/tracex4_user.c @@ -53,7 +53,7 @@ int main(int ac, char **argv) char filename[256]; int map_fd, j = 0; - snprintf(filename, sizeof(filename), 
"%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object file failed\n"); diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5.bpf.c similarity index 100% rename from samples/bpf/tracex5_kern.c rename to samples/bpf/tracex5.bpf.c diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c index 9d7d79f0d47d..7e2d8397fb98 100644 --- a/samples/bpf/tracex5_user.c +++ b/samples/bpf/tracex5_user.c @@ -42,7 +42,7 @@ int main(int ac, char **argv) char filename[256]; FILE *f; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object file failed\n"); diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6.bpf.c similarity index 100% rename from samples/bpf/tracex6_kern.c rename to samples/bpf/tracex6.bpf.c diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c index 8e83bf2a84a4..ae811ac83bc2 100644 --- a/samples/bpf/tracex6_user.c +++ b/samples/bpf/tracex6_user.c @@ -180,7 +180,7 @@ int main(int argc, char **argv) char filename[256]; int i = 0; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object file failed\n"); diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7.bpf.c similarity index 100% rename from samples/bpf/tracex7_kern.c rename to samples/bpf/tracex7.bpf.c diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c index 8be7ce18d3ba..b10b5e03a226 100644 --- a/samples/bpf/tracex7_user.c +++ b/samples/bpf/tracex7_user.c @@ -19,7 +19,7 @@ int main(int argc, char **argv) return 0; } - snprintf(filename, 
sizeof(filename), "%s_kern.o", argv[0]); + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object file failed\n"); From 02dabc247ad68b41758bf39f11e2d682b8b32dd7 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 18 Aug 2023 18:01:14 +0900 Subject: [PATCH 4/9] samples/bpf: fix symbol mismatch by compiler optimization Currently, multiple kprobe programs are suffering from symbol mismatch due to compiler optimization. These optimizations might induce additional suffix to the symbol name such as '.isra' or '.constprop'. # egrep ' finish_task_switch| __netif_receive_skb_core' /proc/kallsyms ffffffff81135e50 t finish_task_switch.isra.0 ffffffff81dd36d0 t __netif_receive_skb_core.constprop.0 ffffffff8205cc0e t finish_task_switch.isra.0.cold ffffffff820b1aba t __netif_receive_skb_core.constprop.0.cold To avoid this, this commit replaces the original kprobe section to kprobe.multi in order to match symbol with wildcard characters. Here, asterisk is used for avoiding symbol mismatch. Signed-off-by: Daniel T. 
Lee Link: https://lore.kernel.org/r/20230818090119.477441-5-danieltimlee@gmail.com Signed-off-by: Alexei Starovoitov --- samples/bpf/offwaketime.bpf.c | 2 +- samples/bpf/tracex1.bpf.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/samples/bpf/offwaketime.bpf.c b/samples/bpf/offwaketime.bpf.c index 8e5105811178..3200a0f44969 100644 --- a/samples/bpf/offwaketime.bpf.c +++ b/samples/bpf/offwaketime.bpf.c @@ -118,7 +118,7 @@ int oncpu(struct trace_event_raw_sched_switch *ctx) /* record previous thread sleep time */ u32 pid = ctx->prev_pid; #else -SEC("kprobe/finish_task_switch") +SEC("kprobe.multi/finish_task_switch*") int oncpu(struct pt_regs *ctx) { struct task_struct *p = (void *) PT_REGS_PARM1(ctx); diff --git a/samples/bpf/tracex1.bpf.c b/samples/bpf/tracex1.bpf.c index bb78bdbffa87..f3be14a03964 100644 --- a/samples/bpf/tracex1.bpf.c +++ b/samples/bpf/tracex1.bpf.c @@ -22,11 +22,12 @@ * Number of arguments and their positions can change, etc. * In such case this bpf+kprobe example will no longer be meaningful */ -SEC("kprobe/__netif_receive_skb_core") +SEC("kprobe.multi/__netif_receive_skb_core*") int bpf_prog1(struct pt_regs *ctx) { /* attaches to kprobe __netif_receive_skb_core, * looks for packets on loobpack device and prints them + * (wildcard is used for avoiding symbol mismatch due to optimization) */ char devname[IFNAMSIZ]; struct net_device *dev; From 11430421b440427e10dfb4bd2bdc418fab0ff166 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 18 Aug 2023 18:01:15 +0900 Subject: [PATCH 5/9] samples/bpf: make tracing programs to be more CO-RE centric The existing tracing programs have been developed for a considerable period of time and, as a result, do not properly incorporate the features of the current libbpf, such as CO-RE. This is evident in frequent usage of functions like PT_REGS* and the persistence of "hack" methods using underscore-style bpf_probe_read_kernel from the past. 
These programs are far behind the current level of libbpf and can potentially confuse users. Therefore, this commit aims to convert the outdated BPF programs to be more CO-RE centric. Signed-off-by: Daniel T. Lee Link: https://lore.kernel.org/r/20230818090119.477441-6-danieltimlee@gmail.com Signed-off-by: Alexei Starovoitov --- samples/bpf/offwaketime.bpf.c | 18 +++++------------- samples/bpf/test_overhead_kprobe.bpf.c | 20 +++++++------------- samples/bpf/tracex1.bpf.c | 17 +++++------------ samples/bpf/tracex5.bpf.c | 5 +++-- 4 files changed, 20 insertions(+), 40 deletions(-) diff --git a/samples/bpf/offwaketime.bpf.c b/samples/bpf/offwaketime.bpf.c index 3200a0f44969..4a65ba76c1b1 100644 --- a/samples/bpf/offwaketime.bpf.c +++ b/samples/bpf/offwaketime.bpf.c @@ -8,18 +8,12 @@ #include #include #include +#include #ifndef PERF_MAX_STACK_DEPTH #define PERF_MAX_STACK_DEPTH 127 #endif -#define _(P) \ - ({ \ - typeof(P) val; \ - bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ - val; \ - }) - #define MINBLOCK_US 1 #define MAX_ENTRIES 10000 @@ -68,11 +62,9 @@ struct { SEC("kprobe/try_to_wake_up") int waker(struct pt_regs *ctx) { - struct task_struct *p = (void *) PT_REGS_PARM1(ctx); + struct task_struct *p = (void *)PT_REGS_PARM1_CORE(ctx); + u32 pid = BPF_CORE_READ(p, pid); struct wokeby_t woke; - u32 pid; - - pid = _(p->pid); bpf_get_current_comm(&woke.name, sizeof(woke.name)); woke.ret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS); @@ -121,9 +113,9 @@ int oncpu(struct trace_event_raw_sched_switch *ctx) SEC("kprobe.multi/finish_task_switch*") int oncpu(struct pt_regs *ctx) { - struct task_struct *p = (void *) PT_REGS_PARM1(ctx); + struct task_struct *p = (void *)PT_REGS_PARM1_CORE(ctx); /* record previous thread sleep time */ - u32 pid = _(p->pid); + u32 pid = BPF_CORE_READ(p, pid); #endif u64 delta, ts, *tsp; diff --git a/samples/bpf/test_overhead_kprobe.bpf.c b/samples/bpf/test_overhead_kprobe.bpf.c index c3528731e0e1..668cf5259c60 100644 --- 
a/samples/bpf/test_overhead_kprobe.bpf.c +++ b/samples/bpf/test_overhead_kprobe.bpf.c @@ -8,13 +8,7 @@ #include #include #include - -#define _(P) \ - ({ \ - typeof(P) val = 0; \ - bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ - val; \ - }) +#include SEC("kprobe/__set_task_comm") int prog(struct pt_regs *ctx) @@ -26,14 +20,14 @@ int prog(struct pt_regs *ctx) u16 oom_score_adj; u32 pid; - tsk = (void *)PT_REGS_PARM1(ctx); + tsk = (void *)PT_REGS_PARM1_CORE(ctx); - pid = _(tsk->pid); - bpf_probe_read_kernel_str(oldcomm, sizeof(oldcomm), &tsk->comm); - bpf_probe_read_kernel_str(newcomm, sizeof(newcomm), + pid = BPF_CORE_READ(tsk, pid); + bpf_core_read_str(oldcomm, sizeof(oldcomm), &tsk->comm); + bpf_core_read_str(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx)); - signal = _(tsk->signal); - oom_score_adj = _(signal->oom_score_adj); + signal = BPF_CORE_READ(tsk, signal); + oom_score_adj = BPF_CORE_READ(signal, oom_score_adj); return 0; } diff --git a/samples/bpf/tracex1.bpf.c b/samples/bpf/tracex1.bpf.c index f3be14a03964..0ab39d76ff8f 100644 --- a/samples/bpf/tracex1.bpf.c +++ b/samples/bpf/tracex1.bpf.c @@ -8,15 +8,9 @@ #include "net_shared.h" #include #include +#include #include -#define _(P) \ - ({ \ - typeof(P) val = 0; \ - bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ - val; \ - }) - /* kprobe is NOT a stable ABI * kernel functions can be removed, renamed or completely change semantics. * Number of arguments and their positions can change, etc. @@ -34,12 +28,11 @@ int bpf_prog1(struct pt_regs *ctx) struct sk_buff *skb; int len; - /* non-portable! 
works for the given kernel only */ - bpf_probe_read_kernel(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx)); - dev = _(skb->dev); - len = _(skb->len); + bpf_core_read(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx)); + dev = BPF_CORE_READ(skb, dev); + len = BPF_CORE_READ(skb, len); - bpf_probe_read_kernel(devname, sizeof(devname), dev->name); + BPF_CORE_READ_STR_INTO(&devname, dev, name); if (devname[0] == 'l' && devname[1] == 'o') { char fmt[] = "skb %p len %d\n"; diff --git a/samples/bpf/tracex5.bpf.c b/samples/bpf/tracex5.bpf.c index 8cd697ee7047..4d3d6c9b25fa 100644 --- a/samples/bpf/tracex5.bpf.c +++ b/samples/bpf/tracex5.bpf.c @@ -10,6 +10,7 @@ #include #include #include +#include #define __stringify(x) #x #define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F @@ -46,7 +47,7 @@ PROG(SYS__NR_write)(struct pt_regs *ctx) { struct seccomp_data sd; - bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); + bpf_core_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); if (sd.args[2] == 512) { char fmt[] = "write(fd=%d, buf=%p, size=%d)\n"; bpf_trace_printk(fmt, sizeof(fmt), @@ -59,7 +60,7 @@ PROG(SYS__NR_read)(struct pt_regs *ctx) { struct seccomp_data sd; - bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); + bpf_core_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); if (sd.args[2] > 128 && sd.args[2] <= 1024) { char fmt[] = "read(fd=%d, buf=%p, size=%d)\n"; bpf_trace_printk(fmt, sizeof(fmt), From 92632115fb57ff9e368f256913e96d6fd5abf5ab Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 18 Aug 2023 18:01:16 +0900 Subject: [PATCH 6/9] samples/bpf: fix bio latency check with tracepoint Recently, a new tracepoint for the block layer, specifically the block_io_start/done tracepoints, was introduced in commit 5a80bd075f3b ("block: introduce block_io_start/block_io_done tracepoints"). Previously, the kprobe entry used for this purpose was quite unstable and inherently broke relevant probes [1]. 
Now that a stable tracepoint is available, this commit replaces the bio latency check with it. One of the changes made during this replacement is the key used for the hash table. Since 'struct request' cannot be used as a hash key, the approach taken follows that which was implemented in bcc/biolatency [2]. (uses dev:sector for the key) [1]: https://github.com/iovisor/bcc/issues/4261 [2]: https://github.com/iovisor/bcc/pull/4691 Fixes: 450b7879e345 ("block: move blk_account_io_{start,done} to blk-mq.c") Signed-off-by: Daniel T. Lee Link: https://lore.kernel.org/r/20230818090119.477441-7-danieltimlee@gmail.com Signed-off-by: Alexei Starovoitov --- samples/bpf/tracex3.bpf.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/samples/bpf/tracex3.bpf.c b/samples/bpf/tracex3.bpf.c index 7cc60f10d2e5..41f37966f5f5 100644 --- a/samples/bpf/tracex3.bpf.c +++ b/samples/bpf/tracex3.bpf.c @@ -9,6 +9,12 @@ #include #include +struct start_key { + dev_t dev; + u32 _pad; + sector_t sector; +}; + struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, long); @@ -16,16 +22,17 @@ struct { __uint(max_entries, 4096); } my_map SEC(".maps"); -/* kprobe is NOT a stable ABI. 
If kernel internals change this bpf+kprobe - * example will no longer be meaningful - */ -SEC("kprobe/blk_mq_start_request") -int bpf_prog1(struct pt_regs *ctx) +/* from /sys/kernel/tracing/events/block/block_io_start/format */ +SEC("tracepoint/block/block_io_start") +int bpf_prog1(struct trace_event_raw_block_rq *ctx) { - long rq = PT_REGS_PARM1(ctx); u64 val = bpf_ktime_get_ns(); + struct start_key key = { + .dev = ctx->dev, + .sector = ctx->sector + }; - bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY); + bpf_map_update_elem(&my_map, &key, &val, BPF_ANY); return 0; } @@ -47,21 +54,26 @@ struct { __uint(max_entries, SLOTS); } lat_map SEC(".maps"); -SEC("kprobe/__blk_account_io_done") -int bpf_prog2(struct pt_regs *ctx) +/* from /sys/kernel/tracing/events/block/block_io_done/format */ +SEC("tracepoint/block/block_io_done") +int bpf_prog2(struct trace_event_raw_block_rq *ctx) { - long rq = PT_REGS_PARM1(ctx); + struct start_key key = { + .dev = ctx->dev, + .sector = ctx->sector + }; + u64 *value, l, base; u32 index; - value = bpf_map_lookup_elem(&my_map, &rq); + value = bpf_map_lookup_elem(&my_map, &key); if (!value) return 0; u64 cur_time = bpf_ktime_get_ns(); u64 delta = cur_time - *value; - bpf_map_delete_elem(&my_map, &rq); + bpf_map_delete_elem(&my_map, &key); /* the lines below are computing index = log10(delta)*10 * using integer arithmetic From d93a7cf6ca2cfcd7de5d06f753ce8d5e863316ac Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 18 Aug 2023 18:01:17 +0900 Subject: [PATCH 7/9] samples/bpf: fix broken map lookup probe In the commit 7c4cd051add3 ("bpf: Fix syscall's stackmap lookup potential deadlock"), a potential deadlock issue was addressed, which resulted in *_map_lookup_elem not triggering BPF programs. (prior to lookup, bpf_disable_instrumentation() is used) To resolve the broken map lookup probe using "htab_map_lookup_elem", this commit introduces an alternative approach. 
Instead, it utilizes "bpf_map_copy_value" and applies a filter specifically for the hash table with map_type. Signed-off-by: Daniel T. Lee Fixes: 7c4cd051add3 ("bpf: Fix syscall's stackmap lookup potential deadlock") Link: https://lore.kernel.org/r/20230818090119.477441-8-danieltimlee@gmail.com Signed-off-by: Alexei Starovoitov --- samples/bpf/tracex6.bpf.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/samples/bpf/tracex6.bpf.c b/samples/bpf/tracex6.bpf.c index 6ad82e68f998..9b23b4737cfb 100644 --- a/samples/bpf/tracex6.bpf.c +++ b/samples/bpf/tracex6.bpf.c @@ -1,6 +1,8 @@ #include "vmlinux.h" #include #include +#include +#include struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); @@ -44,13 +46,24 @@ int bpf_prog1(struct pt_regs *ctx) return 0; } -SEC("kprobe/htab_map_lookup_elem") -int bpf_prog2(struct pt_regs *ctx) +/* + * Since *_map_lookup_elem can't be expected to trigger bpf programs + * due to potential deadlocks (bpf_disable_instrumentation), this bpf + * program will be attached to bpf_map_copy_value (which is called + * from map_lookup_elem) and will only filter the hashtable type. + */ +SEC("kprobe/bpf_map_copy_value") +int BPF_KPROBE(bpf_prog2, struct bpf_map *map) { u32 key = bpf_get_smp_processor_id(); struct bpf_perf_event_value *val, buf; + enum bpf_map_type type; int error; + type = BPF_CORE_READ(map, map_type); + if (type != BPF_MAP_TYPE_HASH) + return 0; + error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf)); if (error) return 0; From 8dc80551463197ec79ce0966ec2b5bd700042614 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 18 Aug 2023 18:01:18 +0900 Subject: [PATCH 8/9] samples/bpf: refactor syscall tracing programs using BPF_KSYSCALL macro This commit refactors the syscall tracing programs by adopting the BPF_KSYSCALL macro. This change aims to enhance the clarity and simplicity of the BPF programs by reducing the complexity of argument parsing from pt_regs. Signed-off-by: Daniel T. 
Lee Link: https://lore.kernel.org/r/20230818090119.477441-9-danieltimlee@gmail.com Signed-off-by: Alexei Starovoitov --- samples/bpf/test_map_in_map.bpf.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/samples/bpf/test_map_in_map.bpf.c b/samples/bpf/test_map_in_map.bpf.c index 1883559e5977..9f030f9c4e1b 100644 --- a/samples/bpf/test_map_in_map.bpf.c +++ b/samples/bpf/test_map_in_map.bpf.c @@ -103,19 +103,15 @@ static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port) return result ? *result : -ENOENT; } -SEC("kprobe/__sys_connect") -int trace_sys_connect(struct pt_regs *ctx) +SEC("ksyscall/connect") +int BPF_KSYSCALL(trace_sys_connect, unsigned int fd, struct sockaddr_in6 *in6, int addrlen) { - struct sockaddr_in6 *in6; u16 test_case, port, dst6[8]; - int addrlen, ret, inline_ret, ret_key = 0; + int ret, inline_ret, ret_key = 0; u32 port_key; void *outer_map, *inner_map; bool inline_hash = false; - in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(ctx); - addrlen = (int)PT_REGS_PARM3_CORE(ctx); - if (addrlen != sizeof(*in6)) return 0; From 456d53554ca7e93d7e5e8eb7fe8c906d5ec6e7d0 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 18 Aug 2023 18:01:19 +0900 Subject: [PATCH 9/9] samples/bpf: simplify spintest with kprobe.multi With the introduction of kprobe.multi, it is now possible to attach multiple kprobes to a single BPF program without the need for multiple definitions. Additionally, this method supports wildcard-based matching, allowing for further simplification of BPF programs. In here, an asterisk (*) wildcard is used to map to all symbols relevant to spin_{lock|unlock}. Furthermore, since kprobe.multi handles symbol matching, this commit eliminates the need for the previous logic of reading the ksym table to verify the existence of symbols. Signed-off-by: Daniel T. 
Lee Link: https://lore.kernel.org/r/20230818090119.477441-10-danieltimlee@gmail.com Signed-off-by: Alexei Starovoitov --- samples/bpf/spintest.bpf.c | 17 +++-------------- samples/bpf/spintest_user.c | 22 +++++++--------------- 2 files changed, 10 insertions(+), 29 deletions(-) diff --git a/samples/bpf/spintest.bpf.c b/samples/bpf/spintest.bpf.c index 15740b16a3f7..cba5a9d50783 100644 --- a/samples/bpf/spintest.bpf.c +++ b/samples/bpf/spintest.bpf.c @@ -47,20 +47,9 @@ int foo(struct pt_regs *ctx) \ } /* add kprobes to all possible *spin* functions */ -SEC("kprobe/spin_unlock")PROG(p1) -SEC("kprobe/spin_lock")PROG(p2) -SEC("kprobe/mutex_spin_on_owner")PROG(p3) -SEC("kprobe/rwsem_spin_on_owner")PROG(p4) -SEC("kprobe/spin_unlock_irqrestore")PROG(p5) -SEC("kprobe/_raw_spin_unlock_irqrestore")PROG(p6) -SEC("kprobe/_raw_spin_unlock_bh")PROG(p7) -SEC("kprobe/_raw_spin_unlock")PROG(p8) -SEC("kprobe/_raw_spin_lock_irqsave")PROG(p9) -SEC("kprobe/_raw_spin_trylock_bh")PROG(p10) -SEC("kprobe/_raw_spin_lock_irq")PROG(p11) -SEC("kprobe/_raw_spin_trylock")PROG(p12) -SEC("kprobe/_raw_spin_lock")PROG(p13) -SEC("kprobe/_raw_spin_lock_bh")PROG(p14) +SEC("kprobe.multi/spin_*lock*")PROG(spin_lock) +SEC("kprobe.multi/*_spin_on_owner")PROG(spin_on_owner) +SEC("kprobe.multi/_raw_spin_*lock*")PROG(raw_spin_lock) /* and to inner bpf helpers */ SEC("kprobe/htab_map_update_elem")PROG(p15) diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c index 8c77600776fb..55971edb1088 100644 --- a/samples/bpf/spintest_user.c +++ b/samples/bpf/spintest_user.c @@ -9,13 +9,12 @@ int main(int ac, char **argv) { - char filename[256], symbol[256]; struct bpf_object *obj = NULL; struct bpf_link *links[20]; long key, next_key, value; struct bpf_program *prog; int map_fd, i, j = 0; - const char *section; + char filename[256]; struct ksym *sym; if (load_kallsyms()) { @@ -44,20 +43,13 @@ int main(int ac, char **argv) } bpf_object__for_each_program(prog, obj) { - section = 
bpf_program__section_name(prog); - if (sscanf(section, "kprobe/%s", symbol) != 1) - continue; - - /* Attach prog only when symbol exists */ - if (ksym_get_addr(symbol)) { - links[j] = bpf_program__attach(prog); - if (libbpf_get_error(links[j])) { - fprintf(stderr, "bpf_program__attach failed\n"); - links[j] = NULL; - goto cleanup; - } - j++; + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; } + j++; } for (i = 0; i < 5; i++) {