From 1f043a687e47b9b3c0469ad8d2021708981536af Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Feb 2021 09:16:45 -0300 Subject: tools headers UAPI: Update tools's copy of drm.h headers Picking the changes from: 0e0dc448005583a6 ("drm/doc: demote old doc-comments in drm.h") Silencing these perf build warnings: Warning: Kernel ABI header at 'tools/include/uapi/drm/drm.h' differs from latest version at 'include/uapi/drm/drm.h' diff -u tools/include/uapi/drm/drm.h include/uapi/drm/drm.h No changes in tooling as these are just C comment documentation changes. Cc: Simon Ser Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/drm.h | 97 ++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h index 808b48a93330..0827037c5484 100644 --- a/tools/include/uapi/drm/drm.h +++ b/tools/include/uapi/drm/drm.h @@ -1,11 +1,10 @@ -/** - * \file drm.h +/* * Header for the Direct Rendering Manager * - * \author Rickard E. (Rik) Faith + * Author: Rickard E. (Rik) Faith * - * \par Acknowledgments: - * Dec 1999, Richard Henderson , move to generic \c cmpxchg. + * Acknowledgments: + * Dec 1999, Richard Henderson , move to generic cmpxchg. */ /* @@ -85,7 +84,7 @@ typedef unsigned int drm_context_t; typedef unsigned int drm_drawable_t; typedef unsigned int drm_magic_t; -/** +/* * Cliprect. * * \warning: If you change this structure, make sure you change @@ -101,7 +100,7 @@ struct drm_clip_rect { unsigned short y2; }; -/** +/* * Drawable information. */ struct drm_drawable_info { @@ -109,7 +108,7 @@ struct drm_drawable_info { struct drm_clip_rect *rects; }; -/** +/* * Texture region, */ struct drm_tex_region { @@ -120,7 +119,7 @@ struct drm_tex_region { unsigned int age; }; -/** +/* * Hardware lock. * * The lock structure is a simple cache-line aligned integer. To avoid @@ -132,7 +131,7 @@ struct drm_hw_lock { char padding[60]; /**< Pad to cache line */ }; -/** +/* * DRM_IOCTL_VERSION ioctl argument type. * * \sa drmGetVersion(). @@ -149,7 +148,7 @@ struct drm_version { char __user *desc; /**< User-space buffer to hold desc */ }; -/** +/* * DRM_IOCTL_GET_UNIQUE ioctl argument type. * * \sa drmGetBusid() and drmSetBusId(). @@ -168,7 +167,7 @@ struct drm_block { int unused; }; -/** +/* * DRM_IOCTL_CONTROL ioctl argument type. * * \sa drmCtlInstHandler() and drmCtlUninstHandler(). @@ -183,7 +182,7 @@ struct drm_control { int irq; }; -/** +/* * Type of memory to map. */ enum drm_map_type { @@ -195,7 +194,7 @@ enum drm_map_type { _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ }; -/** +/* * Memory mapping flags. */ enum drm_map_flags { @@ -214,7 +213,7 @@ struct drm_ctx_priv_map { void *handle; /**< Handle of map */ }; -/** +/* * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls * argument type. * @@ -231,7 +230,7 @@ struct drm_map { /* Private data */ }; -/** +/* * DRM_IOCTL_GET_CLIENT ioctl argument type. */ struct drm_client { @@ -263,7 +262,7 @@ enum drm_stat_type { /* Add to the *END* of the list */ }; -/** +/* * DRM_IOCTL_GET_STATS ioctl argument type. */ struct drm_stats { @@ -274,7 +273,7 @@ struct drm_stats { } data[15]; }; -/** +/* * Hardware locking flags. */ enum drm_lock_flags { @@ -289,7 +288,7 @@ enum drm_lock_flags { _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ }; -/** +/* * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. * * \sa drmGetLock() and drmUnlock(). @@ -299,7 +298,7 @@ struct drm_lock { enum drm_lock_flags flags; }; -/** +/* * DMA flags * * \warning @@ -328,7 +327,7 @@ enum drm_dma_flags { _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ }; -/** +/* * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. * * \sa drmAddBufs(). @@ -351,7 +350,7 @@ struct drm_buf_desc { */ }; -/** +/* * DRM_IOCTL_INFO_BUFS ioctl argument type. */ struct drm_buf_info { @@ -359,7 +358,7 @@ struct drm_buf_info { struct drm_buf_desc __user *list; }; -/** +/* * DRM_IOCTL_FREE_BUFS ioctl argument type. */ struct drm_buf_free { @@ -367,7 +366,7 @@ struct drm_buf_free { int __user *list; }; -/** +/* * Buffer information * * \sa drm_buf_map. @@ -379,7 +378,7 @@ struct drm_buf_pub { void __user *address; /**< Address of buffer */ }; -/** +/* * DRM_IOCTL_MAP_BUFS ioctl argument type. */ struct drm_buf_map { @@ -392,7 +391,7 @@ struct drm_buf_map { struct drm_buf_pub __user *list; /**< Buffer information */ }; -/** +/* * DRM_IOCTL_DMA ioctl argument type. * * Indices here refer to the offset into the buffer list in drm_buf_get. @@ -417,7 +416,7 @@ enum drm_ctx_flags { _DRM_CONTEXT_2DONLY = 0x02 }; -/** +/* * DRM_IOCTL_ADD_CTX ioctl argument type. * * \sa drmCreateContext() and drmDestroyContext(). @@ -427,7 +426,7 @@ struct drm_ctx { enum drm_ctx_flags flags; }; -/** +/* * DRM_IOCTL_RES_CTX ioctl argument type. */ struct drm_ctx_res { @@ -435,14 +434,14 @@ struct drm_ctx_res { struct drm_ctx __user *contexts; }; -/** +/* * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. */ struct drm_draw { drm_drawable_t handle; }; -/** +/* * DRM_IOCTL_UPDATE_DRAW ioctl argument type. */ typedef enum { @@ -456,14 +455,14 @@ struct drm_update_draw { unsigned long long data; }; -/** +/* * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. */ struct drm_auth { drm_magic_t magic; }; -/** +/* * DRM_IOCTL_IRQ_BUSID ioctl argument type. * * \sa drmGetInterruptFromBusID(). @@ -505,7 +504,7 @@ struct drm_wait_vblank_reply { long tval_usec; }; -/** +/* * DRM_IOCTL_WAIT_VBLANK ioctl argument type. * * \sa drmWaitVBlank(). @@ -518,7 +517,7 @@ union drm_wait_vblank { #define _DRM_PRE_MODESET 1 #define _DRM_POST_MODESET 2 -/** +/* * DRM_IOCTL_MODESET_CTL ioctl argument type * * \sa drmModesetCtl(). @@ -528,7 +527,7 @@ struct drm_modeset_ctl { __u32 cmd; }; -/** +/* * DRM_IOCTL_AGP_ENABLE ioctl argument type. * * \sa drmAgpEnable(). @@ -537,7 +536,7 @@ struct drm_agp_mode { unsigned long mode; /**< AGP mode */ }; -/** +/* * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. * * \sa drmAgpAlloc() and drmAgpFree(). @@ -549,7 +548,7 @@ struct drm_agp_buffer { unsigned long physical; /**< Physical used by i810 */ }; -/** +/* * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. * * \sa drmAgpBind() and drmAgpUnbind(). @@ -559,7 +558,7 @@ struct drm_agp_binding { unsigned long offset; /**< In bytes -- will round to page boundary */ }; -/** +/* * DRM_IOCTL_AGP_INFO ioctl argument type. * * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), @@ -580,7 +579,7 @@ struct drm_agp_info { unsigned short id_device; }; -/** +/* * DRM_IOCTL_SG_ALLOC ioctl argument type. */ struct drm_scatter_gather { @@ -588,7 +587,7 @@ struct drm_scatter_gather { unsigned long handle; /**< Used for mapping / unmapping */ }; -/** +/* * DRM_IOCTL_SET_VERSION ioctl argument type. */ struct drm_set_version { @@ -598,14 +597,14 @@ struct drm_set_version { int drm_dd_minor; }; -/** DRM_IOCTL_GEM_CLOSE ioctl argument type */ +/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ struct drm_gem_close { /** Handle of the object to be closed. */ __u32 handle; __u32 pad; }; -/** DRM_IOCTL_GEM_FLINK ioctl argument type */ +/* DRM_IOCTL_GEM_FLINK ioctl argument type */ struct drm_gem_flink { /** Handle for the object being named */ __u32 handle; @@ -614,7 +613,7 @@ struct drm_gem_flink { __u32 name; }; -/** DRM_IOCTL_GEM_OPEN ioctl argument type */ +/* DRM_IOCTL_GEM_OPEN ioctl argument type */ struct drm_gem_open { /** Name of object being opened */ __u32 name; @@ -652,7 +651,7 @@ struct drm_gem_open { #define DRM_CAP_SYNCOBJ 0x13 #define DRM_CAP_SYNCOBJ_TIMELINE 0x14 -/** DRM_IOCTL_GET_CAP ioctl argument type */ +/* DRM_IOCTL_GET_CAP ioctl argument type */ struct drm_get_cap { __u64 capability; __u64 value; @@ -678,7 +677,9 @@ struct drm_get_cap { /** * DRM_CLIENT_CAP_ATOMIC * - * If set to 1, the DRM core will expose atomic properties to userspace + * If set to 1, the DRM core will expose atomic properties to userspace. This + * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and + * &DRM_CLIENT_CAP_ASPECT_RATIO. */ #define DRM_CLIENT_CAP_ATOMIC 3 @@ -698,7 +699,7 @@ struct drm_get_cap { */ #define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 -/** DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ +/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ struct drm_set_client_cap { __u64 capability; __u64 value; @@ -950,7 +951,7 @@ extern "C" { #define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) -/** +/* * Device specific ioctls should only be in their respective headers * The device specific ioctl range is from 0x40 to 0x9f. * Generic IOCTLS restart at 0xA0. @@ -961,7 +962,7 @@ extern "C" { #define DRM_COMMAND_BASE 0x40 #define DRM_COMMAND_END 0xA0 -/** +/* * Header for events written back to userspace on the drm fd. The * type defines the type of event, the length specifies the total * length of the event (including the header), and user_data is -- cgit v1.2.3 From 4a8176fd62aa7fa86599efcbd3631af272b109d8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Feb 2021 09:21:00 -0300 Subject: tools headers UAPI: Sync drm/i915_drm.h with the kernel sources To pick the changes in: 8c3b1ba0e7ea9a80 ("drm/i915/gt: Track the overall awake/busy time") 348fb0cb0a79bce0 ("drm/i915/pmu: Deprecate I915_PMU_LAST and optimize state tracking") That don't result in any change in tooling: $ tools/perf/trace/beauty/drm_ioctl.sh > before $ cp include/uapi/drm/i915_drm.h tools/include/uapi/drm/i915_drm.h $ tools/perf/trace/beauty/drm_ioctl.sh > after $ diff -u before after $ Only silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/drm/i915_drm.h' differs from latest version at 'include/uapi/drm/i915_drm.h' diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Cc: Chris Wilson Cc: Tvrtko Ursulin Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/i915_drm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index fa1f3d62f9a6..1987e2ea79a3 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -177,8 +177,9 @@ enum drm_i915_pmu_engine_sample { #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) #define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) +#define I915_PMU_SOFTWARE_GT_AWAKE_TIME __I915_PMU_OTHER(4) -#define I915_PMU_LAST I915_PMU_RC6_RESIDENCY +#define I915_PMU_LAST /* Deprecated - do not use */ I915_PMU_RC6_RESIDENCY /* Each region is a minimum of 16k, and there are at most 255 of them. */ -- cgit v1.2.3 From ed72adf64979ee2b5aa9f1c74fb45b2bf592ad2a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Feb 2021 09:44:37 -0300 Subject: tools headers UAPI: Sync openat2.h with the kernel sources To pick the changes in: 99668f618062816c ("fs: expose LOOKUP_CACHED through openat2() RESOLVE_CACHED") That don't result in any change in tooling, only silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/openat2.h' differs from latest version at 'include/uapi/linux/openat2.h' diff -u tools/include/uapi/linux/openat2.h include/uapi/linux/openat2.h Cc: Al Viro Cc: Jens Axboe Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/openat2.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/openat2.h b/tools/include/uapi/linux/openat2.h index 58b1eb711360..a5feb7604948 100644 --- a/tools/include/uapi/linux/openat2.h +++ b/tools/include/uapi/linux/openat2.h @@ -35,5 +35,9 @@ struct open_how { #define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." be scoped inside the dirfd (similar to chroot(2)). */ +#define RESOLVE_CACHED 0x20 /* Only complete if resolution can be + completed through cached lookup. May + return -EAGAIN if that's not + possible. */ #endif /* _UAPI_LINUX_OPENAT2_H */ -- cgit v1.2.3 From 867a9148298b42dee341b780ddfd706415a1253e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Feb 2021 09:48:05 -0300 Subject: perf arch powerpc: Sync powerpc syscall.tbl with the kernel sources To get the changes in: fbcee2ebe8edbb6a ("powerpc/32: Always save non volatile GPRs at syscall entry") That shouldn't cause any change in tooling, just silences the following tools/perf/ build warning: Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' Cc: Christophe Leroy Cc: Michael Ellerman Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index f744eb5cba88..96b2157f0371 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -9,9 +9,7 @@ # 0 nospu restart_syscall sys_restart_syscall 1 nospu exit sys_exit -2 32 fork ppc_fork sys_fork -2 64 fork sys_fork -2 spu fork sys_ni_syscall +2 nospu fork sys_fork 3 common read sys_read 4 common write sys_write 5 common open sys_open compat_sys_open @@ -160,9 +158,7 @@ 119 32 sigreturn sys_sigreturn compat_sys_sigreturn 119 64 sigreturn sys_ni_syscall 119 spu sigreturn sys_ni_syscall -120 32 clone ppc_clone sys_clone -120 64 clone sys_clone -120 spu clone sys_ni_syscall +120 nospu clone sys_clone 121 common setdomainname sys_setdomainname 122 common uname sys_newuname 123 common modify_ldt sys_ni_syscall @@ -244,9 +240,7 @@ 186 spu sendfile sys_sendfile64 187 common getpmsg sys_ni_syscall 188 common putpmsg sys_ni_syscall -189 32 vfork ppc_vfork sys_vfork -189 64 vfork sys_vfork -189 spu vfork sys_ni_syscall +189 nospu vfork sys_vfork 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit 191 common readahead sys_readahead compat_sys_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 @@ -322,9 +316,7 @@ 248 32 clock_nanosleep sys_clock_nanosleep_time32 248 64 clock_nanosleep sys_clock_nanosleep 248 spu clock_nanosleep sys_clock_nanosleep -249 32 swapcontext ppc_swapcontext compat_sys_swapcontext -249 64 swapcontext sys_swapcontext -249 spu swapcontext sys_ni_syscall +249 nospu swapcontext sys_swapcontext compat_sys_swapcontext 250 common tgkill sys_tgkill 251 32 utimes sys_utimes_time32 251 64 utimes sys_utimes @@ -522,9 +514,7 @@ 432 common fsmount sys_fsmount 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open -435 32 clone3 ppc_clone3 sys_clone3 -435 64 clone3 sys_clone3 -435 spu clone3 sys_ni_syscall +435 nospu clone3 sys_clone3 436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd -- cgit v1.2.3 From 20e32b9cb0c61c9264efcb16d8e3a80d3738ff2b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Feb 2021 09:51:17 -0300 Subject: tools headers UAPI s390: Sync ptrace.h kernel headers To pick up the changes from: 56e62a7370283601 ("s390: convert to generic entry") That only adds two new defines, so shouldn't cause problems when building the BPF selftests. Silencing this perf build warning: Warning: Kernel ABI header at 'tools/arch/s390/include/uapi/asm/ptrace.h' differs from latest version at 'arch/s390/include/uapi/asm/ptrace.h' diff -u tools/arch/s390/include/uapi/asm/ptrace.h arch/s390/include/uapi/asm/ptrace.h Cc: Hendrik Brueckner Cc: Sven Schnelle Cc: Vasily Gorbik Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/s390/include/uapi/asm/ptrace.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/arch/s390/include/uapi/asm/ptrace.h b/tools/arch/s390/include/uapi/asm/ptrace.h index 543dd70e12c8..ad64d673b5e6 100644 --- a/tools/arch/s390/include/uapi/asm/ptrace.h +++ b/tools/arch/s390/include/uapi/asm/ptrace.h @@ -179,8 +179,9 @@ #define ACR_SIZE 4 -#define PTRACE_OLDSETOPTIONS 21 - +#define PTRACE_OLDSETOPTIONS 21 +#define PTRACE_SYSEMU 31 +#define PTRACE_SYSEMU_SINGLESTEP 32 #ifndef __ASSEMBLY__ #include #include -- cgit v1.2.3 From 84b7725536d82e99b7564a079b8627f3be692a13 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Feb 2021 09:56:50 -0300 Subject: tools headers UAPI: Sync kvm.h headers with the kernel sources To pick the changes in: d9a47edabc4f9481 ("KVM: PPC: Book3S HV: Introduce new capability for 2nd DAWR") 8d4e7e80838f45d3 ("KVM: x86: declare Xen HVM shared info capability and add test case") 40da8ccd724f7ca2 ("KVM: x86/xen: Add event channel interrupt vector upcall") These new IOCTLs are now supported on 'perf trace': $ tools/perf/trace/beauty/kvm_ioctl.sh > before $ cp include/uapi/linux/kvm.h tools/include/uapi/linux/kvm.h $ tools/perf/trace/beauty/kvm_ioctl.sh > after $ diff -u before after --- before 2021-02-23 09:55:46.229058308 -0300 +++ after 2021-02-23 09:55:57.509308058 -0300 @@ -91,6 +91,10 @@ [0xc1] = "GET_SUPPORTED_HV_CPUID", [0xc6] = "X86_SET_MSR_FILTER", [0xc7] = "RESET_DIRTY_RINGS", + [0xc8] = "XEN_HVM_GET_ATTR", + [0xc9] = "XEN_HVM_SET_ATTR", + [0xca] = "XEN_VCPU_GET_ATTR", + [0xcb] = "XEN_VCPU_SET_ATTR", [0xe0] = "CREATE_DEVICE", [0xe1] = "SET_DEVICE_ATTR", [0xe2] = "GET_DEVICE_ATTR", $ Addressing this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: David Woodhouse Cc: Paul Mackerras Cc: Ravi Bangoria Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 73 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index abb89bbe5635..8b281f722e5b 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -216,6 +216,20 @@ struct kvm_hyperv_exit { } u; }; +struct kvm_xen_exit { +#define KVM_EXIT_XEN_HCALL 1 + __u32 type; + union { + struct { + __u32 longmode; + __u32 cpl; + __u64 input; + __u64 result; + __u64 params[6]; + } hcall; + } u; +}; + #define KVM_S390_GET_SKEYS_NONE 1 #define KVM_S390_SKEYS_MAX 1048576 @@ -252,6 +266,8 @@ struct kvm_hyperv_exit { #define KVM_EXIT_X86_WRMSR 30 #define KVM_EXIT_DIRTY_RING_FULL 31 #define KVM_EXIT_AP_RESET_HOLD 32 +#define KVM_EXIT_X86_BUS_LOCK 33 +#define KVM_EXIT_XEN 34 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -428,6 +444,8 @@ struct kvm_run { __u32 index; /* kernel -> user */ __u64 data; /* kernel <-> user */ } msr; + /* KVM_EXIT_XEN */ + struct kvm_xen_exit xen; /* Fix the size of the union. */ char padding[256]; }; @@ -1058,6 +1076,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #define KVM_CAP_SYS_HYPERV_CPUID 191 #define KVM_CAP_DIRTY_LOG_RING 192 +#define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_PPC_DAWR1 194 #ifdef KVM_CAP_IRQ_ROUTING @@ -1132,6 +1151,10 @@ struct kvm_x86_mce { #endif #ifdef KVM_CAP_XEN_HVM +#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) +#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) + struct kvm_xen_hvm_config { __u32 flags; __u32 msr; @@ -1566,6 +1589,45 @@ struct kvm_pv_cmd { /* Available with KVM_CAP_DIRTY_LOG_RING */ #define KVM_RESET_DIRTY_RINGS _IO(KVMIO, 0xc7) +/* Per-VM Xen attributes */ +#define KVM_XEN_HVM_GET_ATTR _IOWR(KVMIO, 0xc8, struct kvm_xen_hvm_attr) +#define KVM_XEN_HVM_SET_ATTR _IOW(KVMIO, 0xc9, struct kvm_xen_hvm_attr) + +struct kvm_xen_hvm_attr { + __u16 type; + __u16 pad[3]; + union { + __u8 long_mode; + __u8 vector; + struct { + __u64 gfn; + } shared_info; + __u64 pad[8]; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 +#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 +#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 + +/* Per-vCPU Xen attributes */ +#define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) +#define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr) + +struct kvm_xen_vcpu_attr { + __u16 type; + __u16 pad[3]; + union { + __u64 gpa; + __u64 pad[8]; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ @@ -1594,6 +1656,8 @@ enum sev_cmd_id { KVM_SEV_DBG_ENCRYPT, /* Guest certificates commands */ KVM_SEV_CERT_EXPORT, + /* Attestation report */ + KVM_SEV_GET_ATTESTATION_REPORT, KVM_SEV_NR_MAX, }; @@ -1646,6 +1710,12 @@ struct kvm_sev_dbg { __u32 len; }; +struct kvm_sev_attestation_report { + __u8 mnonce[16]; + __u64 uaddr; + __u32 len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) @@ -1767,4 +1837,7 @@ struct kvm_dirty_gfn { __u64 offset; }; +#define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) +#define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From b9fc8b4a591811546fec2dbef7e9f809362100c9 Mon Sep 17 00:00:00 2001 From: Grant Seltzer Date: Mon, 22 Feb 2021 19:58:46 +0000 Subject: bpf: Add kernel/modules BTF presence checks to bpftool feature command This adds both the CONFIG_DEBUG_INFO_BTF and CONFIG_DEBUG_INFO_BTF_MODULES kernel compile option to output of the bpftool feature command. This is relevant for developers that want to account for data structure definition differences between kernels. Signed-off-by: Grant Seltzer Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210222195846.155483-1-grantseltzer@gmail.com --- tools/bpf/bpftool/feature.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 359960a8f1de..40a88df275f9 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -336,6 +336,10 @@ static void probe_kernel_image_config(const char *define_prefix) { "CONFIG_BPF_JIT", }, /* Avoid compiling eBPF interpreter (use JIT only) */ { "CONFIG_BPF_JIT_ALWAYS_ON", }, + /* Kernel BTF debug information available */ + { "CONFIG_DEBUG_INFO_BTF", }, + /* Kernel module BTF debug information available */ + { "CONFIG_DEBUG_INFO_BTF_MODULES", }, /* cgroups */ { "CONFIG_CGROUPS", }, -- cgit v1.2.3 From 1f87dcf116adedd6b9f47258d4fdf4fa9ce74002 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:16 -0800 Subject: selftests/bpf: Add non-BPF_LSM test for task local storage Task local storage is enabled for tracing programs. Add two tests for task local storage without CONFIG_BPF_LSM. The first test stores a value in sys_enter and read it back in sys_exit. The second test checks whether the kernel allows allocating task local storage in exit_creds() (which it should not). Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210225234319.336131-4-songliubraving@fb.com --- .../selftests/bpf/prog_tests/task_local_storage.c | 69 ++++++++++++++++++++++ .../selftests/bpf/progs/task_local_storage.c | 64 ++++++++++++++++++++ .../bpf/progs/task_local_storage_exit_creds.c | 32 ++++++++++ 3 files changed, 165 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/task_local_storage.c create mode 100644 tools/testing/selftests/bpf/progs/task_local_storage.c create mode 100644 tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c new file mode 100644 index 000000000000..dbb7525cdd56 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include "task_local_storage.skel.h" +#include "task_local_storage_exit_creds.skel.h" + +static void test_sys_enter_exit(void) +{ + struct task_local_storage *skel; + int err; + + skel = task_local_storage__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + skel->bss->target_pid = syscall(SYS_gettid); + + err = task_local_storage__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + syscall(SYS_gettid); + syscall(SYS_gettid); + + /* 3x syscalls: 1x attach and 2x gettid */ + ASSERT_EQ(skel->bss->enter_cnt, 3, "enter_cnt"); + ASSERT_EQ(skel->bss->exit_cnt, 3, "exit_cnt"); + ASSERT_EQ(skel->bss->mismatch_cnt, 0, "mismatch_cnt"); +out: + task_local_storage__destroy(skel); +} + +static void test_exit_creds(void) +{ + struct task_local_storage_exit_creds *skel; + int err; + + skel = task_local_storage_exit_creds__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = task_local_storage_exit_creds__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + /* trigger at least one exit_creds() */ + if (CHECK_FAIL(system("ls > /dev/null"))) + goto out; + + /* sync rcu to make sure exit_creds() is called for "ls" */ + kern_sync_rcu(); + ASSERT_EQ(skel->bss->valid_ptr_count, 0, "valid_ptr_count"); + ASSERT_NEQ(skel->bss->null_ptr_count, 0, "null_ptr_count"); +out: + task_local_storage_exit_creds__destroy(skel); +} + +void test_task_local_storage(void) +{ + if (test__start_subtest("sys_enter_exit")) + test_sys_enter_exit(); + if (test__start_subtest("exit_creds")) + test_exit_creds(); +} diff --git a/tools/testing/selftests/bpf/progs/task_local_storage.c b/tools/testing/selftests/bpf/progs/task_local_storage.c new file mode 100644 index 000000000000..80a0a20db88d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_local_storage.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} enter_id SEC(".maps"); + +#define MAGIC_VALUE 0xabcd1234 + +pid_t target_pid = 0; +int mismatch_cnt = 0; +int enter_cnt = 0; +int exit_cnt = 0; + +SEC("tp_btf/sys_enter") +int BPF_PROG(on_enter, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = bpf_get_current_task_btf(); + if (task->pid != target_pid) + return 0; + + ptr = bpf_task_storage_get(&enter_id, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + __sync_fetch_and_add(&enter_cnt, 1); + *ptr = MAGIC_VALUE + enter_cnt; + + return 0; +} + +SEC("tp_btf/sys_exit") +int BPF_PROG(on_exit, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = bpf_get_current_task_btf(); + if (task->pid != target_pid) + return 0; + + ptr = bpf_task_storage_get(&enter_id, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + __sync_fetch_and_add(&exit_cnt, 1); + if (*ptr != MAGIC_VALUE + exit_cnt) + __sync_fetch_and_add(&mismatch_cnt, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c b/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c new file mode 100644 index 000000000000..81758c0aef99 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, __u64); +} task_storage SEC(".maps"); + +int valid_ptr_count = 0; +int null_ptr_count = 0; + +SEC("fentry/exit_creds") +int BPF_PROG(trace_exit_creds, struct task_struct *task) +{ + __u64 *ptr; + + ptr = bpf_task_storage_get(&task_storage, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + __sync_fetch_and_add(&valid_ptr_count, 1); + else + __sync_fetch_and_add(&null_ptr_count, 1); + return 0; +} -- cgit v1.2.3 From c540957a4d1d13934e93e53ce3056ac4108ffc29 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:17 -0800 Subject: selftests/bpf: Test deadlock from recursive bpf_task_storage_[get|delete] Add a test with recursive bpf_task_storage_[get|delete] from fentry programs on bpf_local_storage_lookup and bpf_local_storage_update. Without proper deadlock prevent mechanism, this test would cause deadlock. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210225234319.336131-5-songliubraving@fb.com --- .../selftests/bpf/prog_tests/task_local_storage.c | 23 +++++++ .../selftests/bpf/progs/task_ls_recursion.c | 70 ++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/task_ls_recursion.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c index dbb7525cdd56..035c263aab1b 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -8,6 +8,7 @@ #include #include "task_local_storage.skel.h" #include "task_local_storage_exit_creds.skel.h" +#include "task_ls_recursion.skel.h" static void test_sys_enter_exit(void) { @@ -60,10 +61,32 @@ out: task_local_storage_exit_creds__destroy(skel); } +static void test_recursion(void) +{ + struct task_ls_recursion *skel; + int err; + + skel = task_ls_recursion__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = task_ls_recursion__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + /* trigger sys_enter, make sure it does not cause deadlock */ + syscall(SYS_gettid); + +out: + task_ls_recursion__destroy(skel); +} + void test_task_local_storage(void) { if (test__start_subtest("sys_enter_exit")) test_sys_enter_exit(); if (test__start_subtest("exit_creds")) test_exit_creds(); + if (test__start_subtest("recursion")) + test_recursion(); } diff --git a/tools/testing/selftests/bpf/progs/task_ls_recursion.c b/tools/testing/selftests/bpf/progs/task_ls_recursion.c new file mode 100644 index 000000000000..564583dca7c8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_ls_recursion.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} map_a SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} map_b SEC(".maps"); + +SEC("fentry/bpf_local_storage_lookup") +int BPF_PROG(on_lookup) +{ + struct task_struct *task = bpf_get_current_task_btf(); + + bpf_task_storage_delete(&map_a, task); + bpf_task_storage_delete(&map_b, task); + return 0; +} + +SEC("fentry/bpf_local_storage_update") +int BPF_PROG(on_update) +{ + struct task_struct *task = bpf_get_current_task_btf(); + long *ptr; + + ptr = bpf_task_storage_get(&map_a, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr += 1; + + ptr = bpf_task_storage_get(&map_b, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr += 1; + + return 0; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(on_enter, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = bpf_get_current_task_btf(); + ptr = bpf_task_storage_get(&map_a, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr = 200; + + ptr = bpf_task_storage_get(&map_b, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr = 100; + return 0; +} -- cgit v1.2.3 From 4b0d2d4156cfb9f27d8e52a33d3522a78002fca1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:18 -0800 Subject: bpf: runqslower: Prefer using local vmlimux to generate vmlinux.h Update the Makefile to prefer using $(O)/vmlinux, $(KBUILD_OUTPUT)/vmlinux (for selftests) or ../../../vmlinux. These two files should have latest definitions for vmlinux.h. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210225234319.336131-6-songliubraving@fb.com --- tools/bpf/runqslower/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 9d9fb6209be1..c96ba90c6f01 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -16,7 +16,10 @@ CFLAGS := -g -Wall # Try to detect best kernel BTF source KERNEL_REL := $(shell uname -r) -VMLINUX_BTF_PATHS := /sys/kernel/btf/vmlinux /boot/vmlinux-$(KERNEL_REL) +VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../../vmlinux /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(KERNEL_REL) VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ $(wildcard $(VMLINUX_BTF_PATHS)))) -- cgit v1.2.3 From ced47e30ab8b3ed986e28411f63e041b51c1fdf8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:19 -0800 Subject: bpf: runqslower: Use task local storage Replace hashtab with task local storage in runqslower. This improves the performance of these BPF programs. The following table summarizes average runtime of these programs, in nanoseconds: task-local hash-prealloc hash-no-prealloc handle__sched_wakeup 125 340 3124 handle__sched_wakeup_new 2812 1510 2998 handle__sched_switch 151 208 991 Note that, task local storage gives better performance than hashtab for handle__sched_wakeup and handle__sched_switch. On the other hand, for handle__sched_wakeup_new, task local storage is slower than hashtab with prealloc. This is because handle__sched_wakeup_new accesses the data for the first time, so it has to allocate the data for task local storage. Once the initial allocation is done, subsequent accesses, as those in handle__sched_wakeup, are much faster with task local storage. If we disable hashtab prealloc, task local storage is much faster for all 3 functions. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210225234319.336131-7-songliubraving@fb.com --- tools/bpf/runqslower/runqslower.bpf.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index 1f18a409f044..645530ca7e98 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0; const volatile pid_t targ_pid = 0; struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 10240); - __type(key, u32); + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); __type(value, u64); } start SEC(".maps"); @@ -25,15 +25,20 @@ struct { /* record enqueue timestamp */ __always_inline -static int trace_enqueue(u32 tgid, u32 pid) +static int trace_enqueue(struct task_struct *t) { - u64 ts; + u32 pid = t->pid; + u64 *ptr; if (!pid || (targ_pid && targ_pid != pid)) return 0; - ts = bpf_ktime_get_ns(); - bpf_map_update_elem(&start, &pid, &ts, 0); + ptr = bpf_task_storage_get(&start, t, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + *ptr = bpf_ktime_get_ns(); return 0; } @@ -43,7 +48,7 @@ int handle__sched_wakeup(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_wakeup_new") @@ -52,7 +57,7 @@ int handle__sched_wakeup_new(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_switch") @@ -70,12 +75,16 @@ int handle__sched_switch(u64 *ctx) /* ivcsw: treat like an enqueue event and store timestamp */ if (prev->state == TASK_RUNNING) - trace_enqueue(prev->tgid, prev->pid); + trace_enqueue(prev); pid = next->pid; + /* For pid mismatch, save a bpf_task_storage_get */ + if (!pid || (targ_pid && targ_pid != pid)) + return 0; + /* fetch timestamp and calculate delta */ - tsp = bpf_map_lookup_elem(&start, &pid); + tsp = bpf_task_storage_get(&start, next, 0, 0); if (!tsp) return 0; /* missed enqueue */ @@ -91,7 +100,7 @@ int handle__sched_switch(u64 *ctx) bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); - bpf_map_delete_elem(&start, &pid); + bpf_task_storage_delete(&start, next); return 0; } -- cgit v1.2.3 From ecde60614d5ed60fde1c80b38b71582a3ea2e662 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 23 Feb 2021 16:23:01 +0000 Subject: selftest/bpf: Make xsk tests less verbose Make the xsk tests less verbose by only printing the essentials. Currently, it is hard to see if the tests passed or not due to all the printouts. Move the extra printouts to a verbose option, if further debugging is needed when a problem arises. To run the xsk tests with verbose output: ./test_xsk.sh -v Signed-off-by: Magnus Karlsson Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20210223162304.7450-2-ciara.loftus@intel.com --- tools/testing/selftests/bpf/test_xsk.sh | 27 ++++++++++++++++++++------ tools/testing/selftests/bpf/xdpxceiver.c | 31 +++++++++++++++++------------- tools/testing/selftests/bpf/xdpxceiver.h | 13 ++++++++----- tools/testing/selftests/bpf/xsk_prereqs.sh | 9 +++------ 4 files changed, 50 insertions(+), 30 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 88a7483eaae4..f4cedf4c2718 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -71,13 +71,17 @@ # # Run (full output without color-coding): # sudo ./test_xsk.sh +# +# Run with verbose output: +# sudo ./test_xsk.sh -v . xsk_prereqs.sh -while getopts c flag +while getopts "cv" flag do case "${flag}" in c) colorconsole=1;; + v) verbose=1;; esac done @@ -95,13 +99,17 @@ NS1=af_xdp${VETH1_POSTFIX} MTU=1500 setup_vethPairs() { - echo "setting up ${VETH0}: namespace: ${NS0}" + if [[ $verbose -eq 1 ]]; then + echo "setting up ${VETH0}: namespace: ${NS0}" + fi ip netns add ${NS1} ip link add ${VETH0} type veth peer name ${VETH1} if [ -f /proc/net/if_inet6 ]; then echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6 fi - echo "setting up ${VETH1}: namespace: ${NS1}" + if [[ $verbose -eq 1 ]]; then + echo "setting up ${VETH1}: namespace: ${NS1}" + fi ip link set ${VETH1} netns ${NS1} ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU} ip link set ${VETH0} mtu ${MTU} @@ -125,7 +133,10 @@ echo "${VETH0}:${VETH1},${NS1}" > ${SPECFILE} validate_veth_spec_file -echo "Spec file created: ${SPECFILE}" +if [[ $verbose -eq 1 ]]; then + echo "Spec file created: ${SPECFILE}" + VERBOSE_ARG="-v" +fi test_status $retval "${TEST_NAME}" @@ -136,12 +147,16 @@ statusList=() ### TEST 1 TEST_NAME="XSK KSELFTEST FRAMEWORK" -echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Generic mode" +if [[ $verbose -eq 1 ]]; then + echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Generic mode" +fi vethXDPgeneric ${VETH0} ${VETH1} ${NS1} retval=$? if [ $retval -eq 0 ]; then - echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Native mode" + if [[ $verbose -eq 1 ]]; then + echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Native mode" + fi vethXDPnative ${VETH0} ${VETH1} ${NS1} fi diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index f4a96d5ff524..8af746c9a6b6 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -341,6 +341,7 @@ static struct option long_options[] = { {"tear-down", no_argument, 0, 'T'}, {"bidi", optional_argument, 0, 'B'}, {"debug", optional_argument, 0, 'D'}, + {"verbose", no_argument, 0, 'v'}, {"tx-pkt-count", optional_argument, 0, 'C'}, {0, 0, 0, 0} }; @@ -359,6 +360,7 @@ static void usage(const char *prog) " -T, --tear-down Tear down sockets by repeatedly recreating them\n" " -B, --bidi Bi-directional sockets test\n" " -D, --debug Debug mode - dump packets L2 - L5\n" + " -v, --verbose Verbose output\n" " -C, --tx-pkt-count=n Number of packets to send\n"; ksft_print_msg(str, prog); } @@ -392,7 +394,7 @@ static void *nsswitchthread(void *args) ksft_test_result_fail("ERROR: [%s] interface \"%s\" does not exist\n", __func__, ifdict[targs->idx]->ifname); } else { - ksft_print_msg("Interface found: %s\n", ifdict[targs->idx]->ifname); + print_verbose("Interface found: %s\n", ifdict[targs->idx]->ifname); targs->retptr = true; } } @@ -422,7 +424,7 @@ static int validate_interfaces(void) pthread_join(ns_thread, NULL); if (targs->retptr) - ksft_print_msg("NS switched: %s\n", ifdict[i]->nsname); + print_verbose("NS switched: %s\n", ifdict[i]->nsname); free(targs); } else { @@ -432,7 +434,7 @@ static int validate_interfaces(void) ("ERROR: interface \"%s\" does not exist\n", ifdict[i]->ifname); ret = false; } else { - ksft_print_msg("Interface found: %s\n", ifdict[i]->ifname); + print_verbose("Interface found: %s\n", ifdict[i]->ifname); } } } @@ -446,7 +448,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "i:q:pSNcTBDC:", long_options, &option_index); + c = getopt_long(argc, argv, "i:q:pSNcTBDC:v", long_options, &option_index); if (c == -1) break; @@ -497,6 +499,9 @@ static void parse_command_line(int argc, char **argv) case 'C': opt_pkt_count = atoi(optarg); break; + case 'v': + opt_verbose = 1; + break; default: usage(basename(argv[0])); ksft_exit_xfail(); @@ -714,7 +719,7 @@ static void worker_pkt_dump(void) int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE)); if (payload == EOT) { - ksft_print_msg("End-of-transmission frame received\n"); + print_verbose("End-of-transmission frame received\n"); fprintf(stdout, "---------------------------------------\n"); break; } @@ -746,7 +751,7 @@ static void worker_pkt_validate(void) } if (payloadseqnum == EOT) { - ksft_print_msg("End-of-transmission frame received: PASS\n"); + print_verbose("End-of-transmission frame received: PASS\n"); sigvar = 1; break; } @@ -836,7 +841,7 @@ static void *worker_testapp_validate(void *arg) usleep(USLEEP_MAX); } - ksft_print_msg("Interface [%s] vector [Tx]\n", ifobject->ifname); + print_verbose("Interface [%s] vector [Tx]\n", ifobject->ifname); for (int i = 0; i < num_frames; i++) { /*send EOT frame */ if (i == (num_frames - 1)) @@ -850,7 +855,7 @@ static void *worker_testapp_validate(void *arg) gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); } - ksft_print_msg("Sending %d packets on interface %s\n", + print_verbose("Sending %d packets on interface %s\n", (opt_pkt_count - 1), ifobject->ifname); tx_only_all(ifobject); } else if (ifobject->fv.vector == rx) { @@ -860,7 +865,7 @@ static void *worker_testapp_validate(void *arg) if (!bidi_pass) thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); - ksft_print_msg("Interface [%s] vector [Rx]\n", ifobject->ifname); + print_verbose("Interface [%s] vector [Rx]\n", ifobject->ifname); xsk_populate_fill_ring(ifobject->umem); TAILQ_INIT(&head); @@ -890,11 +895,11 @@ static void *worker_testapp_validate(void *arg) break; } - ksft_print_msg("Received %d packets on interface %s\n", + print_verbose("Received %d packets on interface %s\n", pkt_counter, ifobject->ifname); if (opt_teardown) - ksft_print_msg("Destroying socket\n"); + print_verbose("Destroying socket\n"); } if (!opt_bidi || bidi_pass) { @@ -914,7 +919,7 @@ static void testapp_validate(void) if (opt_bidi && bidi_pass) { pthread_init_mutex(); if (!switching_notify) { - ksft_print_msg("Switching Tx/Rx vectors\n"); + print_verbose("Switching Tx/Rx vectors\n"); switching_notify++; } } @@ -974,7 +979,7 @@ static void testapp_sockets(void) pkt_counter = 0; prev_pkt = -1; sigvar = 0; - ksft_print_msg("Creating socket\n"); + print_verbose("Creating socket\n"); testapp_validate(); opt_bidi ? bidi_pass++ : bidi_pass; } diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 0e9f9b7e61c2..f66f399dfb2d 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -42,6 +42,8 @@ #define POLL_TMOUT 1000 #define NEED_WAKEUP true +#define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0) + typedef __u32 u32; typedef __u16 u16; typedef __u8 u8; @@ -51,11 +53,11 @@ enum TESTS { ORDER_CONTENT_VALIDATE_XDP_DRV = 1, }; -u8 uut; -u8 debug_pkt_dump; -u32 num_frames; -u8 switching_notify; -u8 bidi_pass; +static u8 uut; +static u8 debug_pkt_dump; +static u32 num_frames; +static u8 switching_notify; +static u8 bidi_pass; static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static int opt_queue; @@ -64,6 +66,7 @@ static int opt_poll; static int opt_teardown; static int opt_bidi; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; +static u8 opt_verbose; static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static u32 pkt_counter; static u32 prev_pkt = -1; diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index 9d54c4645127..ef8c5b31f4b6 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -82,24 +82,21 @@ clear_configs() { if [ $(ip netns show | grep $3 &>/dev/null; echo $?;) == 0 ]; then [ $(ip netns exec $3 ip link show $2 &>/dev/null; echo $?;) == 0 ] && - { echo "removing link $1:$2"; ip netns exec $3 ip link del $2; } - echo "removing ns $3" + { ip netns exec $3 ip link del $2; } ip netns del $3 fi #Once we delete a veth pair node, the entire veth pair is removed, #this is just to be cautious just incase the NS does not exist then #veth node inside NS won't get removed so we explicitly remove it [ $(ip link show $1 &>/dev/null; echo $?;) == 0 ] && - { echo "removing link $1"; ip link del $1; } + { ip link del $1; } if [ -f ${SPECFILE} ]; then - echo "removing spec file:" ${SPECFILE} rm -f ${SPECFILE} fi } cleanup_exit() { - echo "cleaning up..." clear_configs $1 $2 $3 } @@ -131,5 +128,5 @@ execxdpxceiver() copy[$index]=${!current} done - ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} + ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} ${VERBOSE_ARG} } -- cgit v1.2.3 From d2b0dfd5d1f94fe74ed580b5a1d5fdb5bf11f2fb Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Tue, 23 Feb 2021 16:23:02 +0000 Subject: selftests/bpf: Expose and rename debug argument Launching xdpxceiver with -D enables what was formerly know as 'debug' mode. Rename this mode to 'dump-pkts' as it better describes the behavior enabled by the option. New usage: ./xdpxceiver .. -D or ./xdpxceiver .. --dump-pkts Also make it possible to pass this flag to the app via the test_xsk.sh shell script like so: ./test_xsk.sh -D Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210223162304.7450-3-ciara.loftus@intel.com --- tools/testing/selftests/bpf/test_xsk.sh | 10 +++++++++- tools/testing/selftests/bpf/xdpxceiver.c | 6 +++--- tools/testing/selftests/bpf/xsk_prereqs.sh | 3 ++- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index f4cedf4c2718..dbb129a36606 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -74,14 +74,18 @@ # # Run with verbose output: # sudo ./test_xsk.sh -v +# +# Run and dump packet contents: +# sudo ./test_xsk.sh -D . xsk_prereqs.sh -while getopts "cv" flag +while getopts "cvD" flag do case "${flag}" in c) colorconsole=1;; v) verbose=1;; + D) dump_pkts=1;; esac done @@ -138,6 +142,10 @@ if [[ $verbose -eq 1 ]]; then VERBOSE_ARG="-v" fi +if [[ $dump_pkts -eq 1 ]]; then + DUMP_PKTS_ARG="-D" +fi + test_status $retval "${TEST_NAME}" ## START TESTS diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 8af746c9a6b6..506423201197 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -58,7 +58,7 @@ * - Rx thread verifies if all 10k packets were received and delivered in-order, * and have the right content * - * Enable/disable debug mode: + * Enable/disable packet dump mode: * -------------------------- * To enable L2 - L4 headers and payload dump of each packet on STDOUT, add * parameter -D to params array in test_xsk.sh, i.e. params=("-S" "-D") @@ -340,7 +340,7 @@ static struct option long_options[] = { {"copy", no_argument, 0, 'c'}, {"tear-down", no_argument, 0, 'T'}, {"bidi", optional_argument, 0, 'B'}, - {"debug", optional_argument, 0, 'D'}, + {"dump-pkts", optional_argument, 0, 'D'}, {"verbose", no_argument, 0, 'v'}, {"tx-pkt-count", optional_argument, 0, 'C'}, {0, 0, 0, 0} @@ -359,7 +359,7 @@ static void usage(const char *prog) " -c, --copy Force copy mode\n" " -T, --tear-down Tear down sockets by repeatedly recreating them\n" " -B, --bidi Bi-directional sockets test\n" - " -D, --debug Debug mode - dump packets L2 - L5\n" + " -D, --dump-pkts Dump packets L2 - L5\n" " -v, --verbose Verbose output\n" " -C, --tx-pkt-count=n Number of packets to send\n"; ksft_print_msg(str, prog); diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index ef8c5b31f4b6..da93575d757a 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -128,5 +128,6 @@ execxdpxceiver() copy[$index]=${!current} done - ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} ${VERBOSE_ARG} + ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} ${VERBOSE_ARG} \ + ${DUMP_PKTS_ARG} } -- cgit v1.2.3 From d3e3bf5b4c673e79c5a11608f53d4e0059a7ec79 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Tue, 23 Feb 2021 16:23:03 +0000 Subject: selftests/bpf: Restructure xsk selftests Prior to this commit individual xsk tests were launched from the shell script 'test_xsk.sh'. When adding a new test type, two new test configurations had to be added to this file - one for each of the supported XDP 'modes' (skb or drv). Should zero copy support be added to the xsk selftest framework in the future, three new test configurations would need to be added for each new test type. Each new test type also typically requires new CLI arguments for the xdpxceiver program. This commit aims to reduce the overhead of adding new tests, by launching the test configurations from within the xdpxceiver program itself, using simple loops. Every test is run every time the C program is executed. Many of the CLI arguments can be removed as a result. Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Reviewed-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20210223162304.7450-4-ciara.loftus@intel.com --- tools/testing/selftests/bpf/test_xsk.sh | 112 +-------------- tools/testing/selftests/bpf/xdpxceiver.c | 218 ++++++++++++++++++----------- tools/testing/selftests/bpf/xdpxceiver.h | 33 +++-- tools/testing/selftests/bpf/xsk_prereqs.sh | 24 +--- 4 files changed, 159 insertions(+), 228 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index dbb129a36606..56d4474e2c83 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -152,117 +152,9 @@ test_status $retval "${TEST_NAME}" statusList=() -### TEST 1 -TEST_NAME="XSK KSELFTEST FRAMEWORK" +TEST_NAME="XSK KSELFTESTS" -if [[ $verbose -eq 1 ]]; then - echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Generic mode" -fi -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -retval=$? -if [ $retval -eq 0 ]; then - if [[ $verbose -eq 1 ]]; then - echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Native mode" - fi - vethXDPnative ${VETH0} ${VETH1} ${NS1} -fi - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 2 -TEST_NAME="SKB NOPOLL" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 3 -TEST_NAME="SKB POLL" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-p") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 4 -TEST_NAME="DRV NOPOLL" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 5 -TEST_NAME="DRV POLL" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N" "-p") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 6 -TEST_NAME="SKB SOCKET TEARDOWN" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-T") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 7 -TEST_NAME="DRV SOCKET TEARDOWN" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N" "-T") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 8 -TEST_NAME="SKB BIDIRECTIONAL SOCKETS" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-B") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 9 -TEST_NAME="DRV BIDIRECTIONAL SOCKETS" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N" "-B") -execxdpxceiver params +execxdpxceiver retval=$? test_status $retval "${TEST_NAME}" diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 506423201197..e7913444518d 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -18,12 +18,7 @@ * These selftests test AF_XDP SKB and Native/DRV modes using veth * Virtual Ethernet interfaces. * - * The following tests are run: - * - * 1. AF_XDP SKB mode - * Generic mode XDP is driver independent, used when the driver does - * not have support for XDP. Works on any netdevice using sockets and - * generic XDP path. XDP hook from netif_receive_skb(). + * For each mode, the following tests are run: * a. nopoll - soft-irq processing * b. poll - using poll() syscall * c. Socket Teardown @@ -34,17 +29,6 @@ * completion rings on each socket, tx/rx in both directions. Only nopoll * mode is used * - * 2. AF_XDP DRV/Native mode - * Works on any netdevice with XDP_REDIRECT support, driver dependent. Processes - * packets before SKB allocation. Provides better performance than SKB. Driver - * hook available just after DMA of buffer descriptor. - * a. nopoll - * b. poll - * c. Socket Teardown - * d. Bi-directional sockets - * - Only copy mode is supported because veth does not currently support - * zero-copy mode - * * Total tests: 8 * * Flow: @@ -98,17 +82,23 @@ typedef __u16 __sum16; static void __exit_with_error(int error, const char *file, const char *func, int line) { - ksft_test_result_fail - ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); - ksft_exit_xfail(); + if (configured_mode == TEST_MODE_UNCONFIGURED) { + ksft_exit_fail_msg + ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); + } else { + ksft_test_result_fail + ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); + ksft_exit_xfail(); + } } #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) #define print_ksft_result(void)\ - (ksft_test_result_pass("PASS: %s %s %s%s\n", uut ? "DRV" : "SKB", opt_poll ? "POLL" :\ - "NOPOLL", opt_teardown ? "Socket Teardown" : "",\ - opt_bidi ? "Bi-directional Sockets" : "")) + (ksft_test_result_pass("PASS: %s %s %s%s\n", configured_mode ? "DRV" : "SKB",\ + test_type == TEST_TYPE_POLL ? "POLL" : "NOPOLL",\ + test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\ + test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "")) static void pthread_init_mutex(void) { @@ -311,10 +301,10 @@ static int xsk_configure_socket(struct ifobject *ifobject) cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; cfg.libbpf_flags = 0; - cfg.xdp_flags = opt_xdp_flags; - cfg.bind_flags = opt_xdp_bind_flags; + cfg.xdp_flags = xdp_flags; + cfg.bind_flags = xdp_bind_flags; - if (!opt_bidi) { + if (test_type != TEST_TYPE_BIDI) { rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL; txr = (ifobject->fv.vector == tx) ? &ifobject->xsk->tx : NULL; } else { @@ -334,12 +324,6 @@ static int xsk_configure_socket(struct ifobject *ifobject) static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"queue", optional_argument, 0, 'q'}, - {"poll", no_argument, 0, 'p'}, - {"xdp-skb", no_argument, 0, 'S'}, - {"xdp-native", no_argument, 0, 'N'}, - {"copy", no_argument, 0, 'c'}, - {"tear-down", no_argument, 0, 'T'}, - {"bidi", optional_argument, 0, 'B'}, {"dump-pkts", optional_argument, 0, 'D'}, {"verbose", no_argument, 0, 'v'}, {"tx-pkt-count", optional_argument, 0, 'C'}, @@ -353,12 +337,6 @@ static void usage(const char *prog) " Options:\n" " -i, --interface Use interface\n" " -q, --queue=n Use queue n (default 0)\n" - " -p, --poll Use poll syscall\n" - " -S, --xdp-skb=n Use XDP SKB mode\n" - " -N, --xdp-native=n Enforce XDP DRV (native) mode\n" - " -c, --copy Force copy mode\n" - " -T, --tear-down Tear down sockets by repeatedly recreating them\n" - " -B, --bidi Bi-directional sockets test\n" " -D, --dump-pkts Dump packets L2 - L5\n" " -v, --verbose Verbose output\n" " -C, --tx-pkt-count=n Number of packets to send\n"; @@ -448,7 +426,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "i:q:pSNcTBDC:v", long_options, &option_index); + c = getopt_long(argc, argv, "i:q:DC:v", long_options, &option_index); if (c == -1) break; @@ -471,28 +449,6 @@ static void parse_command_line(int argc, char **argv) case 'q': opt_queue = atoi(optarg); break; - case 'p': - opt_poll = 1; - break; - case 'S': - opt_xdp_flags |= XDP_FLAGS_SKB_MODE; - opt_xdp_bind_flags |= XDP_COPY; - uut = ORDER_CONTENT_VALIDATE_XDP_SKB; - break; - case 'N': - opt_xdp_flags |= XDP_FLAGS_DRV_MODE; - opt_xdp_bind_flags |= XDP_COPY; - uut = ORDER_CONTENT_VALIDATE_XDP_DRV; - break; - case 'c': - opt_xdp_bind_flags |= XDP_COPY; - break; - case 'T': - opt_teardown = 1; - break; - case 'B': - opt_bidi = 1; - break; case 'D': debug_pkt_dump = 1; break; @@ -508,6 +464,11 @@ static void parse_command_line(int argc, char **argv) } } + if (!opt_pkt_count) { + print_verbose("No tx-pkt-count specified, using default %u\n", DEFAULT_PKT_CNT); + opt_pkt_count = DEFAULT_PKT_CNT; + } + if (!validate_interfaces()) { usage(basename(argv[0])); ksft_exit_xfail(); @@ -659,7 +620,7 @@ static void tx_only_all(struct ifobject *ifobject) while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { int batch_size = get_batch_size(pkt_cnt); - if (opt_poll) { + if (test_type == TEST_TYPE_POLL) { ret = poll(fds, 1, POLL_TMOUT); if (ret <= 0) continue; @@ -883,7 +844,7 @@ static void *worker_testapp_validate(void *arg) pthread_mutex_unlock(&sync_mutex); while (1) { - if (opt_poll) { + if (test_type == TEST_TYPE_POLL) { ret = poll(fds, 1, POLL_TMOUT); if (ret <= 0) continue; @@ -898,11 +859,11 @@ static void *worker_testapp_validate(void *arg) print_verbose("Received %d packets on interface %s\n", pkt_counter, ifobject->ifname); - if (opt_teardown) + if (test_type == TEST_TYPE_TEARDOWN) print_verbose("Destroying socket\n"); } - if (!opt_bidi || bidi_pass) { + if ((test_type != TEST_TYPE_BIDI) || bidi_pass) { xsk_socket__delete(ifobject->xsk->xsk); (void)xsk_umem__delete(ifobject->umem->umem); } @@ -912,11 +873,12 @@ static void *worker_testapp_validate(void *arg) static void testapp_validate(void) { struct timespec max_wait = { 0, 0 }; + bool bidi = test_type == TEST_TYPE_BIDI; pthread_attr_init(&attr); pthread_attr_setstacksize(&attr, THREAD_STACK); - if (opt_bidi && bidi_pass) { + if ((test_type == TEST_TYPE_BIDI) && bidi_pass) { pthread_init_mutex(); if (!switching_notify) { print_verbose("Switching Tx/Rx vectors\n"); @@ -927,10 +889,10 @@ static void testapp_validate(void) pthread_mutex_lock(&sync_mutex); /*Spawn RX thread */ - if (!opt_bidi || !bidi_pass) { + if (!bidi || !bidi_pass) { if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[1])) exit_with_error(errno); - } else if (opt_bidi && bidi_pass) { + } else if (bidi && bidi_pass) { /*switch Tx/Rx vectors */ ifdict[0]->fv.vector = rx; if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[0])) @@ -947,10 +909,10 @@ static void testapp_validate(void) pthread_mutex_unlock(&sync_mutex); /*Spawn TX thread */ - if (!opt_bidi || !bidi_pass) { + if (!bidi || !bidi_pass) { if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[0])) exit_with_error(errno); - } else if (opt_bidi && bidi_pass) { + } else if (bidi && bidi_pass) { /*switch Tx/Rx vectors */ ifdict[1]->fv.vector = tx; if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[1])) @@ -969,19 +931,20 @@ static void testapp_validate(void) free(pkt_buf); } - if (!opt_teardown && !opt_bidi) + if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi) print_ksft_result(); } static void testapp_sockets(void) { - for (int i = 0; i < (opt_teardown ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); i++) { + for (int i = 0; i < ((test_type == TEST_TYPE_TEARDOWN) ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); + i++) { pkt_counter = 0; prev_pkt = -1; sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); - opt_bidi ? bidi_pass++ : bidi_pass; + test_type == TEST_TYPE_BIDI ? bidi_pass++ : bidi_pass; } print_ksft_result(); @@ -1008,6 +971,98 @@ static void init_iface_config(struct ifaceconfigobj *ifaceconfig) ifdict[1]->src_port = ifaceconfig->dst_port; } +static void *nsdisablemodethread(void *args) +{ + struct targs *targs = args; + + targs->retptr = false; + + if (switch_namespace(targs->idx)) { + targs->retptr = bpf_set_link_xdp_fd(ifdict[targs->idx]->ifindex, -1, targs->flags); + } else { + targs->retptr = errno; + print_verbose("Failed to switch namespace to %s\n", ifdict[targs->idx]->nsname); + } + + pthread_exit(NULL); +} + +static void disable_xdp_mode(int mode) +{ + int err = 0; + __u32 flags = XDP_FLAGS_UPDATE_IF_NOEXIST | mode; + char *mode_str = mode & XDP_FLAGS_SKB_MODE ? "skb" : "drv"; + + for (int i = 0; i < MAX_INTERFACES; i++) { + if (strcmp(ifdict[i]->nsname, "")) { + struct targs *targs; + + targs = malloc(sizeof(*targs)); + memset(targs, 0, sizeof(*targs)); + if (!targs) + exit_with_error(errno); + + targs->idx = i; + targs->flags = flags; + if (pthread_create(&ns_thread, NULL, nsdisablemodethread, targs)) + exit_with_error(errno); + + pthread_join(ns_thread, NULL); + err = targs->retptr; + free(targs); + } else { + err = bpf_set_link_xdp_fd(ifdict[i]->ifindex, -1, flags); + } + + if (err) { + print_verbose("Failed to disable %s mode on interface %s\n", + mode_str, ifdict[i]->ifname); + exit_with_error(err); + } + + print_verbose("Disabled %s mode for interface: %s\n", mode_str, ifdict[i]->ifname); + configured_mode = mode & XDP_FLAGS_SKB_MODE ? TEST_MODE_DRV : TEST_MODE_SKB; + } +} + +static void run_pkt_test(int mode, int type) +{ + test_type = type; + + /* reset defaults after potential previous test */ + xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + pkt_counter = 0; + switching_notify = 0; + bidi_pass = 0; + prev_pkt = -1; + ifdict[0]->fv.vector = tx; + ifdict[1]->fv.vector = rx; + + switch (mode) { + case (TEST_MODE_SKB): + if (configured_mode == TEST_MODE_DRV) + disable_xdp_mode(XDP_FLAGS_DRV_MODE); + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case (TEST_MODE_DRV): + if (configured_mode == TEST_MODE_SKB) + disable_xdp_mode(XDP_FLAGS_SKB_MODE); + xdp_flags |= XDP_FLAGS_DRV_MODE; + break; + default: + break; + } + + pthread_init_mutex(); + + if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI)) + testapp_validate(); + else + testapp_sockets(); + + pthread_destroy_mutex(); +} + int main(int argc, char **argv) { struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY }; @@ -1021,6 +1076,7 @@ int main(int argc, char **argv) const char *IP2 = "192.168.100.161"; u16 UDP_DST_PORT = 2020; u16 UDP_SRC_PORT = 2121; + int i, j; ifaceconfig = malloc(sizeof(struct ifaceconfigobj)); memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN); @@ -1046,24 +1102,18 @@ int main(int argc, char **argv) init_iface_config(ifaceconfig); - pthread_init_mutex(); + disable_xdp_mode(XDP_FLAGS_DRV_MODE); - ksft_set_plan(1); + ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); - if (!opt_teardown && !opt_bidi) { - testapp_validate(); - } else if (opt_teardown && opt_bidi) { - ksft_test_result_fail("ERROR: parameters -T and -B cannot be used together\n"); - ksft_exit_xfail(); - } else { - testapp_sockets(); + for (i = 0; i < TEST_MODE_MAX; i++) { + for (j = 0; j < TEST_TYPE_MAX; j++) + run_pkt_test(i, j); } for (int i = 0; i < MAX_INTERFACES; i++) free(ifdict[i]); - pthread_destroy_mutex(); - ksft_exit_pass(); return 0; diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index f66f399dfb2d..e05703f661f8 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -41,6 +41,7 @@ #define BATCH_SIZE 64 #define POLL_TMOUT 1000 #define NEED_WAKEUP true +#define DEFAULT_PKT_CNT 10000 #define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0) @@ -48,28 +49,37 @@ typedef __u32 u32; typedef __u16 u16; typedef __u8 u8; -enum TESTS { - ORDER_CONTENT_VALIDATE_XDP_SKB = 0, - ORDER_CONTENT_VALIDATE_XDP_DRV = 1, +enum TEST_MODES { + TEST_MODE_UNCONFIGURED = -1, + TEST_MODE_SKB, + TEST_MODE_DRV, + TEST_MODE_MAX }; -static u8 uut; +enum TEST_TYPES { + TEST_TYPE_NOPOLL, + TEST_TYPE_POLL, + TEST_TYPE_TEARDOWN, + TEST_TYPE_BIDI, + TEST_TYPE_MAX +}; + +static int configured_mode = TEST_MODE_UNCONFIGURED; static u8 debug_pkt_dump; static u32 num_frames; static u8 switching_notify; static u8 bidi_pass; +static int test_type; -static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static int opt_queue; static int opt_pkt_count; -static int opt_poll; -static int opt_teardown; -static int opt_bidi; -static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; static u8 opt_verbose; + +static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static u32 xdp_bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static u32 pkt_counter; -static u32 prev_pkt = -1; +static long prev_pkt = -1; static int sigvar; struct xsk_umem_info { @@ -140,8 +150,9 @@ pthread_t t0, t1, ns_thread; pthread_attr_t attr; struct targs { - bool retptr; + u8 retptr; int idx; + u32 flags; }; TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index da93575d757a..dac1c5f78752 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -105,29 +105,7 @@ validate_ip_utility() [ ! $(type -P ip) ] && { echo "'ip' not found. Skipping tests."; test_exit $ksft_skip 1; } } -vethXDPgeneric() -{ - ip link set dev $1 xdpdrv off - ip netns exec $3 ip link set dev $2 xdpdrv off -} - -vethXDPnative() -{ - ip link set dev $1 xdpgeneric off - ip netns exec $3 ip link set dev $2 xdpgeneric off -} - execxdpxceiver() { - local -a 'paramkeys=("${!'"$1"'[@]}")' copy - paramkeysstr=${paramkeys[*]} - - for index in $paramkeysstr; - do - current=$1"[$index]" - copy[$index]=${!current} - done - - ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} ${VERBOSE_ARG} \ - ${DUMP_PKTS_ARG} + ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} -C ${NUMPKTS} ${VERBOSE_ARG} ${DUMP_PKTS_ARG} } -- cgit v1.2.3 From b267e5a458a719f3f5eaaaebe87c5f4a13584832 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Tue, 23 Feb 2021 16:23:04 +0000 Subject: selftests/bpf: Introduce xsk statistics tests This commit introduces a range of tests to the xsk testsuite for validating xsk statistics. A new test type called 'stats' is added. Within it there are four sub-tests. Each test configures a scenario which should trigger the given error statistic. The test passes if the statistic is successfully incremented. The four statistics for which tests have been created are: 1. rx dropped Increase the UMEM frame headroom to a value which results in insufficient space in the rx buffer for both the packet and the headroom. 2. tx invalid Set the 'len' field of tx descriptors to an invalid value (umem frame size + 1). 3. rx ring full Reduce the size of the RX ring to a fraction of the fill ring size. 4. fill queue empty Do not populate the fill queue and then try to receive pkts. Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Reviewed-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20210223162304.7450-5-ciara.loftus@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 137 +++++++++++++++++++++++++++---- tools/testing/selftests/bpf/xdpxceiver.h | 13 +++ 2 files changed, 136 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index e7913444518d..8b0f7fdd9003 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -28,8 +28,21 @@ * Configure sockets as bi-directional tx/rx sockets, sets up fill and * completion rings on each socket, tx/rx in both directions. Only nopoll * mode is used + * e. Statistics + * Trigger some error conditions and ensure that the appropriate statistics + * are incremented. Within this test, the following statistics are tested: + * i. rx dropped + * Increase the UMEM frame headroom to a value which results in + * insufficient space in the rx buffer for both the packet and the headroom. + * ii. tx invalid + * Set the 'len' field of tx descriptors to an invalid value (umem frame + * size + 1). + * iii. rx ring full + * Reduce the size of the RX ring to a fraction of the fill ring size. + * iv. fill queue empty + * Do not populate the fill queue and then try to receive pkts. * - * Total tests: 8 + * Total tests: 10 * * Flow: * ----- @@ -95,10 +108,11 @@ static void __exit_with_error(int error, const char *file, const char *func, int #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) #define print_ksft_result(void)\ - (ksft_test_result_pass("PASS: %s %s %s%s\n", configured_mode ? "DRV" : "SKB",\ + (ksft_test_result_pass("PASS: %s %s %s%s%s\n", configured_mode ? "DRV" : "SKB",\ test_type == TEST_TYPE_POLL ? "POLL" : "NOPOLL",\ test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\ - test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "")) + test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\ + test_type == TEST_TYPE_STATS ? "Stats" : "")) static void pthread_init_mutex(void) { @@ -260,13 +274,20 @@ static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) { int ret; + struct xsk_umem_config cfg = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .frame_headroom = frame_headroom, + .flags = XSK_UMEM__DEFAULT_FLAGS + }; data->umem = calloc(1, sizeof(struct xsk_umem_info)); if (!data->umem) exit_with_error(errno); ret = xsk_umem__create(&data->umem->umem, buffer, size, - &data->umem->fq, &data->umem->cq, NULL); + &data->umem->fq, &data->umem->cq, &cfg); if (ret) exit_with_error(ret); @@ -298,7 +319,7 @@ static int xsk_configure_socket(struct ifobject *ifobject) exit_with_error(errno); ifobject->xsk->umem = ifobject->umem; - cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg.rx_size = rxqsize; cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; cfg.libbpf_flags = 0; cfg.xdp_flags = xdp_flags; @@ -565,6 +586,8 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) { u32 idx; unsigned int i; + bool tx_invalid_test = stat_test_type == STAT_TEST_TX_INVALID; + u32 len = tx_invalid_test ? XSK_UMEM__DEFAULT_FRAME_SIZE + 1 : PKT_SIZE; while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < batch_size) complete_tx_only(xsk, batch_size); @@ -573,11 +596,16 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); tx_desc->addr = (*frameptr + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; - tx_desc->len = PKT_SIZE; + tx_desc->len = len; } xsk_ring_prod__submit(&xsk->tx, batch_size); - xsk->outstanding_tx += batch_size; + if (!tx_invalid_test) { + xsk->outstanding_tx += batch_size; + } else { + if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) + kick_tx(xsk); + } *frameptr += batch_size; *frameptr %= num_frames; complete_tx_only(xsk, batch_size); @@ -689,6 +717,48 @@ static void worker_pkt_dump(void) } } +static void worker_stats_validate(struct ifobject *ifobject) +{ + struct xdp_statistics stats; + socklen_t optlen; + int err; + struct xsk_socket *xsk = stat_test_type == STAT_TEST_TX_INVALID ? + ifdict[!ifobject->ifdict_index]->xsk->xsk : + ifobject->xsk->xsk; + int fd = xsk_socket__fd(xsk); + unsigned long xsk_stat = 0, expected_stat = opt_pkt_count; + + sigvar = 0; + + optlen = sizeof(stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); + if (err) + return; + + if (optlen == sizeof(struct xdp_statistics)) { + switch (stat_test_type) { + case STAT_TEST_RX_DROPPED: + xsk_stat = stats.rx_dropped; + break; + case STAT_TEST_TX_INVALID: + xsk_stat = stats.tx_invalid_descs; + break; + case STAT_TEST_RX_FULL: + xsk_stat = stats.rx_ring_full; + expected_stat -= RX_FULL_RXQSIZE; + break; + case STAT_TEST_RX_FILL_EMPTY: + xsk_stat = stats.rx_fill_ring_empty_descs; + break; + default: + break; + } + + if (xsk_stat == expected_stat) + sigvar = 1; + } +} + static void worker_pkt_validate(void) { u32 payloadseqnum = -2; @@ -827,7 +897,8 @@ static void *worker_testapp_validate(void *arg) thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); print_verbose("Interface [%s] vector [Rx]\n", ifobject->ifname); - xsk_populate_fill_ring(ifobject->umem); + if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) + xsk_populate_fill_ring(ifobject->umem); TAILQ_INIT(&head); if (debug_pkt_dump) { @@ -849,15 +920,21 @@ static void *worker_testapp_validate(void *arg) if (ret <= 0) continue; } - rx_pkt(ifobject->xsk, fds); - worker_pkt_validate(); + + if (test_type != TEST_TYPE_STATS) { + rx_pkt(ifobject->xsk, fds); + worker_pkt_validate(); + } else { + worker_stats_validate(ifobject); + } if (sigvar) break; } - print_verbose("Received %d packets on interface %s\n", - pkt_counter, ifobject->ifname); + if (test_type != TEST_TYPE_STATS) + print_verbose("Received %d packets on interface %s\n", + pkt_counter, ifobject->ifname); if (test_type == TEST_TYPE_TEARDOWN) print_verbose("Destroying socket\n"); @@ -931,7 +1008,7 @@ static void testapp_validate(void) free(pkt_buf); } - if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi) + if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !(test_type == TEST_TYPE_STATS)) print_ksft_result(); } @@ -950,6 +1027,32 @@ static void testapp_sockets(void) print_ksft_result(); } +static void testapp_stats(void) +{ + for (int i = 0; i < STAT_TEST_TYPE_MAX; i++) { + stat_test_type = i; + + /* reset defaults */ + rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + + switch (stat_test_type) { + case STAT_TEST_RX_DROPPED: + frame_headroom = XSK_UMEM__DEFAULT_FRAME_SIZE - + XDP_PACKET_HEADROOM - 1; + break; + case STAT_TEST_RX_FULL: + rxqsize = RX_FULL_RXQSIZE; + break; + default: + break; + } + testapp_validate(); + } + + print_ksft_result(); +} + static void init_iface_config(struct ifaceconfigobj *ifaceconfig) { /*Init interface0 */ @@ -1037,6 +1140,10 @@ static void run_pkt_test(int mode, int type) prev_pkt = -1; ifdict[0]->fv.vector = tx; ifdict[1]->fv.vector = rx; + sigvar = 0; + stat_test_type = -1; + rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; switch (mode) { case (TEST_MODE_SKB): @@ -1055,7 +1162,9 @@ static void run_pkt_test(int mode, int type) pthread_init_mutex(); - if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI)) + if (test_type == TEST_TYPE_STATS) + testapp_stats(); + else if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI)) testapp_validate(); else testapp_sockets(); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index e05703f661f8..30314ef305c2 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -42,6 +42,7 @@ #define POLL_TMOUT 1000 #define NEED_WAKEUP true #define DEFAULT_PKT_CNT 10000 +#define RX_FULL_RXQSIZE 32 #define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0) @@ -61,9 +62,18 @@ enum TEST_TYPES { TEST_TYPE_POLL, TEST_TYPE_TEARDOWN, TEST_TYPE_BIDI, + TEST_TYPE_STATS, TEST_TYPE_MAX }; +enum STAT_TEST_TYPES { + STAT_TEST_RX_DROPPED, + STAT_TEST_TX_INVALID, + STAT_TEST_RX_FULL, + STAT_TEST_RX_FILL_EMPTY, + STAT_TEST_TYPE_MAX +}; + static int configured_mode = TEST_MODE_UNCONFIGURED; static u8 debug_pkt_dump; static u32 num_frames; @@ -81,6 +91,9 @@ static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static u32 pkt_counter; static long prev_pkt = -1; static int sigvar; +static int stat_test_type; +static u32 rxqsize; +static u32 frame_headroom; struct xsk_umem_info { struct xsk_ring_prod fq; -- cgit v1.2.3 From a83586a7ddba25065ec37323c05deb9019ce4fa9 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 23 Feb 2021 21:14:57 +0800 Subject: bpf: Remove blank line in bpf helper description comment Commit 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") added an extra blank line in bpf helper description. This will make bpf_helpers_doc.py stop building bpf_helper_defs.h immediately after bpf_check_mtu(), which will affect future added functions. Fixes: 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20210223131457.1378978-1-liuhangbin@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 - tools/include/uapi/linux/bpf.h | 1 - 2 files changed, 2 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4c24daa43bac..79c893310492 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3850,7 +3850,6 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4c24daa43bac..79c893310492 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3850,7 +3850,6 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. -- cgit v1.2.3 From ae8b8332fbb512f53bf50ff6a7586dd0f90ed18a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:30 -0800 Subject: sock_map: Rename skb_parser and skb_verdict These two eBPF programs are tied to BPF_SK_SKB_STREAM_PARSER and BPF_SK_SKB_STREAM_VERDICT, rename them to reflect the fact they are only used for TCP. And save the name 'skb_verdict' for general use later. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-6-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 8 +-- net/core/skmsg.c | 14 ++--- net/core/sock_map.c | 60 +++++++++++----------- .../selftests/bpf/prog_tests/sockmap_listen.c | 8 +-- .../selftests/bpf/progs/test_sockmap_listen.c | 4 +- 5 files changed, 47 insertions(+), 47 deletions(-) (limited to 'tools') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e0de45527bb6..d9f6ec4a9cf2 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -56,8 +56,8 @@ struct sk_msg { struct sk_psock_progs { struct bpf_prog *msg_parser; - struct bpf_prog *skb_parser; - struct bpf_prog *skb_verdict; + struct bpf_prog *stream_parser; + struct bpf_prog *stream_verdict; }; enum sk_psock_state_bits { @@ -443,8 +443,8 @@ static inline int psock_replace_prog(struct bpf_prog **pprog, static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); - psock_set_prog(&progs->skb_parser, NULL); - psock_set_prog(&progs->skb_verdict, NULL); + psock_set_prog(&progs->stream_parser, NULL); + psock_set_prog(&progs->stream_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 409258367bea..35f9caa3b125 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -691,9 +691,9 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.skb_verdict) + else if (psock->progs.stream_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -803,7 +803,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) int ret = __SK_PASS; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { /* We skip full set_owner_r here because if we do a SK_PASS * or SK_DROP we can skip skb memory accounting and use the @@ -895,7 +895,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); @@ -919,7 +919,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) int ret = skb->len; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_parser); + prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; ret = sk_psock_bpf_run(psock, prog, skb); @@ -982,7 +982,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) strp_done(&psock->strp); } #else @@ -1015,7 +1015,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index dbfcd7006338..69785070f02d 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->saved_data_ready && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.stream_parser) strp_stop = true; - if (psock->saved_data_ready && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -224,23 +224,23 @@ out: static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { - struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; struct sk_psock *psock; int ret; - skb_verdict = READ_ONCE(progs->skb_verdict); - if (skb_verdict) { - skb_verdict = bpf_prog_inc_not_zero(skb_verdict); - if (IS_ERR(skb_verdict)) - return PTR_ERR(skb_verdict); + stream_verdict = READ_ONCE(progs->stream_verdict); + if (stream_verdict) { + stream_verdict = bpf_prog_inc_not_zero(stream_verdict); + if (IS_ERR(stream_verdict)) + return PTR_ERR(stream_verdict); } - skb_parser = READ_ONCE(progs->skb_parser); - if (skb_parser) { - skb_parser = bpf_prog_inc_not_zero(skb_parser); - if (IS_ERR(skb_parser)) { - ret = PTR_ERR(skb_parser); - goto out_put_skb_verdict; + stream_parser = READ_ONCE(progs->stream_parser); + if (stream_parser) { + stream_parser = bpf_prog_inc_not_zero(stream_parser); + if (IS_ERR(stream_parser)) { + ret = PTR_ERR(stream_parser); + goto out_put_stream_verdict; } } @@ -249,7 +249,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); - goto out_put_skb_parser; + goto out_put_stream_parser; } } @@ -261,8 +261,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_parser && READ_ONCE(psock->progs.skb_parser)) || - (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { + (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -283,15 +283,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->saved_data_ready) { + if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); - psock_set_prog(&psock->progs.skb_parser, skb_parser); + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); + psock_set_prog(&psock->progs.stream_parser, stream_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) { - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); @@ -303,12 +303,12 @@ out_drop: out_progs: if (msg_parser) bpf_prog_put(msg_parser); -out_put_skb_parser: - if (skb_parser) - bpf_prog_put(skb_parser); -out_put_skb_verdict: - if (skb_verdict) - bpf_prog_put(skb_verdict); +out_put_stream_parser: + if (stream_parser) + bpf_prog_put(stream_parser); +out_put_stream_verdict: + if (stream_verdict) + bpf_prog_put(stream_verdict); return ret; } @@ -1459,11 +1459,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->skb_parser; + pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: - pprog = &progs->skb_verdict; + pprog = &progs->stream_verdict; break; default: return -EOPNOTSUPP; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index d7d65a700799..c26e6bf05e49 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1014,8 +1014,8 @@ static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; @@ -1125,8 +1125,8 @@ static void test_skb_redir_to_listening(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c index a3a366c57ce1..fa221141e9c1 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -31,13 +31,13 @@ struct { static volatile bool test_sockmap; /* toggled by user-space */ SEC("sk_skb/stream_parser") -int prog_skb_parser(struct __sk_buff *skb) +int prog_stream_parser(struct __sk_buff *skb) { return skb->len; } SEC("sk_skb/stream_verdict") -int prog_skb_verdict(struct __sk_buff *skb) +int prog_stream_verdict(struct __sk_buff *skb) { unsigned int *count; __u32 zero = 0; -- cgit v1.2.3 From 2854436612c4d2606c9246ce2976ab6634276337 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 25 Feb 2021 16:19:47 +0000 Subject: selftests/bpf: Propagate error code of the command to vmtest.sh When vmtest.sh ran a command in a VM, it did not record or propagate the error code of the command. This made the script less "script-able". The script now saves the error code of the said command in a file in the VM, copies the file back to the host and (when available) uses this error code instead of its own. Signed-off-by: KP Singh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210225161947.1778590-1-kpsingh@kernel.org --- tools/testing/selftests/bpf/vmtest.sh | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 26ae8d0b6ce3..22554894db99 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -17,6 +17,9 @@ KCONFIG_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vm KCONFIG_API_URL="https://api.github.com/repos/libbpf/libbpf/contents/travis-ci/vmtest/configs/latest.config" INDEX_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/INDEX" NUM_COMPILE_JOBS="$(nproc)" +LOG_FILE_BASE="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S")" +LOG_FILE="${LOG_FILE_BASE}.log" +EXIT_STATUS_FILE="${LOG_FILE_BASE}.exit_status" usage() { @@ -146,7 +149,6 @@ update_init_script() local init_script_dir="${OUTPUT_DIR}/${MOUNT_DIR}/etc/rcS.d" local init_script="${init_script_dir}/S50-startup" local command="$1" - local log_file="$2" mount_image @@ -163,11 +165,16 @@ EOF sudo bash -c "cat >${init_script}" < "/root/${EXIT_STATUS_FILE}" + { cd /root/bpf echo ${command} stdbuf -oL -eL ${command} -} 2>&1 | tee /root/${log_file} + echo "\$?" > "/root/${EXIT_STATUS_FILE}" +} 2>&1 | tee "/root/${LOG_FILE}" poweroff -f EOF @@ -221,10 +228,12 @@ EOF copy_logs() { local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}" - local log_file="${mount_dir}/root/$1" + local log_file="${mount_dir}/root/${LOG_FILE}" + local exit_status_file="${mount_dir}/root/${EXIT_STATUS_FILE}" mount_image sudo cp ${log_file} "${OUTPUT_DIR}" + sudo cp ${exit_status_file} "${OUTPUT_DIR}" sudo rm -f ${log_file} unmount_image } @@ -263,7 +272,6 @@ main() { local script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" local kernel_checkout=$(realpath "${script_dir}"/../../../../) - local log_file="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S.log")" # By default the script searches for the kernel in the checkout directory but # it also obeys environment variables O= and KBUILD_OUTPUT= local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}" @@ -347,19 +355,23 @@ main() fi update_selftests "${kernel_checkout}" "${make_command}" - update_init_script "${command}" "${log_file}" + update_init_script "${command}" run_vm "${kernel_bzimage}" - copy_logs "${log_file}" - echo "Logs saved in ${OUTPUT_DIR}/${log_file}" + copy_logs + echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}" } catch() { local exit_code=$1 + local exit_status_file="${OUTPUT_DIR}/${EXIT_STATUS_FILE}" # This is just a cleanup and the directory may # have already been unmounted. So, don't let this # clobber the error code we intend to return. unmount_image || true + if [[ -f "${exit_status_file}" ]]; then + exit_code="$(cat ${exit_status_file})" + fi exit ${exit_code} } -- cgit v1.2.3 From 86fd166575c38c17ecd3a6b8fb9c64fa19871486 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 24 Feb 2021 12:14:45 +0100 Subject: selftests/bpf: Copy extras in out-of-srctree builds Building selftests in a separate directory like this: make O="$BUILD" -C tools/testing/selftests/bpf and then running: cd "$BUILD" && ./test_progs -t btf causes all the non-flavored btf_dump_test_case_*.c tests to fail, because these files are not copied to where test_progs expects to find them. Fix by not skipping EXT-COPY when the original $(OUTPUT) is not empty (lib.mk sets it to $(shell pwd) in that case) and using rsync instead of cp: cp fails because e.g. urandom_read is being copied into itself, and rsync simply skips such cases. rsync is already used by kselftests and therefore is not a new dependency. Signed-off-by: Ilya Leoshkevich Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210224111445.102342-1-iii@linux.ibm.com --- tools/testing/selftests/bpf/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 044bfdcf5b74..a81af15e4ded 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -382,11 +382,12 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@) $(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ -# only copy extra resources if in flavored build +# non-flavored in-srctree builds receive special treatment, in particular, we +# do not need to copy extra resources (see e.g. test_btf_dump_case()) $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) -ifneq ($2,) +ifneq ($2:$(OUTPUT),:$(shell pwd)) $$(call msg,EXT-COPY,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES)) - $(Q)cp -a $$^ $(TRUNNER_OUTPUT)/ + $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ endif $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ -- cgit v1.2.3 From 69c087ba6225b574afb6e505b72cb75242a3d844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:25 -0800 Subject: bpf: Add bpf_for_each_map_elem() helper The bpf_for_each_map_elem() helper is introduced which iterates all map elements with a callback function. The helper signature looks like long bpf_for_each_map_elem(map, callback_fn, callback_ctx, flags) and for each map element, the callback_fn will be called. For example, like hashmap, the callback signature may look like long callback_fn(map, key, val, callback_ctx) There are two known use cases for this. One is from upstream ([1]) where a for_each_map_elem helper may help implement a timeout mechanism in a more generic way. Another is from our internal discussion for a firewall use case where a map contains all the rules. The packet data can be compared to all these rules to decide allow or deny the packet. For array maps, users can already use a bounded loop to traverse elements. Using this helper can avoid using bounded loop. For other type of maps (e.g., hash maps) where bounded loop is hard or impossible to use, this helper provides a convenient way to operate on all elements. For callback_fn, besides map and map element, a callback_ctx, allocated on caller stack, is also passed to the callback function. This callback_ctx argument can provide additional input and allow to write to caller stack for output. If the callback_fn returns 0, the helper will iterate through next element if available. If the callback_fn returns 1, the helper will stop iterating and returns to the bpf program. Other return values are not used for now. Currently, this helper is only available with jit. It is possible to make it work with interpreter with so effort but I leave it as the future work. [1]: https://lore.kernel.org/bpf/20210122205415.113822-1-xiyou.wangcong@gmail.com/ Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204925.3884923-1-yhs@fb.com --- include/linux/bpf.h | 13 +++ include/linux/bpf_verifier.h | 3 + include/uapi/linux/bpf.h | 38 ++++++++ kernel/bpf/bpf_iter.c | 16 ++++ kernel/bpf/helpers.c | 2 + kernel/bpf/verifier.c | 208 ++++++++++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 2 + tools/include/uapi/linux/bpf.h | 38 ++++++++ 8 files changed, 307 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1e4d2f60527..aeb1b93a4d75 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -39,6 +39,7 @@ struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; struct mem_cgroup; +struct bpf_func_state; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -129,6 +130,13 @@ struct bpf_map_ops { bool (*map_meta_equal)(const struct bpf_map *meta0, const struct bpf_map *meta1); + + int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee); + int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags); + /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; @@ -295,6 +303,8 @@ enum bpf_arg_type { ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ + ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ + ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ __BPF_ARG_TYPE_MAX, }; @@ -411,6 +421,8 @@ enum bpf_reg_type { PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ + PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_MAP_KEY, /* reg points to a map element key */ }; /* The information passed from prog-specific *_is_valid_access @@ -1887,6 +1899,7 @@ extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; +extern const struct bpf_func_proto bpf_for_each_map_elem_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 971b33aca13d..51c2ffa3d901 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -68,6 +68,8 @@ struct bpf_reg_state { unsigned long raw1; unsigned long raw2; } raw; + + u32 subprogno; /* for PTR_TO_FUNC */ }; /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. @@ -204,6 +206,7 @@ struct bpf_func_state { int acquired_refs; struct bpf_reference_state *refs; int allocated_stack; + bool in_callback_fn; struct bpf_stack_state *stack; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79c893310492..b89af20cfa19 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -393,6 +393,15 @@ enum bpf_link_type { * is struct/union. */ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. + */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -3909,6 +3918,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4112,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index a0d9eade9c80..931870f9cf56 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) */ return ret == 0 ? 0 : -EAGAIN; } + +BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn, + void *, callback_ctx, u64, flags) +{ + return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags); +} + +const struct bpf_func_proto bpf_for_each_map_elem_proto = { + .func = bpf_for_each_map_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 308427fe03a3..074800226327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -708,6 +708,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dbdca49ac6cc..53afe9461b03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_CALL; } +static bool bpf_pseudo_func(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && + insn->src_reg == BPF_PSEUDO_FUNC; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -248,6 +254,7 @@ struct bpf_call_arg_meta { u32 btf_id; struct btf *ret_btf; u32 ret_btf_id; + u32 subprogno; }; struct btf *btf_vmlinux; @@ -427,6 +434,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || + type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON; } @@ -469,7 +477,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type) type == ARG_PTR_TO_MEM_OR_NULL || type == ARG_PTR_TO_CTX_OR_NULL || type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || + type == ARG_PTR_TO_STACK_OR_NULL; } /* Determine whether the function releases some resources allocated by another @@ -552,6 +561,8 @@ static const char * const reg_type_str[] = { [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", [PTR_TO_RDWR_BUF] = "rdwr_buf", [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", }; static char slot_type_char[] = { @@ -623,6 +634,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || + t == PTR_TO_MAP_KEY || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose(env, ",ks=%d,vs=%d", @@ -1555,6 +1567,19 @@ static int check_subprogs(struct bpf_verifier_env *env) /* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { + if (bpf_pseudo_func(insn + i)) { + if (!env->bpf_capable) { + verbose(env, + "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); + return -EPERM; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + /* remember subprog */ + insn[i + 1].imm = ret; + continue; + } if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { @@ -2286,6 +2311,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_MEM_OR_NULL: + case PTR_TO_FUNC: + case PTR_TO_MAP_KEY: return true; default: return false; @@ -2890,6 +2917,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, reg = &cur_regs(env)[regno]; switch (reg->type) { + case PTR_TO_MAP_KEY: + verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", + mem_size, off, size); + break; case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", mem_size, off, size); @@ -3295,6 +3326,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_FLOW_KEYS: pointer_desc = "flow keys "; break; + case PTR_TO_MAP_KEY: + pointer_desc = "key "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -3396,7 +3430,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (!bpf_pseudo_call(insn + i)) + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -3833,7 +3867,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* for access checks, reg->off is just part of off */ off += reg->off; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_KEY) { + if (t == BPF_WRITE) { + verbose(env, "write to change key R%d not allowed\n", regno); + return -EACCES; + } + + err = check_mem_region_access(env, regno, off, size, + reg->map_ptr->key_size, false); + if (err) + return err; + if (value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4249,6 +4295,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MAP_KEY: + return check_mem_region_access(env, regno, reg->off, access_size, + reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, regno, reg->off, access_size, meta && meta->raw_mode ? BPF_WRITE : @@ -4465,6 +4514,7 @@ static const struct bpf_reg_types map_key_value_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4496,6 +4546,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_RDONLY_BUF, @@ -4508,6 +4559,7 @@ static const struct bpf_reg_types int_ptr_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4520,6 +4572,8 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; +static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4548,6 +4602,8 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, + [ARG_PTR_TO_FUNC] = &func_ptr_types, + [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4729,6 +4785,8 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_FUNC) { + meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. @@ -5375,6 +5433,35 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); } +static int set_map_elem_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map; + int err; + + if (bpf_map_ptr_poisoned(insn_aux)) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + + map = BPF_MAP_PTR(insn_aux->map_ptr_state); + if (!map->ops->map_set_for_each_callback_args || + !map->ops->map_for_each_callback) { + verbose(env, "callback function not allowed for map\n"); + return -ENOTSUPP; + } + + err = map->ops->map_set_for_each_callback_args(env, caller, callee); + if (err) + return err; + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -5397,8 +5484,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) state->curframe--; caller = state->frame[state->curframe]; - /* return to the caller whatever r0 had in the callee */ - caller->regs[BPF_REG_0] = *r0; + if (callee->in_callback_fn) { + /* enforce R0 return value range [0, 1]. */ + struct tnum range = tnum_range(0, 1); + + if (r0->type != SCALAR_VALUE) { + verbose(env, "R0 not a scalar value\n"); + return -EACCES; + } + if (!tnum_in(range, r0->var_off)) { + verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); + return -EINVAL; + } + } else { + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + } /* Transfer references to the caller */ err = transfer_reference_state(caller, callee); @@ -5453,7 +5554,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && - func_id != BPF_FUNC_map_peek_elem) + func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_for_each_map_elem) return 0; if (map == NULL) { @@ -5534,15 +5636,18 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? -EINVAL : 0; } -static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; + int insn_idx = *insn_idx_p; bool changes_data; - int i, err; + int i, err, func_id; /* find function prototype */ + func_id = insn->imm; if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); @@ -5638,6 +5743,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (func_id == BPF_FUNC_for_each_map_elem) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_map_elem_callback_state); + if (err < 0) + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -5891,6 +6003,19 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, else *ptr_limit = -off; return 0; + case PTR_TO_MAP_KEY: + /* Currently, this code is not exercised as the only use + * is bpf_for_each_map_elem() helper which requires + * bpf_capble. The code has been tested manually for + * future use. + */ + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->key_size - off; + } + return 0; case PTR_TO_MAP_VALUE: if (mask_to_left) { *ptr_limit = ptr_reg->umax_value + ptr_reg->off; @@ -6092,6 +6217,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", @@ -8271,6 +8397,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } + if (insn->src_reg == BPF_PSEUDO_FUNC) { + struct bpf_prog_aux *aux = env->prog->aux; + u32 subprogno = insn[1].imm; + + if (!aux->func_info) { + verbose(env, "missing btf func_info\n"); + return -EINVAL; + } + if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) { + verbose(env, "callback function not static\n"); + return -EINVAL; + } + + dst_reg->type = PTR_TO_FUNC; + dst_reg->subprogno = subprogno; + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; @@ -8657,6 +8801,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; int ret; + if (bpf_pseudo_func(insns + t)) + return visit_func_call_insn(t, insn_cnt, insns, env, true); + /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insns[t].code) != BPF_JMP && BPF_CLASS(insns[t].code) != BPF_JMP32) @@ -9277,6 +9424,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, */ return false; } + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. @@ -10123,10 +10271,9 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, env->insn_idx); + err = check_helper_call(env, insn, &env->insn_idx); if (err) return err; - } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || @@ -10555,6 +10702,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) goto next_insn; } + if (insn[0].src_reg == BPF_PSEUDO_FUNC) { + aux = &env->insn_aux_data[i]; + aux->ptr_type = PTR_TO_FUNC; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. */ @@ -10687,9 +10840,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i; - for (i = 0; i < insn_cnt; i++, insn++) - if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) - insn->src_reg = 0; + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + if (insn->src_reg == BPF_PSEUDO_FUNC) + continue; + insn->src_reg = 0; + } } /* single env->prog->insni[off] instruction was replaced with the range @@ -11330,6 +11487,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + env->insn_aux_data[i].call_imm = insn->imm; + /* subprog is encoded in insn[1].imm */ + continue; + } + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but @@ -11459,6 +11622,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { + if (bpf_pseudo_func(insn)) { + subprog = insn[1].imm; + insn[0].imm = (u32)(long)func[subprog]->bpf_func; + insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; + continue; + } if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; @@ -11504,6 +11673,11 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + insn[0].imm = env->insn_aux_data[i].call_imm; + insn[1].imm = find_subprog(env, i + insn[0].imm + 1); + continue; + } if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; @@ -11568,6 +11742,14 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* When JIT fails the progs with callback calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "callbacks are not allowed in non-JITed programs\n"); + return -EINVAL; + } + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e9701744d8e4..0d23755c2747 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1371,6 +1371,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: return &bpf_task_storage_delete_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 79c893310492..b89af20cfa19 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -393,6 +393,15 @@ enum bpf_link_type { * is struct/union. */ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. + */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -3909,6 +3918,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4112,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From b8f871fa32ad392759bc70090fa8c60d9f10625c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:29 -0800 Subject: libbpf: Move function is_ldimm64() earlier in libbpf.c Move function is_ldimm64() close to the beginning of libbpf.c, so it can be reused by later code and the next patch as well. There is no functionality change. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204929.3885295-1-yhs@fb.com --- tools/lib/bpf/libbpf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d43cc3f29dae..21a3eedf070d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -574,6 +574,11 @@ static bool insn_is_subprog_call(const struct bpf_insn *insn) insn->off == 0; } +static bool is_ldimm64(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + static int bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, const char *name, size_t sec_idx, const char *sec_name, @@ -3395,7 +3400,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return 0; } - if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) { + if (!is_ldimm64(insn)) { pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", prog->name, sym_name, insn_idx, insn->code); return -LIBBPF_ERRNO__RELOC; @@ -5566,11 +5571,6 @@ static void bpf_core_poison_insn(struct bpf_program *prog, int relo_idx, insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */ } -static bool is_ldimm64(struct bpf_insn *insn) -{ - return insn->code == (BPF_LD | BPF_IMM | BPF_DW); -} - static int insn_bpf_size_to_bytes(struct bpf_insn *insn) { switch (BPF_SIZE(insn->code)) { -- cgit v1.2.3 From 53eddb5e04ac5c53a4ccef9f1f900562a5a75246 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:30 -0800 Subject: libbpf: Support subprog address relocation A new relocation RELO_SUBPROG_ADDR is added to capture subprog addresses loaded with ld_imm64 insns. Such ld_imm64 insns are marked with BPF_PSEUDO_FUNC and will be passed to kernel. For bpf_for_each_map_elem() case, kernel will check that the to-be-used subprog address must be a static function and replace it with proper actual jited func address. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204930.3885367-1-yhs@fb.com --- tools/lib/bpf/libbpf.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 21a3eedf070d..62d9ed56b081 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -188,6 +188,7 @@ enum reloc_type { RELO_CALL, RELO_DATA, RELO_EXTERN, + RELO_SUBPROG_ADDR, }; struct reloc_desc { @@ -579,6 +580,11 @@ static bool is_ldimm64(struct bpf_insn *insn) return insn->code == (BPF_LD | BPF_IMM | BPF_DW); } +static bool insn_is_pseudo_func(struct bpf_insn *insn) +{ + return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; +} + static int bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, const char *name, size_t sec_idx, const char *sec_name, @@ -2979,6 +2985,23 @@ static bool sym_is_extern(const GElf_Sym *sym) GELF_ST_TYPE(sym->st_info) == STT_NOTYPE; } +static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx) +{ + int bind = GELF_ST_BIND(sym->st_info); + int type = GELF_ST_TYPE(sym->st_info); + + /* in .text section */ + if (sym->st_shndx != text_shndx) + return false; + + /* local function */ + if (bind == STB_LOCAL && type == STT_SECTION) + return true; + + /* global function */ + return bind == STB_GLOBAL && type == STT_FUNC; +} + static int find_extern_btf_id(const struct btf *btf, const char *ext_name) { const struct btf_type *t; @@ -3435,6 +3458,23 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__RELOC; } + /* loading subprog addresses */ + if (sym_is_subprog(sym, obj->efile.text_shndx)) { + /* global_func: sym->st_value = offset in the section, insn->imm = 0. + * local_func: sym->st_value = 0, insn->imm = offset in the section. + */ + if ((sym->st_value % BPF_INSN_SZ) || (insn->imm % BPF_INSN_SZ)) { + pr_warn("prog '%s': bad subprog addr relo against '%s' at offset %zu+%d\n", + prog->name, sym_name, (size_t)sym->st_value, insn->imm); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_SUBPROG_ADDR; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); @@ -6172,6 +6212,10 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) } relo->processed = true; break; + case RELO_SUBPROG_ADDR: + insn[0].src_reg = BPF_PSEUDO_FUNC; + /* will be handled as a follow up pass */ + break; case RELO_CALL: /* will be handled as a follow up pass */ break; @@ -6358,11 +6402,11 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) { insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; - if (!insn_is_subprog_call(insn)) + if (!insn_is_subprog_call(insn) && !insn_is_pseudo_func(insn)) continue; relo = find_prog_insn_relo(prog, insn_idx); - if (relo && relo->type != RELO_CALL) { + if (relo && relo->type != RELO_CALL && relo->type != RELO_SUBPROG_ADDR) { pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n", prog->name, insn_idx, relo->type); return -LIBBPF_ERRNO__RELOC; @@ -6374,8 +6418,22 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, * call always has imm = -1, but for static functions * relocation is against STT_SECTION and insn->imm * points to a start of a static function + * + * for subprog addr relocation, the relo->sym_off + insn->imm is + * the byte offset in the corresponding section. */ - sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + if (relo->type == RELO_CALL) + sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + else + sub_insn_idx = (relo->sym_off + insn->imm) / BPF_INSN_SZ; + } else if (insn_is_pseudo_func(insn)) { + /* + * RELO_SUBPROG_ADDR relo is always emitted even if both + * functions are in the same section, so it shouldn't reach here. + */ + pr_warn("prog '%s': missing subprog addr relo for insn #%zu\n", + prog->name, insn_idx); + return -LIBBPF_ERRNO__RELOC; } else { /* if subprogram call is to a static function within * the same ELF section, there won't be any relocation -- cgit v1.2.3 From f1f9f0d8d737b9a1c5d15635bf5696643626fd39 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:31 -0800 Subject: bpftool: Print subprog address properly With later hashmap example, using bpftool xlated output may look like: int dump_task(struct bpf_iter__task * ctx): ; struct task_struct *task = ctx->task; 0: (79) r2 = *(u64 *)(r1 +8) ; if (task == (void *)0 || called > 0) ... 19: (18) r2 = subprog[+17] 30: (18) r2 = subprog[+25] ... 36: (95) exit __u64 check_hash_elem(struct bpf_map * map, __u32 * key, __u64 * val, struct callback_ctx * data): ; struct bpf_iter__task *ctx = data->ctx; 37: (79) r5 = *(u64 *)(r4 +0) ... 55: (95) exit __u64 check_percpu_elem(struct bpf_map * map, __u32 * key, __u64 * val, void * unused): ; check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val, void *unused) 56: (bf) r6 = r3 ... 83: (18) r2 = subprog[-47] Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204931.3885458-1-yhs@fb.com --- tools/bpf/bpftool/xlated_dumper.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c index 8608cd68cdd0..6fc3e6f7f40c 100644 --- a/tools/bpf/bpftool/xlated_dumper.c +++ b/tools/bpf/bpftool/xlated_dumper.c @@ -196,6 +196,9 @@ static const char *print_imm(void *private_data, else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), "map[id:%u][0]+%u", insn->imm, (insn + 1)->imm); + else if (insn->src_reg == BPF_PSEUDO_FUNC) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "subprog[%+d]", insn->imm); else snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), "0x%llx", (unsigned long long)full_imm); -- cgit v1.2.3 From 9de7f0fdab326a37c9f741f0f6c0f1cbc320a5ab Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:33 -0800 Subject: selftests/bpf: Add hashmap test for bpf_for_each_map_elem() helper A test case is added for hashmap and percpu hashmap. The test also exercises nested bpf_for_each_map_elem() calls like bpf_prog: bpf_for_each_map_elem(func1) func1: bpf_for_each_map_elem(func2) func2: $ ./test_progs -n 45 #45/1 hash_map:OK #45 for_each:OK Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204933.3885657-1-yhs@fb.com --- tools/testing/selftests/bpf/prog_tests/for_each.c | 73 +++++++++++++++++ .../selftests/bpf/progs/for_each_hash_map_elem.c | 95 ++++++++++++++++++++++ tools/testing/selftests/bpf/test_progs.h | 11 +++ 3 files changed, 179 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/for_each.c create mode 100644 tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c new file mode 100644 index 000000000000..aa847cd9f71f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include +#include +#include "for_each_hash_map_elem.skel.h" + +static unsigned int duration; + +static void test_hash_map(void) +{ + int i, err, hashmap_fd, max_entries, percpu_map_fd; + struct for_each_hash_map_elem *skel; + __u64 *percpu_valbuf = NULL; + __u32 key, num_cpus, retval; + __u64 val; + + skel = for_each_hash_map_elem__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_hash_map_elem__open_and_load")) + return; + + hashmap_fd = bpf_map__fd(skel->maps.hashmap); + max_entries = bpf_map__max_entries(skel->maps.hashmap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + err = bpf_map_update_elem(hashmap_fd, &key, &val, BPF_ANY); + if (!ASSERT_OK(err, "map_update")) + goto out; + } + + num_cpus = bpf_num_possible_cpus(); + percpu_map_fd = bpf_map__fd(skel->maps.percpu_map); + percpu_valbuf = malloc(sizeof(__u64) * num_cpus); + if (!ASSERT_OK_PTR(percpu_valbuf, "percpu_valbuf")) + goto out; + + key = 1; + for (i = 0; i < num_cpus; i++) + percpu_valbuf[i] = i + 1; + err = bpf_map_update_elem(percpu_map_fd, &key, percpu_valbuf, BPF_ANY); + if (!ASSERT_OK(err, "percpu_map_update")) + goto out; + + err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access), + 1, &pkt_v4, sizeof(pkt_v4), NULL, NULL, + &retval, &duration); + if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n", + err, errno, retval)) + goto out; + + ASSERT_EQ(skel->bss->hashmap_output, 4, "hashmap_output"); + ASSERT_EQ(skel->bss->hashmap_elems, max_entries, "hashmap_elems"); + + key = 1; + err = bpf_map_lookup_elem(hashmap_fd, &key, &val); + ASSERT_ERR(err, "hashmap_lookup"); + + ASSERT_EQ(skel->bss->percpu_called, 1, "percpu_called"); + ASSERT_LT(skel->bss->cpu, num_cpus, "num_cpus"); + ASSERT_EQ(skel->bss->percpu_map_elems, 1, "percpu_map_elems"); + ASSERT_EQ(skel->bss->percpu_key, 1, "percpu_key"); + ASSERT_EQ(skel->bss->percpu_val, skel->bss->cpu + 1, "percpu_val"); + ASSERT_EQ(skel->bss->percpu_output, 100, "percpu_output"); +out: + free(percpu_valbuf); + for_each_hash_map_elem__destroy(skel); +} + +void test_for_each(void) +{ + if (test__start_subtest("hash_map")) + test_hash_map(); +} diff --git a/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c b/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c new file mode 100644 index 000000000000..913dd91aafff --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} hashmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} percpu_map SEC(".maps"); + +struct callback_ctx { + struct __sk_buff *ctx; + int input; + int output; +}; + +static __u64 +check_hash_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + struct __sk_buff *skb = data->ctx; + __u32 k; + __u64 v; + + if (skb) { + k = *key; + v = *val; + if (skb->len == 10000 && k == 10 && v == 10) + data->output = 3; /* impossible path */ + else + data->output = 4; + } else { + data->output = data->input; + bpf_map_delete_elem(map, key); + } + + return 0; +} + +__u32 cpu = 0; +__u32 percpu_called = 0; +__u32 percpu_key = 0; +__u64 percpu_val = 0; +int percpu_output = 0; + +static __u64 +check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *unused) +{ + struct callback_ctx data; + + percpu_called++; + cpu = bpf_get_smp_processor_id(); + percpu_key = *key; + percpu_val = *val; + + data.ctx = 0; + data.input = 100; + data.output = 0; + bpf_for_each_map_elem(&hashmap, check_hash_elem, &data, 0); + percpu_output = data.output; + + return 0; +} + +int hashmap_output = 0; +int hashmap_elems = 0; +int percpu_map_elems = 0; + +SEC("classifier") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.ctx = skb; + data.input = 10; + data.output = 0; + hashmap_elems = bpf_for_each_map_elem(&hashmap, check_hash_elem, &data, 0); + hashmap_output = data.output; + + percpu_map_elems = bpf_for_each_map_elem(&percpu_map, check_percpu_elem, + (void *)0, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index f7c2fd89d01a..e87c8546230e 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -152,6 +152,17 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_LT(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act < ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld >= expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + #define ASSERT_STREQ(actual, expected, name) ({ \ static int duration = 0; \ const char *___act = actual; \ -- cgit v1.2.3 From 6b9e3331347ee9e84fe5c71d3eba7ec204f9bb25 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:34 -0800 Subject: selftests/bpf: Add arraymap test for bpf_for_each_map_elem() helper A test is added for arraymap and percpu arraymap. The test also exercises the early return for the helper which does not traverse all elements. $ ./test_progs -n 45 #45/1 hash_map:OK #45/2 array_map:OK #45 for_each:OK Summary: 1/2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204934.3885756-1-yhs@fb.com --- tools/testing/selftests/bpf/prog_tests/for_each.c | 57 ++++++++++++++++++++ .../selftests/bpf/progs/for_each_array_map_elem.c | 61 ++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/for_each_array_map_elem.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c index aa847cd9f71f..68eb12a287d4 100644 --- a/tools/testing/selftests/bpf/prog_tests/for_each.c +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -3,6 +3,7 @@ #include #include #include "for_each_hash_map_elem.skel.h" +#include "for_each_array_map_elem.skel.h" static unsigned int duration; @@ -66,8 +67,64 @@ out: for_each_hash_map_elem__destroy(skel); } +static void test_array_map(void) +{ + __u32 key, num_cpus, max_entries, retval; + int i, arraymap_fd, percpu_map_fd, err; + struct for_each_array_map_elem *skel; + __u64 *percpu_valbuf = NULL; + __u64 val, expected_total; + + skel = for_each_array_map_elem__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_array_map_elem__open_and_load")) + return; + + arraymap_fd = bpf_map__fd(skel->maps.arraymap); + expected_total = 0; + max_entries = bpf_map__max_entries(skel->maps.arraymap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + /* skip the last iteration for expected total */ + if (i != max_entries - 1) + expected_total += val; + err = bpf_map_update_elem(arraymap_fd, &key, &val, BPF_ANY); + if (!ASSERT_OK(err, "map_update")) + goto out; + } + + num_cpus = bpf_num_possible_cpus(); + percpu_map_fd = bpf_map__fd(skel->maps.percpu_map); + percpu_valbuf = malloc(sizeof(__u64) * num_cpus); + if (!ASSERT_OK_PTR(percpu_valbuf, "percpu_valbuf")) + goto out; + + key = 0; + for (i = 0; i < num_cpus; i++) + percpu_valbuf[i] = i + 1; + err = bpf_map_update_elem(percpu_map_fd, &key, percpu_valbuf, BPF_ANY); + if (!ASSERT_OK(err, "percpu_map_update")) + goto out; + + err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access), + 1, &pkt_v4, sizeof(pkt_v4), NULL, NULL, + &retval, &duration); + if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n", + err, errno, retval)) + goto out; + + ASSERT_EQ(skel->bss->arraymap_output, expected_total, "array_output"); + ASSERT_EQ(skel->bss->cpu + 1, skel->bss->percpu_val, "percpu_val"); + +out: + free(percpu_valbuf); + for_each_array_map_elem__destroy(skel); +} + void test_for_each(void) { if (test__start_subtest("hash_map")) test_hash_map(); + if (test__start_subtest("array_map")) + test_array_map(); } diff --git a/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c b/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c new file mode 100644 index 000000000000..75e8e1069fe7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} arraymap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} percpu_map SEC(".maps"); + +struct callback_ctx { + int output; +}; + +static __u64 +check_array_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + data->output += *val; + if (*key == 1) + return 1; /* stop the iteration */ + return 0; +} + +__u32 cpu = 0; +__u64 percpu_val = 0; + +static __u64 +check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + cpu = bpf_get_smp_processor_id(); + percpu_val = *val; + return 0; +} + +u32 arraymap_output = 0; + +SEC("classifier") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.output = 0; + bpf_for_each_map_elem(&arraymap, check_array_elem, &data, 0); + arraymap_output = data.output; + + bpf_for_each_map_elem(&percpu_map, check_percpu_elem, (void *)0, 0); + return 0; +} -- cgit v1.2.3 From 04883a079968e6250a4549f6fadb6427c534885e Mon Sep 17 00:00:00 2001 From: Ian Denhardt Date: Tue, 23 Feb 2021 21:15:31 -0500 Subject: tools, bpf_asm: Hard error on out of range jumps Per discussion at [0] this was originally introduced as a warning due to concerns about breaking existing code, but a hard error probably makes more sense, especially given that concerns about breakage were only speculation. [0] https://lore.kernel.org/bpf/c964892195a6b91d20a67691448567ef528ffa6d.camel@linux.ibm.com/T/#t Signed-off-by: Ian Denhardt Signed-off-by: Daniel Borkmann Acked-by: Ilya Leoshkevich Link: https://lore.kernel.org/bpf/a6b6c7516f5d559049d669968e953b4a8d7adea3.1614201868.git.ian@zenhack.net --- tools/bpf/bpf_exp.y | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpf_exp.y b/tools/bpf/bpf_exp.y index 8d48e896be50..8d03e5245da5 100644 --- a/tools/bpf/bpf_exp.y +++ b/tools/bpf/bpf_exp.y @@ -549,9 +549,11 @@ static uint8_t bpf_encode_jt_jf_offset(int off, int i) { int delta = off - i - 1; - if (delta < 0 || delta > 255) - fprintf(stderr, "warning: insn #%d jumps to insn #%d, " + if (delta < 0 || delta > 255) { + fprintf(stderr, "error: insn #%d jumps to insn #%d, " "which is out of range\n", i, off); + exit(1); + } return (uint8_t) delta; } -- cgit v1.2.3 From 85e142cb42a1e7b33971bf035dae432d8670c46b Mon Sep 17 00:00:00 2001 From: Ian Denhardt Date: Tue, 23 Feb 2021 21:24:00 -0500 Subject: tools, bpf_asm: Exit non-zero on errors ... so callers can correctly detect failure. Signed-off-by: Ian Denhardt Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/b0bea780bc292f29e7b389dd062f20adc2a2d634.1614201868.git.ian@zenhack.net --- tools/bpf/bpf_exp.y | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpf_exp.y b/tools/bpf/bpf_exp.y index 8d03e5245da5..dfb7254a24e8 100644 --- a/tools/bpf/bpf_exp.y +++ b/tools/bpf/bpf_exp.y @@ -185,13 +185,13 @@ ldx | OP_LDXB number '*' '(' '[' number ']' '&' number ')' { if ($2 != 4 || $9 != 0xf) { fprintf(stderr, "ldxb offset not supported!\n"); - exit(0); + exit(1); } else { bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } } | OP_LDX number '*' '(' '[' number ']' '&' number ')' { if ($2 != 4 || $9 != 0xf) { fprintf(stderr, "ldxb offset not supported!\n"); - exit(0); + exit(1); } else { bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } } ; @@ -472,7 +472,7 @@ static void bpf_assert_max(void) { if (curr_instr >= BPF_MAXINSNS) { fprintf(stderr, "only max %u insns allowed!\n", BPF_MAXINSNS); - exit(0); + exit(1); } } @@ -522,7 +522,7 @@ static int bpf_find_insns_offset(const char *label) if (ret == -ENOENT) { fprintf(stderr, "no such label \'%s\'!\n", label); - exit(0); + exit(1); } return ret; -- cgit v1.2.3 From b5f184fbdb03b4fcc1141de34dd5ec964ca5d99e Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 4 Feb 2021 11:35:23 +0800 Subject: perf tools: Support MIPS unwinding and dwarf-regs Map perf APIs (perf_reg_name/get_arch_regstr/unwind__arch_reg_id) with MIPS specific registers. [ayan@wavecomp.com: repick this patch for unwinding userstack backtrace by perf and libunwind on MIPS based CPU.] [yangtiezhu@loongson.cn: Add sample_reg_masks[] to fix build error, silence some checkpatch errors and warnings, and also separate the original patches into two parts (MIPS kernel and perf tools) to merge easily.] The original patches: https://lore.kernel.org/patchwork/patch/1126521/ https://lore.kernel.org/patchwork/patch/1126520/ Committer notes: Do it as __perf_reg_name() to cope with: 067012974c8ae31a ("perf tools: Fix arm64 build error with gcc-11") Signed-off-by: Tiezhu Yang Cc: Alexander Shishkin Cc: Archer Yan Cc: David Daney Cc: Jianlin Lv Cc: Jiri Olsa Cc: Juxin Gao Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Xuefeng Li Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Link: http://lore.kernel.org/lkml/1612409724-3516-3-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Archer Yan Signed-off-by: David Daney Signed-off-by: Ralf Baechle Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 6 ++ tools/perf/arch/mips/Makefile | 4 ++ tools/perf/arch/mips/include/dwarf-regs-table.h | 31 +++++++++ tools/perf/arch/mips/include/perf_regs.h | 84 +++++++++++++++++++++++++ tools/perf/arch/mips/util/Build | 3 + tools/perf/arch/mips/util/dwarf-regs.c | 38 +++++++++++ tools/perf/arch/mips/util/perf_regs.c | 6 ++ tools/perf/arch/mips/util/unwind-libunwind.c | 22 +++++++ tools/perf/util/dwarf-regs.c | 3 + 9 files changed, 197 insertions(+) create mode 100644 tools/perf/arch/mips/Makefile create mode 100644 tools/perf/arch/mips/include/dwarf-regs-table.h create mode 100644 tools/perf/arch/mips/include/perf_regs.h create mode 100644 tools/perf/arch/mips/util/Build create mode 100644 tools/perf/arch/mips/util/dwarf-regs.c create mode 100644 tools/perf/arch/mips/util/perf_regs.c create mode 100644 tools/perf/arch/mips/util/unwind-libunwind.c (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index d8e59d31399a..bd0c93c35379 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -87,6 +87,12 @@ ifeq ($(ARCH),s390) CFLAGS += -fPIC -I$(OUTPUT)arch/s390/include/generated endif +ifeq ($(ARCH),mips) + NO_PERF_REGS := 0 + CFLAGS += -I../../arch/mips/include/uapi -I../../arch/mips/include/generated/uapi + LIBUNWIND_LIBS = -lunwind -lunwind-mips +endif + ifeq ($(NO_PERF_REGS),0) $(call detected,CONFIG_PERF_REGS) endif diff --git a/tools/perf/arch/mips/Makefile b/tools/perf/arch/mips/Makefile new file mode 100644 index 000000000000..6e1106fab26e --- /dev/null +++ b/tools/perf/arch/mips/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +ifndef NO_DWARF +PERF_HAVE_DWARF_REGS := 1 +endif diff --git a/tools/perf/arch/mips/include/dwarf-regs-table.h b/tools/perf/arch/mips/include/dwarf-regs-table.h new file mode 100644 index 000000000000..5badbcd3c5ec --- /dev/null +++ b/tools/perf/arch/mips/include/dwarf-regs-table.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * dwarf-regs-table.h : Mapping of DWARF debug register numbers into + * register names. + * + * Copyright (C) 2013 Cavium, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifdef DEFINE_DWARF_REGSTR_TABLE +#undef REG_DWARFNUM_NAME +#define REG_DWARFNUM_NAME(reg, idx) [idx] = "$" #reg +static const char * const mips_regstr_tbl[] = { + "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", "$9", + "$10", "$11", "$12", "$13", "$14", "$15", "$16", "$17", "$18", "$19", + "$20", "$21", "$22", "$23", "$24", "$25", "$26", "$27", "$28", "%29", + "$30", "$31", + REG_DWARFNUM_NAME(hi, 64), + REG_DWARFNUM_NAME(lo, 65), +}; +#endif diff --git a/tools/perf/arch/mips/include/perf_regs.h b/tools/perf/arch/mips/include/perf_regs.h new file mode 100644 index 000000000000..ee73b36a14d1 --- /dev/null +++ b/tools/perf/arch/mips/include/perf_regs.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_PERF_REGS_H +#define ARCH_PERF_REGS_H + +#include +#include +#include + +#define PERF_REGS_MAX PERF_REG_MIPS_MAX +#define PERF_REG_IP PERF_REG_MIPS_PC +#define PERF_REG_SP PERF_REG_MIPS_R29 + +#define PERF_REGS_MASK ((1ULL << PERF_REG_MIPS_MAX) - 1) + +static inline const char *__perf_reg_name(int id) +{ + switch (id) { + case PERF_REG_MIPS_PC: + return "PC"; + case PERF_REG_MIPS_R1: + return "$1"; + case PERF_REG_MIPS_R2: + return "$2"; + case PERF_REG_MIPS_R3: + return "$3"; + case PERF_REG_MIPS_R4: + return "$4"; + case PERF_REG_MIPS_R5: + return "$5"; + case PERF_REG_MIPS_R6: + return "$6"; + case PERF_REG_MIPS_R7: + return "$7"; + case PERF_REG_MIPS_R8: + return "$8"; + case PERF_REG_MIPS_R9: + return "$9"; + case PERF_REG_MIPS_R10: + return "$10"; + case PERF_REG_MIPS_R11: + return "$11"; + case PERF_REG_MIPS_R12: + return "$12"; + case PERF_REG_MIPS_R13: + return "$13"; + case PERF_REG_MIPS_R14: + return "$14"; + case PERF_REG_MIPS_R15: + return "$15"; + case PERF_REG_MIPS_R16: + return "$16"; + case PERF_REG_MIPS_R17: + return "$17"; + case PERF_REG_MIPS_R18: + return "$18"; + case PERF_REG_MIPS_R19: + return "$19"; + case PERF_REG_MIPS_R20: + return "$20"; + case PERF_REG_MIPS_R21: + return "$21"; + case PERF_REG_MIPS_R22: + return "$22"; + case PERF_REG_MIPS_R23: + return "$23"; + case PERF_REG_MIPS_R24: + return "$24"; + case PERF_REG_MIPS_R25: + return "$25"; + case PERF_REG_MIPS_R28: + return "$28"; + case PERF_REG_MIPS_R29: + return "$29"; + case PERF_REG_MIPS_R30: + return "$30"; + case PERF_REG_MIPS_R31: + return "$31"; + default: + break; + } + return NULL; +} + +#endif /* ARCH_PERF_REGS_H */ diff --git a/tools/perf/arch/mips/util/Build b/tools/perf/arch/mips/util/Build new file mode 100644 index 000000000000..51c8900a9a10 --- /dev/null +++ b/tools/perf/arch/mips/util/Build @@ -0,0 +1,3 @@ +perf-y += perf_regs.o +perf-$(CONFIG_DWARF) += dwarf-regs.o +perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o diff --git a/tools/perf/arch/mips/util/dwarf-regs.c b/tools/perf/arch/mips/util/dwarf-regs.c new file mode 100644 index 000000000000..25c13a91c2a7 --- /dev/null +++ b/tools/perf/arch/mips/util/dwarf-regs.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * dwarf-regs.c : Mapping of DWARF debug register numbers into register names. + * + * Copyright (C) 2013 Cavium, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include + +static const char *mips_gpr_names[32] = { + "$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", "$9", + "$10", "$11", "$12", "$13", "$14", "$15", "$16", "$17", "$18", "$19", + "$20", "$21", "$22", "$23", "$24", "$25", "$26", "$27", "$28", "$29", + "$30", "$31" +}; + +const char *get_arch_regstr(unsigned int n) +{ + if (n < 32) + return mips_gpr_names[n]; + if (n == 64) + return "hi"; + if (n == 65) + return "lo"; + return NULL; +} diff --git a/tools/perf/arch/mips/util/perf_regs.c b/tools/perf/arch/mips/util/perf_regs.c new file mode 100644 index 000000000000..2864e2e3776d --- /dev/null +++ b/tools/perf/arch/mips/util/perf_regs.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../util/perf_regs.h" + +const struct sample_reg sample_reg_masks[] = { + SMPL_REG_END +}; diff --git a/tools/perf/arch/mips/util/unwind-libunwind.c b/tools/perf/arch/mips/util/unwind-libunwind.c new file mode 100644 index 000000000000..0d8c99c29da6 --- /dev/null +++ b/tools/perf/arch/mips/util/unwind-libunwind.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "perf_regs.h" +#include "../../util/unwind.h" +#include "util/debug.h" + +int libunwind__arch_reg_id(int regnum) +{ + switch (regnum) { + case UNW_MIPS_R1 ... UNW_MIPS_R25: + return regnum - UNW_MIPS_R1 + PERF_REG_MIPS_R1; + case UNW_MIPS_R28 ... UNW_MIPS_R31: + return regnum - UNW_MIPS_R28 + PERF_REG_MIPS_R28; + case UNW_MIPS_PC: + return PERF_REG_MIPS_PC; + default: + pr_err("unwind: invalid reg id %d\n", regnum); + return -EINVAL; + } +} diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c index 1b49ecee5aff..3fa4486742cd 100644 --- a/tools/perf/util/dwarf-regs.c +++ b/tools/perf/util/dwarf-regs.c @@ -24,6 +24,7 @@ #include "../arch/s390/include/dwarf-regs-table.h" #include "../arch/sparc/include/dwarf-regs-table.h" #include "../arch/xtensa/include/dwarf-regs-table.h" +#include "../arch/mips/include/dwarf-regs-table.h" #define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL) @@ -53,6 +54,8 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine) return __get_dwarf_regstr(sparc_regstr_tbl, n); case EM_XTENSA: return __get_dwarf_regstr(xtensa_regstr_tbl, n); + case EM_MIPS: + return __get_dwarf_regstr(mips_regstr_tbl, n); default: pr_err("ELF MACHINE %x is not supported.\n", machine); } -- cgit v1.2.3 From d9fd5a718977702f2fd112a081b62572e39f24db Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 4 Feb 2021 11:35:24 +0800 Subject: perf tools: Generate mips syscalls_n64.c syscall table Grab a copy of arch/mips/kernel/syscalls/syscall_n64.tbl and use it to generate tools/perf/arch/mips/include/generated/asm/syscalls_n64.c file, this is similar with commit 1b700c997500 ("perf tools: Build syscall table .c header from kernel's syscall_64.tbl") Signed-off-by: Tiezhu Yang Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Juxin Gao Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Xuefeng Li Cc: linux-mips@vger.kernel.org Link: http://lore.kernel.org/lkml/1612409724-3516-4-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 3 +- tools/perf/arch/mips/Makefile | 18 ++ tools/perf/arch/mips/entry/syscalls/mksyscalltbl | 32 ++ .../perf/arch/mips/entry/syscalls/syscall_n64.tbl | 358 +++++++++++++++++++++ tools/perf/check-headers.sh | 1 + tools/perf/util/syscalltbl.c | 4 + 6 files changed, 415 insertions(+), 1 deletion(-) create mode 100644 tools/perf/arch/mips/entry/syscalls/mksyscalltbl create mode 100644 tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index bd0c93c35379..3514fe956ed1 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -32,7 +32,7 @@ ifneq ($(NO_SYSCALL_TABLE),1) NO_SYSCALL_TABLE := 0 endif else - ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390)) + ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390 mips)) NO_SYSCALL_TABLE := 0 endif endif @@ -89,6 +89,7 @@ endif ifeq ($(ARCH),mips) NO_PERF_REGS := 0 + CFLAGS += -I$(OUTPUT)arch/mips/include/generated CFLAGS += -I../../arch/mips/include/uapi -I../../arch/mips/include/generated/uapi LIBUNWIND_LIBS = -lunwind -lunwind-mips endif diff --git a/tools/perf/arch/mips/Makefile b/tools/perf/arch/mips/Makefile index 6e1106fab26e..8bc09072e3d6 100644 --- a/tools/perf/arch/mips/Makefile +++ b/tools/perf/arch/mips/Makefile @@ -2,3 +2,21 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 endif + +# Syscall table generation for perf +out := $(OUTPUT)arch/mips/include/generated/asm +header := $(out)/syscalls_n64.c +sysprf := $(srctree)/tools/perf/arch/mips/entry/syscalls +sysdef := $(sysprf)/syscall_n64.tbl +systbl := $(sysprf)/mksyscalltbl + +# Create output directory if not already present +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') + +$(header): $(sysdef) $(systbl) + $(Q)$(SHELL) '$(systbl)' $(sysdef) > $@ + +clean:: + $(call QUIET_CLEAN, mips) $(RM) $(header) + +archheaders: $(header) diff --git a/tools/perf/arch/mips/entry/syscalls/mksyscalltbl b/tools/perf/arch/mips/entry/syscalls/mksyscalltbl new file mode 100644 index 000000000000..fb1f49451af6 --- /dev/null +++ b/tools/perf/arch/mips/entry/syscalls/mksyscalltbl @@ -0,0 +1,32 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Generate system call table for perf. Derived from +# s390 script. +# +# Author(s): Hendrik Brueckner +# Changed by: Tiezhu Yang + +SYSCALL_TBL=$1 + +if ! test -r $SYSCALL_TBL; then + echo "Could not read input file" >&2 + exit 1 +fi + +create_table() +{ + local max_nr nr abi sc discard + + echo 'static const char *syscalltbl_mips_n64[] = {' + while read nr abi sc discard; do + printf '\t[%d] = "%s",\n' $nr $sc + max_nr=$nr + done + echo '};' + echo "#define SYSCALLTBL_MIPS_N64_MAX_ID $max_nr" +} + +grep -E "^[[:digit:]]+[[:space:]]+(n64)" $SYSCALL_TBL \ + |sort -k1 -n \ + |create_table diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl new file mode 100644 index 000000000000..91649690b52f --- /dev/null +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -0,0 +1,358 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +# +# system call numbers and entry vectors for mips +# +# The format is: +# +# +# The is always "n64" for this file. +# +0 n64 read sys_read +1 n64 write sys_write +2 n64 open sys_open +3 n64 close sys_close +4 n64 stat sys_newstat +5 n64 fstat sys_newfstat +6 n64 lstat sys_newlstat +7 n64 poll sys_poll +8 n64 lseek sys_lseek +9 n64 mmap sys_mips_mmap +10 n64 mprotect sys_mprotect +11 n64 munmap sys_munmap +12 n64 brk sys_brk +13 n64 rt_sigaction sys_rt_sigaction +14 n64 rt_sigprocmask sys_rt_sigprocmask +15 n64 ioctl sys_ioctl +16 n64 pread64 sys_pread64 +17 n64 pwrite64 sys_pwrite64 +18 n64 readv sys_readv +19 n64 writev sys_writev +20 n64 access sys_access +21 n64 pipe sysm_pipe +22 n64 _newselect sys_select +23 n64 sched_yield sys_sched_yield +24 n64 mremap sys_mremap +25 n64 msync sys_msync +26 n64 mincore sys_mincore +27 n64 madvise sys_madvise +28 n64 shmget sys_shmget +29 n64 shmat sys_shmat +30 n64 shmctl sys_old_shmctl +31 n64 dup sys_dup +32 n64 dup2 sys_dup2 +33 n64 pause sys_pause +34 n64 nanosleep sys_nanosleep +35 n64 getitimer sys_getitimer +36 n64 setitimer sys_setitimer +37 n64 alarm sys_alarm +38 n64 getpid sys_getpid +39 n64 sendfile sys_sendfile64 +40 n64 socket sys_socket +41 n64 connect sys_connect +42 n64 accept sys_accept +43 n64 sendto sys_sendto +44 n64 recvfrom sys_recvfrom +45 n64 sendmsg sys_sendmsg +46 n64 recvmsg sys_recvmsg +47 n64 shutdown sys_shutdown +48 n64 bind sys_bind +49 n64 listen sys_listen +50 n64 getsockname sys_getsockname +51 n64 getpeername sys_getpeername +52 n64 socketpair sys_socketpair +53 n64 setsockopt sys_setsockopt +54 n64 getsockopt sys_getsockopt +55 n64 clone __sys_clone +56 n64 fork __sys_fork +57 n64 execve sys_execve +58 n64 exit sys_exit +59 n64 wait4 sys_wait4 +60 n64 kill sys_kill +61 n64 uname sys_newuname +62 n64 semget sys_semget +63 n64 semop sys_semop +64 n64 semctl sys_old_semctl +65 n64 shmdt sys_shmdt +66 n64 msgget sys_msgget +67 n64 msgsnd sys_msgsnd +68 n64 msgrcv sys_msgrcv +69 n64 msgctl sys_old_msgctl +70 n64 fcntl sys_fcntl +71 n64 flock sys_flock +72 n64 fsync sys_fsync +73 n64 fdatasync sys_fdatasync +74 n64 truncate sys_truncate +75 n64 ftruncate sys_ftruncate +76 n64 getdents sys_getdents +77 n64 getcwd sys_getcwd +78 n64 chdir sys_chdir +79 n64 fchdir sys_fchdir +80 n64 rename sys_rename +81 n64 mkdir sys_mkdir +82 n64 rmdir sys_rmdir +83 n64 creat sys_creat +84 n64 link sys_link +85 n64 unlink sys_unlink +86 n64 symlink sys_symlink +87 n64 readlink sys_readlink +88 n64 chmod sys_chmod +89 n64 fchmod sys_fchmod +90 n64 chown sys_chown +91 n64 fchown sys_fchown +92 n64 lchown sys_lchown +93 n64 umask sys_umask +94 n64 gettimeofday sys_gettimeofday +95 n64 getrlimit sys_getrlimit +96 n64 getrusage sys_getrusage +97 n64 sysinfo sys_sysinfo +98 n64 times sys_times +99 n64 ptrace sys_ptrace +100 n64 getuid sys_getuid +101 n64 syslog sys_syslog +102 n64 getgid sys_getgid +103 n64 setuid sys_setuid +104 n64 setgid sys_setgid +105 n64 geteuid sys_geteuid +106 n64 getegid sys_getegid +107 n64 setpgid sys_setpgid +108 n64 getppid sys_getppid +109 n64 getpgrp sys_getpgrp +110 n64 setsid sys_setsid +111 n64 setreuid sys_setreuid +112 n64 setregid sys_setregid +113 n64 getgroups sys_getgroups +114 n64 setgroups sys_setgroups +115 n64 setresuid sys_setresuid +116 n64 getresuid sys_getresuid +117 n64 setresgid sys_setresgid +118 n64 getresgid sys_getresgid +119 n64 getpgid sys_getpgid +120 n64 setfsuid sys_setfsuid +121 n64 setfsgid sys_setfsgid +122 n64 getsid sys_getsid +123 n64 capget sys_capget +124 n64 capset sys_capset +125 n64 rt_sigpending sys_rt_sigpending +126 n64 rt_sigtimedwait sys_rt_sigtimedwait +127 n64 rt_sigqueueinfo sys_rt_sigqueueinfo +128 n64 rt_sigsuspend sys_rt_sigsuspend +129 n64 sigaltstack sys_sigaltstack +130 n64 utime sys_utime +131 n64 mknod sys_mknod +132 n64 personality sys_personality +133 n64 ustat sys_ustat +134 n64 statfs sys_statfs +135 n64 fstatfs sys_fstatfs +136 n64 sysfs sys_sysfs +137 n64 getpriority sys_getpriority +138 n64 setpriority sys_setpriority +139 n64 sched_setparam sys_sched_setparam +140 n64 sched_getparam sys_sched_getparam +141 n64 sched_setscheduler sys_sched_setscheduler +142 n64 sched_getscheduler sys_sched_getscheduler +143 n64 sched_get_priority_max sys_sched_get_priority_max +144 n64 sched_get_priority_min sys_sched_get_priority_min +145 n64 sched_rr_get_interval sys_sched_rr_get_interval +146 n64 mlock sys_mlock +147 n64 munlock sys_munlock +148 n64 mlockall sys_mlockall +149 n64 munlockall sys_munlockall +150 n64 vhangup sys_vhangup +151 n64 pivot_root sys_pivot_root +152 n64 _sysctl sys_ni_syscall +153 n64 prctl sys_prctl +154 n64 adjtimex sys_adjtimex +155 n64 setrlimit sys_setrlimit +156 n64 chroot sys_chroot +157 n64 sync sys_sync +158 n64 acct sys_acct +159 n64 settimeofday sys_settimeofday +160 n64 mount sys_mount +161 n64 umount2 sys_umount +162 n64 swapon sys_swapon +163 n64 swapoff sys_swapoff +164 n64 reboot sys_reboot +165 n64 sethostname sys_sethostname +166 n64 setdomainname sys_setdomainname +167 n64 create_module sys_ni_syscall +168 n64 init_module sys_init_module +169 n64 delete_module sys_delete_module +170 n64 get_kernel_syms sys_ni_syscall +171 n64 query_module sys_ni_syscall +172 n64 quotactl sys_quotactl +173 n64 nfsservctl sys_ni_syscall +174 n64 getpmsg sys_ni_syscall +175 n64 putpmsg sys_ni_syscall +176 n64 afs_syscall sys_ni_syscall +# 177 reserved for security +177 n64 reserved177 sys_ni_syscall +178 n64 gettid sys_gettid +179 n64 readahead sys_readahead +180 n64 setxattr sys_setxattr +181 n64 lsetxattr sys_lsetxattr +182 n64 fsetxattr sys_fsetxattr +183 n64 getxattr sys_getxattr +184 n64 lgetxattr sys_lgetxattr +185 n64 fgetxattr sys_fgetxattr +186 n64 listxattr sys_listxattr +187 n64 llistxattr sys_llistxattr +188 n64 flistxattr sys_flistxattr +189 n64 removexattr sys_removexattr +190 n64 lremovexattr sys_lremovexattr +191 n64 fremovexattr sys_fremovexattr +192 n64 tkill sys_tkill +193 n64 reserved193 sys_ni_syscall +194 n64 futex sys_futex +195 n64 sched_setaffinity sys_sched_setaffinity +196 n64 sched_getaffinity sys_sched_getaffinity +197 n64 cacheflush sys_cacheflush +198 n64 cachectl sys_cachectl +199 n64 sysmips __sys_sysmips +200 n64 io_setup sys_io_setup +201 n64 io_destroy sys_io_destroy +202 n64 io_getevents sys_io_getevents +203 n64 io_submit sys_io_submit +204 n64 io_cancel sys_io_cancel +205 n64 exit_group sys_exit_group +206 n64 lookup_dcookie sys_lookup_dcookie +207 n64 epoll_create sys_epoll_create +208 n64 epoll_ctl sys_epoll_ctl +209 n64 epoll_wait sys_epoll_wait +210 n64 remap_file_pages sys_remap_file_pages +211 n64 rt_sigreturn sys_rt_sigreturn +212 n64 set_tid_address sys_set_tid_address +213 n64 restart_syscall sys_restart_syscall +214 n64 semtimedop sys_semtimedop +215 n64 fadvise64 sys_fadvise64_64 +216 n64 timer_create sys_timer_create +217 n64 timer_settime sys_timer_settime +218 n64 timer_gettime sys_timer_gettime +219 n64 timer_getoverrun sys_timer_getoverrun +220 n64 timer_delete sys_timer_delete +221 n64 clock_settime sys_clock_settime +222 n64 clock_gettime sys_clock_gettime +223 n64 clock_getres sys_clock_getres +224 n64 clock_nanosleep sys_clock_nanosleep +225 n64 tgkill sys_tgkill +226 n64 utimes sys_utimes +227 n64 mbind sys_mbind +228 n64 get_mempolicy sys_get_mempolicy +229 n64 set_mempolicy sys_set_mempolicy +230 n64 mq_open sys_mq_open +231 n64 mq_unlink sys_mq_unlink +232 n64 mq_timedsend sys_mq_timedsend +233 n64 mq_timedreceive sys_mq_timedreceive +234 n64 mq_notify sys_mq_notify +235 n64 mq_getsetattr sys_mq_getsetattr +236 n64 vserver sys_ni_syscall +237 n64 waitid sys_waitid +# 238 was sys_setaltroot +239 n64 add_key sys_add_key +240 n64 request_key sys_request_key +241 n64 keyctl sys_keyctl +242 n64 set_thread_area sys_set_thread_area +243 n64 inotify_init sys_inotify_init +244 n64 inotify_add_watch sys_inotify_add_watch +245 n64 inotify_rm_watch sys_inotify_rm_watch +246 n64 migrate_pages sys_migrate_pages +247 n64 openat sys_openat +248 n64 mkdirat sys_mkdirat +249 n64 mknodat sys_mknodat +250 n64 fchownat sys_fchownat +251 n64 futimesat sys_futimesat +252 n64 newfstatat sys_newfstatat +253 n64 unlinkat sys_unlinkat +254 n64 renameat sys_renameat +255 n64 linkat sys_linkat +256 n64 symlinkat sys_symlinkat +257 n64 readlinkat sys_readlinkat +258 n64 fchmodat sys_fchmodat +259 n64 faccessat sys_faccessat +260 n64 pselect6 sys_pselect6 +261 n64 ppoll sys_ppoll +262 n64 unshare sys_unshare +263 n64 splice sys_splice +264 n64 sync_file_range sys_sync_file_range +265 n64 tee sys_tee +266 n64 vmsplice sys_vmsplice +267 n64 move_pages sys_move_pages +268 n64 set_robust_list sys_set_robust_list +269 n64 get_robust_list sys_get_robust_list +270 n64 kexec_load sys_kexec_load +271 n64 getcpu sys_getcpu +272 n64 epoll_pwait sys_epoll_pwait +273 n64 ioprio_set sys_ioprio_set +274 n64 ioprio_get sys_ioprio_get +275 n64 utimensat sys_utimensat +276 n64 signalfd sys_signalfd +277 n64 timerfd sys_ni_syscall +278 n64 eventfd sys_eventfd +279 n64 fallocate sys_fallocate +280 n64 timerfd_create sys_timerfd_create +281 n64 timerfd_gettime sys_timerfd_gettime +282 n64 timerfd_settime sys_timerfd_settime +283 n64 signalfd4 sys_signalfd4 +284 n64 eventfd2 sys_eventfd2 +285 n64 epoll_create1 sys_epoll_create1 +286 n64 dup3 sys_dup3 +287 n64 pipe2 sys_pipe2 +288 n64 inotify_init1 sys_inotify_init1 +289 n64 preadv sys_preadv +290 n64 pwritev sys_pwritev +291 n64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +292 n64 perf_event_open sys_perf_event_open +293 n64 accept4 sys_accept4 +294 n64 recvmmsg sys_recvmmsg +295 n64 fanotify_init sys_fanotify_init +296 n64 fanotify_mark sys_fanotify_mark +297 n64 prlimit64 sys_prlimit64 +298 n64 name_to_handle_at sys_name_to_handle_at +299 n64 open_by_handle_at sys_open_by_handle_at +300 n64 clock_adjtime sys_clock_adjtime +301 n64 syncfs sys_syncfs +302 n64 sendmmsg sys_sendmmsg +303 n64 setns sys_setns +304 n64 process_vm_readv sys_process_vm_readv +305 n64 process_vm_writev sys_process_vm_writev +306 n64 kcmp sys_kcmp +307 n64 finit_module sys_finit_module +308 n64 getdents64 sys_getdents64 +309 n64 sched_setattr sys_sched_setattr +310 n64 sched_getattr sys_sched_getattr +311 n64 renameat2 sys_renameat2 +312 n64 seccomp sys_seccomp +313 n64 getrandom sys_getrandom +314 n64 memfd_create sys_memfd_create +315 n64 bpf sys_bpf +316 n64 execveat sys_execveat +317 n64 userfaultfd sys_userfaultfd +318 n64 membarrier sys_membarrier +319 n64 mlock2 sys_mlock2 +320 n64 copy_file_range sys_copy_file_range +321 n64 preadv2 sys_preadv2 +322 n64 pwritev2 sys_pwritev2 +323 n64 pkey_mprotect sys_pkey_mprotect +324 n64 pkey_alloc sys_pkey_alloc +325 n64 pkey_free sys_pkey_free +326 n64 statx sys_statx +327 n64 rseq sys_rseq +328 n64 io_pgetevents sys_io_pgetevents +# 329 through 423 are reserved to sync up with other architectures +424 n64 pidfd_send_signal sys_pidfd_send_signal +425 n64 io_uring_setup sys_io_uring_setup +426 n64 io_uring_enter sys_io_uring_enter +427 n64 io_uring_register sys_io_uring_register +428 n64 open_tree sys_open_tree +429 n64 move_mount sys_move_mount +430 n64 fsopen sys_fsopen +431 n64 fsconfig sys_fsconfig +432 n64 fsmount sys_fsmount +433 n64 fspick sys_fspick +434 n64 pidfd_open sys_pidfd_open +435 n64 clone3 __sys_clone3 +436 n64 close_range sys_close_range +437 n64 openat2 sys_openat2 +438 n64 pidfd_getfd sys_pidfd_getfd +439 n64 faccessat2 sys_faccessat2 +440 n64 process_madvise sys_process_madvise +441 n64 epoll_pwait2 sys_epoll_pwait2 diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index dded93a2bc89..39eada935d4e 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -146,6 +146,7 @@ check arch/x86/lib/insn.c '-I "^#include [\"<]\(../include/\)*asm/in check_2 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl check_2 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl check_2 tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl +check_2 tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl for i in $BEAUTY_FILES; do beauty_check $i -B diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index 03bd99d3be16..a2e906858891 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -34,6 +34,10 @@ static const char **syscalltbl_native = syscalltbl_powerpc_32; #include const int syscalltbl_native_max_id = SYSCALLTBL_ARM64_MAX_ID; static const char **syscalltbl_native = syscalltbl_arm64; +#elif defined(__mips__) +#include +const int syscalltbl_native_max_id = SYSCALLTBL_MIPS_N64_MAX_ID; +static const char **syscalltbl_native = syscalltbl_mips_n64; #endif struct syscall { -- cgit v1.2.3 From 9bb8b74bdb186bd378cd5d510e8261538ca40094 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 19 Nov 2020 22:30:37 -0800 Subject: perf docs: Add man pages to see also Add all other man pages to the "see also" list except for perf-script-perl and perf-script-python that are linked to from perf-script. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20201120063037.3166069-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt index c130a3c46a90..9c330cdfa973 100644 --- a/tools/perf/Documentation/perf.txt +++ b/tools/perf/Documentation/perf.txt @@ -76,3 +76,15 @@ SEE ALSO linkperf:perf-stat[1], linkperf:perf-top[1], linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-list[1] + +linkperf:perf-annotate[1],linkperf:perf-archive[1], +linkperf:perf-bench[1], linkperf:perf-buildid-cache[1], +linkperf:perf-buildid-list[1], linkperf:perf-c2c[1], +linkperf:perf-config[1], linkperf:perf-data[1], linkperf:perf-diff[1], +linkperf:perf-evlist[1], linkperf:perf-ftrace[1], +linkperf:perf-help[1], linkperf:perf-inject[1], +linkperf:perf-intel-pt[1], linkperf:perf-kallsyms[1], +linkperf:perf-kmem[1], linkperf:perf-kvm[1], linkperf:perf-lock[1], +linkperf:perf-mem[1], linkperf:perf-probe[1], linkperf:perf-sched[1], +linkperf:perf-script[1], linkperf:perf-test[1], +linkperf:perf-trace[1], linkperf:perf-version[1] -- cgit v1.2.3 From 34968b9327c83589e2867af3c5b9fd993666a514 Mon Sep 17 00:00:00 2001 From: Nicholas Fraser Date: Wed, 24 Feb 2021 14:59:16 -0500 Subject: perf buildid-cache: Add test for PE executable This builds on the previous changes to tests/shell/buildid.sh, adding tests for a PE file. It adds it to the build-id cache manually and, if Wine is available, runs it under "perf record" and verifies that it was added automatically. If wine is not installed, only warnings are printed; the test can still exit 0. Signed-off-by: Nicholas Fraser Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Huw Davies Cc: Ian Rogers Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ulrich Czekalla Link: http://lore.kernel.org/lkml/790bfe67-2155-a426-7130-ae7c45cb055b@codeweavers.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/buildid.sh | 65 ++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/buildid.sh b/tools/perf/tests/shell/buildid.sh index 416af614bbe0..f05670d1e39e 100755 --- a/tools/perf/tests/shell/buildid.sh +++ b/tools/perf/tests/shell/buildid.sh @@ -14,18 +14,56 @@ if ! [ -x "$(command -v cc)" ]; then exit 2 fi +# check what we need to test windows binaries +add_pe=1 +run_pe=1 +if ! perf version --build-options | grep -q 'libbfd: .* on '; then + echo "WARNING: perf not built with libbfd. PE binaries will not be tested." + add_pe=0 + run_pe=0 +fi +if ! which wine > /dev/null; then + echo "WARNING: wine not found. PE binaries will not be run." + run_pe=0 +fi + +# set up wine +if [ ${run_pe} -eq 1 ]; then + wineprefix=$(mktemp -d /tmp/perf.wineprefix.XXX) + export WINEPREFIX=${wineprefix} + # clear display variables to prevent wine from popping up dialogs + unset DISPLAY + unset WAYLAND_DISPLAY +fi + ex_md5=$(mktemp /tmp/perf.ex.MD5.XXX) ex_sha1=$(mktemp /tmp/perf.ex.SHA1.XXX) +ex_pe=$(dirname $0)/../pe-file.exe echo 'int main(void) { return 0; }' | cc -Wl,--build-id=sha1 -o ${ex_sha1} -x c - echo 'int main(void) { return 0; }' | cc -Wl,--build-id=md5 -o ${ex_md5} -x c - -echo "test binaries: ${ex_sha1} ${ex_md5}" +echo "test binaries: ${ex_sha1} ${ex_md5} ${ex_pe}" check() { - id=`readelf -n ${1} 2>/dev/null | grep 'Build ID' | awk '{print $3}'` - + case $1 in + *.exe) + # We don't have a tool that can pull a nicely formatted build-id out of + # a PE file, but we can extract the whole section with objcopy and + # format it ourselves. The .buildid section is a Debug Directory + # containing a CodeView entry: + # https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#debug-directory-image-only + # https://github.com/dotnet/runtime/blob/da94c022576a5c3bbc0e896f006565905eb137f9/docs/design/specs/PE-COFF.md + # The build-id starts at byte 33 and must be rearranged into a GUID. + id=`objcopy -O binary --only-section=.buildid $1 /dev/stdout | \ + cut -c 33-48 | hexdump -ve '/1 "%02x"' | \ + sed 's@^\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(.*\)0a$@\4\3\2\1\6\5\8\7\9@'` + ;; + *) + id=`readelf -n ${1} 2>/dev/null | grep 'Build ID' | awk '{print $3}'` + ;; + esac echo "build id: ${id}" link=${build_id_dir}/.build-id/${id:0:2}/${id:2} @@ -50,7 +88,7 @@ check() exit 1 fi - ${perf} buildid-cache -l | grep $id + ${perf} buildid-cache -l | grep ${id} if [ $? -ne 0 ]; then echo "failed: ${id} is not reported by \"perf buildid-cache -l\"" exit 1 @@ -79,16 +117,20 @@ test_record() { data=$(mktemp /tmp/perf.data.XXX) build_id_dir=$(mktemp -d /tmp/perf.debug.XXX) + log=$(mktemp /tmp/perf.log.XXX) perf="perf --buildid-dir ${build_id_dir}" - ${perf} record --buildid-all -o ${data} ${1} + echo "running: perf record $@" + ${perf} record --buildid-all -o ${data} $@ &> ${log} if [ $? -ne 0 ]; then - echo "failed: record ${1}" + echo "failed: record $@" + echo "see log: ${log}" exit 1 fi - check ${1} + check ${@: -1} + rm -f ${log} rm -rf ${build_id_dir} rm -rf ${data} } @@ -96,12 +138,21 @@ test_record() # add binaries manual via perf buildid-cache -a test_add ${ex_sha1} test_add ${ex_md5} +if [ ${add_pe} -eq 1 ]; then + test_add ${ex_pe} +fi # add binaries via perf record post processing test_record ${ex_sha1} test_record ${ex_md5} +if [ ${run_pe} -eq 1 ]; then + test_record wine ${ex_pe} +fi # cleanup rm ${ex_sha1} ${ex_md5} +if [ ${run_pe} -eq 1 ]; then + rm -r ${wineprefix} +fi exit ${err} -- cgit v1.2.3 From 83bf6fb8b076c72fe42e7d0fab5a5c98b5e2a11a Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Wed, 24 Feb 2021 12:14:36 -0600 Subject: perf vendor events power9: Remove unsupported metrics Several metrics are defined based on unsupported / non-existent events, and silently discarded. Remove them for good code hygiene and to avoid confusion. Signed-off-by: Paul A. Clarke Reviewed-by: Kajol Jain Cc: Ananth N Mavinakayanahalli Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Sukadev Bhattiprolu Link: https://lore.kernel.org/r/20210224181436.782091-1-pc@us.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/powerpc/power9/metrics.json | 132 --------------------- 1 file changed, 132 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/power9/metrics.json b/tools/perf/pmu-events/arch/powerpc/power9/metrics.json index f8784c608479..140402d2855f 100644 --- a/tools/perf/pmu-events/arch/powerpc/power9/metrics.json +++ b/tools/perf/pmu-events/arch/powerpc/power9/metrics.json @@ -1209,156 +1209,24 @@ "MetricGroup": "instruction_stats_percent_per_ref", "MetricName": "inst_from_rmem_percent" }, - { - "BriefDescription": "%L2 Modified CO Cache read Utilization (4 pclks per disp attempt)", - "MetricExpr": "((PM_L2_CASTOUT_MOD/2)*4)/ PM_RUN_CYC * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_co_m_rd_util" - }, - { - "BriefDescription": "L2 dcache invalidates per run inst (per core)", - "MetricExpr": "(PM_L2_DC_INV / 2) / PM_RUN_INST_CMPL * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_dc_inv_rate_percent" - }, { "BriefDescription": "Demand load misses as a % of L2 LD dispatches (per thread)", "MetricExpr": "PM_L1_DCACHE_RELOAD_VALID / (PM_L2_LD / 2) * 100", "MetricGroup": "l2_stats", "MetricName": "l2_dem_ld_disp_percent" }, - { - "BriefDescription": "L2 Icache invalidates per run inst (per core)", - "MetricExpr": "(PM_L2_IC_INV / 2) / PM_RUN_INST_CMPL * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_ic_inv_rate_percent" - }, - { - "BriefDescription": "L2 Inst misses as a % of total L2 Inst dispatches (per thread)", - "MetricExpr": "PM_L2_INST_MISS / PM_L2_INST * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_inst_miss_ratio_percent" - }, - { - "BriefDescription": "Average number of cycles between L2 Load hits", - "MetricExpr": "(PM_L2_LD_HIT / PM_RUN_CYC) / 2", - "MetricGroup": "l2_stats", - "MetricName": "l2_ld_hit_frequency" - }, - { - "BriefDescription": "Average number of cycles between L2 Load misses", - "MetricExpr": "(PM_L2_LD_MISS / PM_RUN_CYC) / 2", - "MetricGroup": "l2_stats", - "MetricName": "l2_ld_miss_frequency" - }, - { - "BriefDescription": "L2 Load misses as a % of total L2 Load dispatches (per thread)", - "MetricExpr": "PM_L2_LD_MISS / PM_L2_LD * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_ld_miss_ratio_percent" - }, - { - "BriefDescription": "% L2 load disp attempts Cache read Utilization (4 pclks per disp attempt)", - "MetricExpr": "((PM_L2_RCLD_DISP/2)*4)/ PM_RUN_CYC * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_ld_rd_util" - }, - { - "BriefDescription": "L2 load misses that require a cache write (4 pclks per disp attempt) % of pclks", - "MetricExpr": "((( PM_L2_LD_DISP - PM_L2_LD_HIT)/2)*4)/ PM_RUN_CYC * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_ldmiss_wr_util" - }, - { - "BriefDescription": "L2 local pump prediction success", - "MetricExpr": "PM_L2_LOC_GUESS_CORRECT / (PM_L2_LOC_GUESS_CORRECT + PM_L2_LOC_GUESS_WRONG) * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_local_pred_correct_percent" - }, - { - "BriefDescription": "L2 COs that were in M,Me,Mu state as a % of all L2 COs", - "MetricExpr": "PM_L2_CASTOUT_MOD / (PM_L2_CASTOUT_MOD + PM_L2_CASTOUT_SHR) * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_mod_co_percent" - }, - { - "BriefDescription": "% of L2 Load RC dispatch atampts that failed because of address collisions and cclass conflicts", - "MetricExpr": "(PM_L2_RCLD_DISP_FAIL_ADDR )/ PM_L2_RCLD_DISP * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_rc_ld_disp_addr_fail_percent" - }, - { - "BriefDescription": "% of L2 Load RC dispatch attempts that failed", - "MetricExpr": "(PM_L2_RCLD_DISP_FAIL_ADDR + PM_L2_RCLD_DISP_FAIL_OTHER)/ PM_L2_RCLD_DISP * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_rc_ld_disp_fail_percent" - }, - { - "BriefDescription": "% of L2 Store RC dispatch atampts that failed because of address collisions and cclass conflicts", - "MetricExpr": "PM_L2_RCST_DISP_FAIL_ADDR / PM_L2_RCST_DISP * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_rc_st_disp_addr_fail_percent" - }, - { - "BriefDescription": "% of L2 Store RC dispatch attempts that failed", - "MetricExpr": "(PM_L2_RCST_DISP_FAIL_ADDR + PM_L2_RCST_DISP_FAIL_OTHER)/ PM_L2_RCST_DISP * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_rc_st_disp_fail_percent" - }, - { - "BriefDescription": "L2 Cache Read Utilization (per core)", - "MetricExpr": "(((PM_L2_RCLD_DISP/2)*4)/ PM_RUN_CYC * 100) + (((PM_L2_RCST_DISP/2)*4)/PM_RUN_CYC * 100) + (((PM_L2_CASTOUT_MOD/2)*4)/PM_RUN_CYC * 100)", - "MetricGroup": "l2_stats", - "MetricName": "l2_rd_util_percent" - }, - { - "BriefDescription": "L2 COs that were in T,Te,Si,S state as a % of all L2 COs", - "MetricExpr": "PM_L2_CASTOUT_SHR / (PM_L2_CASTOUT_MOD + PM_L2_CASTOUT_SHR) * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_shr_co_percent" - }, { "BriefDescription": "L2 Store misses as a % of total L2 Store dispatches (per thread)", "MetricExpr": "PM_L2_ST_MISS / PM_L2_ST * 100", "MetricGroup": "l2_stats", "MetricName": "l2_st_miss_ratio_percent" }, - { - "BriefDescription": "% L2 store disp attempts Cache read Utilization (4 pclks per disp attempt)", - "MetricExpr": "((PM_L2_RCST_DISP/2)*4) / PM_RUN_CYC * 100", - "MetricGroup": "l2_stats", - "MetricName": "l2_st_rd_util" - }, { "BriefDescription": "L2 stores that require a cache write (4 pclks per disp attempt) % of pclks", "MetricExpr": "((PM_L2_ST_DISP/2)*4) / PM_RUN_CYC * 100", "MetricGroup": "l2_stats", "MetricName": "l2_st_wr_util" }, - { - "BriefDescription": "L2 Cache Write Utilization (per core)", - "MetricExpr": "((((PM_L2_LD_DISP - PM_L2_LD_HIT)/2)*4) / PM_RUN_CYC * 100) + (((PM_L2_ST_DISP/2)*4) / PM_RUN_CYC * 100)", - "MetricGroup": "l2_stats", - "MetricName": "l2_wr_util_percent" - }, - { - "BriefDescription": "Average number of cycles between L3 Load hits", - "MetricExpr": "(PM_L3_LD_HIT / PM_RUN_CYC) / 2", - "MetricGroup": "l3_stats", - "MetricName": "l3_ld_hit_frequency" - }, - { - "BriefDescription": "Average number of cycles between L3 Load misses", - "MetricExpr": "(PM_L3_LD_MISS / PM_RUN_CYC) / 2", - "MetricGroup": "l3_stats", - "MetricName": "l3_ld_miss_frequency" - }, - { - "BriefDescription": "Average number of Write-in machines used. 1 of 8 WI machines is sampled every L3 cycle", - "MetricExpr": "(PM_L3_WI_USAGE / PM_RUN_CYC) * 8", - "MetricGroup": "l3_stats", - "MetricName": "l3_wi_usage" - }, { "BriefDescription": "Average icache miss latency", "MetricExpr": "PM_IC_DEMAND_CYC / PM_IC_DEMAND_REQ", -- cgit v1.2.3 From 42b2b570b34afb5fb9dc16ac77cb332194136a85 Mon Sep 17 00:00:00 2001 From: Mike Leach Date: Wed, 24 Feb 2021 09:48:30 -0700 Subject: perf cs-etm: Update ETM metadata format The current fixed metadata version format (version 0), means that adding metadata parameter items renders files from a previous version of perf unreadable. Per CPU parameters appear in a fixed order, but there is no field to indicate the number of ETM parameters per CPU. This patch updates the per CPU parameter blocks to include a NR_PARAMs value which indicates the number of parameters in the block. The header version is incremented to 1. Fixed ordering is retained, new ETM parameters are added to the end of the list. The reader code is updated to be able to read current version 0 files, For version 1, the reader will read the number of parameters in the per CPU block. This allows the reader to process older or newer files that may have different numbers of parameters than in use at the time perf was built. Signed-off-by: Mike Leach Reviewed-by: Leo Yan Tested-by: Leo Yan Link: https://lore.kernel.org/r/20210202214040.32349-1-mike.leach@linaro.org Link: https://lore.kernel.org/r/20210224164835.3497311-2-mathieu.poirier@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 7 +- tools/perf/util/cs-etm.c | 235 +++++++++++++++++++++++++++++++------- tools/perf/util/cs-etm.h | 30 ++++- 3 files changed, 223 insertions(+), 49 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index bd446aba64f7..b0470f2a955a 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -572,7 +572,7 @@ static void cs_etm_get_metadata(int cpu, u32 *offset, struct auxtrace_record *itr, struct perf_record_auxtrace_info *info) { - u32 increment; + u32 increment, nr_trc_params; u64 magic; struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); @@ -607,6 +607,7 @@ static void cs_etm_get_metadata(int cpu, u32 *offset, /* How much space was used */ increment = CS_ETMV4_PRIV_MAX; + nr_trc_params = CS_ETMV4_PRIV_MAX - CS_ETMV4_TRCCONFIGR; } else { magic = __perf_cs_etmv3_magic; /* Get configuration register */ @@ -624,11 +625,13 @@ static void cs_etm_get_metadata(int cpu, u32 *offset, /* How much space was used */ increment = CS_ETM_PRIV_MAX; + nr_trc_params = CS_ETM_PRIV_MAX - CS_ETM_ETMCR; } /* Build generic header portion */ info->priv[*offset + CS_ETM_MAGIC] = magic; info->priv[*offset + CS_ETM_CPU] = cpu; + info->priv[*offset + CS_ETM_NR_TRC_PARAMS] = nr_trc_params; /* Where the next CPU entry should start from */ *offset += increment; } @@ -674,7 +677,7 @@ static int cs_etm_info_fill(struct auxtrace_record *itr, /* First fill out the session header */ info->type = PERF_AUXTRACE_CS_ETM; - info->priv[CS_HEADER_VERSION_0] = 0; + info->priv[CS_HEADER_VERSION] = CS_HEADER_CURRENT_VERSION; info->priv[CS_PMU_TYPE_CPUS] = type << 32; info->priv[CS_PMU_TYPE_CPUS] |= nr_cpu; info->priv[CS_ETM_SNAPSHOT] = ptr->snapshot_mode; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index a2a369e2fbb6..ee32d023e9bd 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -2435,7 +2435,7 @@ static bool cs_etm__is_timeless_decoding(struct cs_etm_auxtrace *etm) } static const char * const cs_etm_global_header_fmts[] = { - [CS_HEADER_VERSION_0] = " Header version %llx\n", + [CS_HEADER_VERSION] = " Header version %llx\n", [CS_PMU_TYPE_CPUS] = " PMU type/num cpus %llx\n", [CS_ETM_SNAPSHOT] = " Snapshot %llx\n", }; @@ -2443,6 +2443,7 @@ static const char * const cs_etm_global_header_fmts[] = { static const char * const cs_etm_priv_fmts[] = { [CS_ETM_MAGIC] = " Magic number %llx\n", [CS_ETM_CPU] = " CPU %lld\n", + [CS_ETM_NR_TRC_PARAMS] = " NR_TRC_PARAMS %llx\n", [CS_ETM_ETMCR] = " ETMCR %llx\n", [CS_ETM_ETMTRACEIDR] = " ETMTRACEIDR %llx\n", [CS_ETM_ETMCCER] = " ETMCCER %llx\n", @@ -2452,6 +2453,7 @@ static const char * const cs_etm_priv_fmts[] = { static const char * const cs_etmv4_priv_fmts[] = { [CS_ETM_MAGIC] = " Magic number %llx\n", [CS_ETM_CPU] = " CPU %lld\n", + [CS_ETM_NR_TRC_PARAMS] = " NR_TRC_PARAMS %llx\n", [CS_ETMV4_TRCCONFIGR] = " TRCCONFIGR %llx\n", [CS_ETMV4_TRCTRACEIDR] = " TRCTRACEIDR %llx\n", [CS_ETMV4_TRCIDR0] = " TRCIDR0 %llx\n", @@ -2461,26 +2463,167 @@ static const char * const cs_etmv4_priv_fmts[] = { [CS_ETMV4_TRCAUTHSTATUS] = " TRCAUTHSTATUS %llx\n", }; -static void cs_etm__print_auxtrace_info(__u64 *val, int num) +static const char * const param_unk_fmt = + " Unknown parameter [%d] %llx\n"; +static const char * const magic_unk_fmt = + " Magic number Unknown %llx\n"; + +static int cs_etm__print_cpu_metadata_v0(__u64 *val, int *offset) { - int i, j, cpu = 0; + int i = *offset, j, nr_params = 0, fmt_offset; + __u64 magic; - for (i = 0; i < CS_HEADER_VERSION_0_MAX; i++) - fprintf(stdout, cs_etm_global_header_fmts[i], val[i]); + /* check magic value */ + magic = val[i + CS_ETM_MAGIC]; + if ((magic != __perf_cs_etmv3_magic) && + (magic != __perf_cs_etmv4_magic)) { + /* failure - note bad magic value */ + fprintf(stdout, magic_unk_fmt, magic); + return -EINVAL; + } + + /* print common header block */ + fprintf(stdout, cs_etm_priv_fmts[CS_ETM_MAGIC], val[i++]); + fprintf(stdout, cs_etm_priv_fmts[CS_ETM_CPU], val[i++]); + + if (magic == __perf_cs_etmv3_magic) { + nr_params = CS_ETM_NR_TRC_PARAMS_V0; + fmt_offset = CS_ETM_ETMCR; + /* after common block, offset format index past NR_PARAMS */ + for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++) + fprintf(stdout, cs_etm_priv_fmts[j], val[i]); + } else if (magic == __perf_cs_etmv4_magic) { + nr_params = CS_ETMV4_NR_TRC_PARAMS_V0; + fmt_offset = CS_ETMV4_TRCCONFIGR; + /* after common block, offset format index past NR_PARAMS */ + for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++) + fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]); + } + *offset = i; + return 0; +} + +static int cs_etm__print_cpu_metadata_v1(__u64 *val, int *offset) +{ + int i = *offset, j, total_params = 0; + __u64 magic; + + magic = val[i + CS_ETM_MAGIC]; + /* total params to print is NR_PARAMS + common block size for v1 */ + total_params = val[i + CS_ETM_NR_TRC_PARAMS] + CS_ETM_COMMON_BLK_MAX_V1; - for (i = CS_HEADER_VERSION_0_MAX; cpu < num; cpu++) { - if (val[i] == __perf_cs_etmv3_magic) - for (j = 0; j < CS_ETM_PRIV_MAX; j++, i++) + if (magic == __perf_cs_etmv3_magic) { + for (j = 0; j < total_params; j++, i++) { + /* if newer record - could be excess params */ + if (j >= CS_ETM_PRIV_MAX) + fprintf(stdout, param_unk_fmt, j, val[i]); + else fprintf(stdout, cs_etm_priv_fmts[j], val[i]); - else if (val[i] == __perf_cs_etmv4_magic) - for (j = 0; j < CS_ETMV4_PRIV_MAX; j++, i++) + } + } else if (magic == __perf_cs_etmv4_magic) { + for (j = 0; j < total_params; j++, i++) { + /* if newer record - could be excess params */ + if (j >= CS_ETMV4_PRIV_MAX) + fprintf(stdout, param_unk_fmt, j, val[i]); + else fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]); - else - /* failure.. return */ + } + } else { + /* failure - note bad magic value and error out */ + fprintf(stdout, magic_unk_fmt, magic); + return -EINVAL; + } + *offset = i; + return 0; +} + +static void cs_etm__print_auxtrace_info(__u64 *val, int num) +{ + int i, cpu = 0, version, err; + + /* bail out early on bad header version */ + version = val[0]; + if (version > CS_HEADER_CURRENT_VERSION) { + /* failure.. return */ + fprintf(stdout, " Unknown Header Version = %x, ", version); + fprintf(stdout, "Version supported <= %x\n", CS_HEADER_CURRENT_VERSION); + return; + } + + for (i = 0; i < CS_HEADER_VERSION_MAX; i++) + fprintf(stdout, cs_etm_global_header_fmts[i], val[i]); + + for (i = CS_HEADER_VERSION_MAX; cpu < num; cpu++) { + if (version == 0) + err = cs_etm__print_cpu_metadata_v0(val, &i); + else if (version == 1) + err = cs_etm__print_cpu_metadata_v1(val, &i); + if (err) return; } } +/* + * Read a single cpu parameter block from the auxtrace_info priv block. + * + * For version 1 there is a per cpu nr_params entry. If we are handling + * version 1 file, then there may be less, the same, or more params + * indicated by this value than the compile time number we understand. + * + * For a version 0 info block, there are a fixed number, and we need to + * fill out the nr_param value in the metadata we create. + */ +static u64 *cs_etm__create_meta_blk(u64 *buff_in, int *buff_in_offset, + int out_blk_size, int nr_params_v0) +{ + u64 *metadata = NULL; + int hdr_version; + int nr_in_params, nr_out_params, nr_cmn_params; + int i, k; + + metadata = zalloc(sizeof(*metadata) * out_blk_size); + if (!metadata) + return NULL; + + /* read block current index & version */ + i = *buff_in_offset; + hdr_version = buff_in[CS_HEADER_VERSION]; + + if (!hdr_version) { + /* read version 0 info block into a version 1 metadata block */ + nr_in_params = nr_params_v0; + metadata[CS_ETM_MAGIC] = buff_in[i + CS_ETM_MAGIC]; + metadata[CS_ETM_CPU] = buff_in[i + CS_ETM_CPU]; + metadata[CS_ETM_NR_TRC_PARAMS] = nr_in_params; + /* remaining block params at offset +1 from source */ + for (k = CS_ETM_COMMON_BLK_MAX_V1 - 1; k < nr_in_params; k++) + metadata[k + 1] = buff_in[i + k]; + /* version 0 has 2 common params */ + nr_cmn_params = 2; + } else { + /* read version 1 info block - input and output nr_params may differ */ + /* version 1 has 3 common params */ + nr_cmn_params = 3; + nr_in_params = buff_in[i + CS_ETM_NR_TRC_PARAMS]; + + /* if input has more params than output - skip excess */ + nr_out_params = nr_in_params + nr_cmn_params; + if (nr_out_params > out_blk_size) + nr_out_params = out_blk_size; + + for (k = CS_ETM_MAGIC; k < nr_out_params; k++) + metadata[k] = buff_in[i + k]; + + /* record the actual nr params we copied */ + metadata[CS_ETM_NR_TRC_PARAMS] = nr_out_params - nr_cmn_params; + } + + /* adjust in offset by number of in params used */ + i += nr_in_params + nr_cmn_params; + *buff_in_offset = i; + return metadata; +} + int cs_etm__process_auxtrace_info(union perf_event *event, struct perf_session *session) { @@ -2492,11 +2635,12 @@ int cs_etm__process_auxtrace_info(union perf_event *event, int info_header_size; int total_size = auxtrace_info->header.size; int priv_size = 0; - int num_cpu; - int err = 0, idx = -1; - int i, j, k; + int num_cpu, trcidr_idx; + int err = 0; + int i, j; u64 *ptr, *hdr = NULL; u64 **metadata = NULL; + u64 hdr_version; /* * sizeof(auxtrace_info_event::type) + @@ -2512,16 +2656,21 @@ int cs_etm__process_auxtrace_info(union perf_event *event, /* First the global part */ ptr = (u64 *) auxtrace_info->priv; - /* Look for version '0' of the header */ - if (ptr[0] != 0) + /* Look for version of the header */ + hdr_version = ptr[0]; + if (hdr_version > CS_HEADER_CURRENT_VERSION) { + /* print routine will print an error on bad version */ + if (dump_trace) + cs_etm__print_auxtrace_info(auxtrace_info->priv, 0); return -EINVAL; + } - hdr = zalloc(sizeof(*hdr) * CS_HEADER_VERSION_0_MAX); + hdr = zalloc(sizeof(*hdr) * CS_HEADER_VERSION_MAX); if (!hdr) return -ENOMEM; /* Extract header information - see cs-etm.h for format */ - for (i = 0; i < CS_HEADER_VERSION_0_MAX; i++) + for (i = 0; i < CS_HEADER_VERSION_MAX; i++) hdr[i] = ptr[i]; num_cpu = hdr[CS_PMU_TYPE_CPUS] & 0xffffffff; pmu_type = (unsigned int) ((hdr[CS_PMU_TYPE_CPUS] >> 32) & @@ -2552,35 +2701,31 @@ int cs_etm__process_auxtrace_info(union perf_event *event, */ for (j = 0; j < num_cpu; j++) { if (ptr[i] == __perf_cs_etmv3_magic) { - metadata[j] = zalloc(sizeof(*metadata[j]) * - CS_ETM_PRIV_MAX); - if (!metadata[j]) { - err = -ENOMEM; - goto err_free_metadata; - } - for (k = 0; k < CS_ETM_PRIV_MAX; k++) - metadata[j][k] = ptr[i + k]; + metadata[j] = + cs_etm__create_meta_blk(ptr, &i, + CS_ETM_PRIV_MAX, + CS_ETM_NR_TRC_PARAMS_V0); /* The traceID is our handle */ - idx = metadata[j][CS_ETM_ETMTRACEIDR]; - i += CS_ETM_PRIV_MAX; + trcidr_idx = CS_ETM_ETMTRACEIDR; + } else if (ptr[i] == __perf_cs_etmv4_magic) { - metadata[j] = zalloc(sizeof(*metadata[j]) * - CS_ETMV4_PRIV_MAX); - if (!metadata[j]) { - err = -ENOMEM; - goto err_free_metadata; - } - for (k = 0; k < CS_ETMV4_PRIV_MAX; k++) - metadata[j][k] = ptr[i + k]; + metadata[j] = + cs_etm__create_meta_blk(ptr, &i, + CS_ETMV4_PRIV_MAX, + CS_ETMV4_NR_TRC_PARAMS_V0); /* The traceID is our handle */ - idx = metadata[j][CS_ETMV4_TRCTRACEIDR]; - i += CS_ETMV4_PRIV_MAX; + trcidr_idx = CS_ETMV4_TRCTRACEIDR; + } + + if (!metadata[j]) { + err = -ENOMEM; + goto err_free_metadata; } /* Get an RB node for this CPU */ - inode = intlist__findnew(traceid_list, idx); + inode = intlist__findnew(traceid_list, metadata[j][trcidr_idx]); /* Something went wrong, no need to continue */ if (!inode) { @@ -2601,7 +2746,7 @@ int cs_etm__process_auxtrace_info(union perf_event *event, } /* - * Each of CS_HEADER_VERSION_0_MAX, CS_ETM_PRIV_MAX and + * Each of CS_HEADER_VERSION_MAX, CS_ETM_PRIV_MAX and * CS_ETMV4_PRIV_MAX mark how many double words are in the * global metadata, and each cpu's metadata respectively. * The following tests if the correct number of double words was @@ -2703,6 +2848,12 @@ err_free_traceid_list: intlist__delete(traceid_list); err_free_hdr: zfree(&hdr); - + /* + * At this point, as a minimum we have valid header. Dump the rest of + * the info section - the print routines will error out on structural + * issues. + */ + if (dump_trace) + cs_etm__print_auxtrace_info(auxtrace_info->priv, num_cpu); return err; } diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index 4ad925d6d799..e153d02df0de 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -17,23 +17,37 @@ struct perf_session; */ enum { /* Starting with 0x0 */ - CS_HEADER_VERSION_0, + CS_HEADER_VERSION, /* PMU->type (32 bit), total # of CPUs (32 bit) */ CS_PMU_TYPE_CPUS, CS_ETM_SNAPSHOT, - CS_HEADER_VERSION_0_MAX, + CS_HEADER_VERSION_MAX, }; +/* + * Update the version for new format. + * + * New version 1 format adds a param count to the per cpu metadata. + * This allows easy adding of new metadata parameters. + * Requires that new params always added after current ones. + * Also allows client reader to handle file versions that are different by + * checking the number of params in the file vs the number expected. + */ +#define CS_HEADER_CURRENT_VERSION 1 + /* Beginning of header common to both ETMv3 and V4 */ enum { CS_ETM_MAGIC, CS_ETM_CPU, + /* Number of trace config params in following ETM specific block */ + CS_ETM_NR_TRC_PARAMS, + CS_ETM_COMMON_BLK_MAX_V1, }; /* ETMv3/PTM metadata */ enum { /* Dynamic, configurable parameters */ - CS_ETM_ETMCR = CS_ETM_CPU + 1, + CS_ETM_ETMCR = CS_ETM_COMMON_BLK_MAX_V1, CS_ETM_ETMTRACEIDR, /* RO, taken from sysFS */ CS_ETM_ETMCCER, @@ -41,10 +55,13 @@ enum { CS_ETM_PRIV_MAX, }; +/* define fixed version 0 length - allow new format reader to read old files. */ +#define CS_ETM_NR_TRC_PARAMS_V0 (CS_ETM_ETMIDR - CS_ETM_ETMCR + 1) + /* ETMv4 metadata */ enum { /* Dynamic, configurable parameters */ - CS_ETMV4_TRCCONFIGR = CS_ETM_CPU + 1, + CS_ETMV4_TRCCONFIGR = CS_ETM_COMMON_BLK_MAX_V1, CS_ETMV4_TRCTRACEIDR, /* RO, taken from sysFS */ CS_ETMV4_TRCIDR0, @@ -55,6 +72,9 @@ enum { CS_ETMV4_PRIV_MAX, }; +/* define fixed version 0 length - allow new format reader to read old files. */ +#define CS_ETMV4_NR_TRC_PARAMS_V0 (CS_ETMV4_TRCAUTHSTATUS - CS_ETMV4_TRCCONFIGR + 1) + /* * ETMv3 exception encoding number: * See Embedded Trace Macrocell spcification (ARM IHI 0014Q) @@ -162,7 +182,7 @@ struct cs_etm_packet_queue { #define BMVAL(val, lsb, msb) ((val & GENMASK(msb, lsb)) >> lsb) -#define CS_ETM_HEADER_SIZE (CS_HEADER_VERSION_0_MAX * sizeof(u64)) +#define CS_ETM_HEADER_SIZE (CS_HEADER_VERSION_MAX * sizeof(u64)) #define __perf_cs_etmv3_magic 0x3030303030303030ULL #define __perf_cs_etmv4_magic 0x4040404040404040ULL -- cgit v1.2.3 From 2bb4ccbd95d7fbf58540c8d3d55cbabc8fb95e28 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 24 Feb 2021 09:48:31 -0700 Subject: tools headers UAPI: Update tools' copy of linux/coresight-pmu.h To get the changes in the commit: "coresight: etm-perf: Clarify comment on perf options". Signed-off-by: Leo Yan Reviewed-by: Suzuki K Poulose Reviewed-by: Mathieu Poirier Cc: Mike Leach Link: https://lore.kernel.org/r/20210213113220.292229-2-leo.yan@linaro.org Link: https://lore.kernel.org/r/20210224164835.3497311-3-mathieu.poirier@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/coresight-pmu.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/coresight-pmu.h b/tools/include/linux/coresight-pmu.h index b0e35eec6499..5dc47cfdcf07 100644 --- a/tools/include/linux/coresight-pmu.h +++ b/tools/include/linux/coresight-pmu.h @@ -10,11 +10,18 @@ #define CORESIGHT_ETM_PMU_NAME "cs_etm" #define CORESIGHT_ETM_PMU_SEED 0x10 -/* ETMv3.5/PTM's ETMCR config bit */ -#define ETM_OPT_CYCACC 12 -#define ETM_OPT_CTXTID 14 -#define ETM_OPT_TS 28 -#define ETM_OPT_RETSTK 29 +/* + * Below are the definition of bit offsets for perf option, and works as + * arbitrary values for all ETM versions. + * + * Most of them are orignally from ETMv3.5/PTM's ETMCR config, therefore, + * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and + * directly use below macros as config bits. + */ +#define ETM_OPT_CYCACC 12 +#define ETM_OPT_CTXTID 14 +#define ETM_OPT_TS 28 +#define ETM_OPT_RETSTK 29 /* ETMv4 CONFIGR programming bits for the ETM OPTs */ #define ETM4_CFG_BIT_CYCACC 4 -- cgit v1.2.3 From 8c559e8d68630d64d932bada633705f6551427df Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 24 Feb 2021 09:48:32 -0700 Subject: perf cs-etm: Fix bitmap for option When set option with macros ETM_OPT_CTXTID and ETM_OPT_TS, it wrongly takes these two values (14 and 28 prespectively) as bit masks, but actually both are the offset for bits. But this doesn't lead to further failure due to the AND logic operation will be always true for ETM_OPT_CTXTID / ETM_OPT_TS. This patch uses the BIT() macro for option bits, thus it can request the correct bitmaps for "contextid" and "timestamp" when calling cs_etm_set_option(). Signed-off-by: Suzuki K Poulose Reviewed-by: Mathieu Poirier Reviewed-by: Mike Leach Link: https://lore.kernel.org/r/20210213113220.292229-3-leo.yan@linaro.org Link: https://lore.kernel.org/r/20210224164835.3497311-4-mathieu.poirier@linaro.org [Extract the change as a separate patch for easier review] Signed-off-by: Leo Yan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index b0470f2a955a..5b2bb7fc5ee1 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -169,17 +169,17 @@ static int cs_etm_set_option(struct auxtrace_record *itr, !cpu_map__has(online_cpus, i)) continue; - if (option & ETM_OPT_CTXTID) { + if (option & BIT(ETM_OPT_CTXTID)) { err = cs_etm_set_context_id(itr, evsel, i); if (err) goto out; } - if (option & ETM_OPT_TS) { + if (option & BIT(ETM_OPT_TS)) { err = cs_etm_set_timestamp(itr, evsel, i); if (err) goto out; } - if (option & ~(ETM_OPT_CTXTID | ETM_OPT_TS)) + if (option & ~(BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_TS))) /* Nothing else is currently supported */ goto out; } @@ -406,7 +406,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, evsel__set_sample_bit(cs_etm_evsel, CPU); err = cs_etm_set_option(itr, cs_etm_evsel, - ETM_OPT_CTXTID | ETM_OPT_TS); + BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_TS)); if (err) goto out; } -- cgit v1.2.3 From 30cb76aabfb4deab4ffef54882f86df319b4d862 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 24 Feb 2021 09:48:33 -0700 Subject: perf cs-etm: Support PID tracing in config If the kernel is running at EL2, the pid of a task is exposed via VMID instead of the CONTEXTID. Add support for this in the perf tool. This patch respects user setting if user has specified any configs from "contextid", "contextid1" or "contextid2"; otherwise, it dynamically sets config based on PMU format "contextid". Signed-off-by: Suzuki K Poulose Co-developed-by: Leo Yan Signed-off-by: Leo Yan Reviewed-by: Mike Leach Reviewed-by: Mathieu Poirier Cc: Al Grant Link: https://lore.kernel.org/r/20210213113220.292229-4-leo.yan@linaro.org Link: https://lore.kernel.org/r/20210224164835.3497311-5-mathieu.poirier@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/coresight-pmu.h | 3 ++ tools/perf/arch/arm/util/cs-etm.c | 61 +++++++++++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/coresight-pmu.h b/tools/include/linux/coresight-pmu.h index 5dc47cfdcf07..4ac5c081af93 100644 --- a/tools/include/linux/coresight-pmu.h +++ b/tools/include/linux/coresight-pmu.h @@ -20,14 +20,17 @@ */ #define ETM_OPT_CYCACC 12 #define ETM_OPT_CTXTID 14 +#define ETM_OPT_CTXTID2 15 #define ETM_OPT_TS 28 #define ETM_OPT_RETSTK 29 /* ETMv4 CONFIGR programming bits for the ETM OPTs */ #define ETM4_CFG_BIT_CYCACC 4 #define ETM4_CFG_BIT_CTXTID 6 +#define ETM4_CFG_BIT_VMID 7 #define ETM4_CFG_BIT_TS 11 #define ETM4_CFG_BIT_RETSTK 12 +#define ETM4_CFG_BIT_VMID_OPT 15 static inline int coresight_get_trace_id(int cpu) { diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 5b2bb7fc5ee1..911c7f2b3581 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -67,6 +67,7 @@ static int cs_etm_set_context_id(struct auxtrace_record *itr, char path[PATH_MAX]; int err = -EINVAL; u32 val; + u64 contextid; ptr = container_of(itr, struct cs_etm_recording, itr); cs_etm_pmu = ptr->cs_etm_pmu; @@ -86,25 +87,59 @@ static int cs_etm_set_context_id(struct auxtrace_record *itr, goto out; } + /* User has configured for PID tracing, respects it. */ + contextid = evsel->core.attr.config & + (BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_CTXTID2)); + /* - * TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID tracing - * is supported: - * 0b00000 Context ID tracing is not supported. - * 0b00100 Maximum of 32-bit Context ID size. - * All other values are reserved. + * If user doesn't configure the contextid format, parse PMU format and + * enable PID tracing according to the "contextid" format bits: + * + * If bit ETM_OPT_CTXTID is set, trace CONTEXTIDR_EL1; + * If bit ETM_OPT_CTXTID2 is set, trace CONTEXTIDR_EL2. */ - val = BMVAL(val, 5, 9); - if (!val || val != 0x4) { - err = -EINVAL; - goto out; + if (!contextid) + contextid = perf_pmu__format_bits(&cs_etm_pmu->format, + "contextid"); + + if (contextid & BIT(ETM_OPT_CTXTID)) { + /* + * TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID + * tracing is supported: + * 0b00000 Context ID tracing is not supported. + * 0b00100 Maximum of 32-bit Context ID size. + * All other values are reserved. + */ + val = BMVAL(val, 5, 9); + if (!val || val != 0x4) { + pr_err("%s: CONTEXTIDR_EL1 isn't supported\n", + CORESIGHT_ETM_PMU_NAME); + err = -EINVAL; + goto out; + } + } + + if (contextid & BIT(ETM_OPT_CTXTID2)) { + /* + * TRCIDR2.VMIDOPT[30:29] != 0 and + * TRCIDR2.VMIDSIZE[14:10] == 0b00100 (32bit virtual contextid) + * We can't support CONTEXTIDR in VMID if the size of the + * virtual context id is < 32bit. + * Any value of VMIDSIZE >= 4 (i.e, > 32bit) is fine for us. + */ + if (!BMVAL(val, 29, 30) || BMVAL(val, 10, 14) < 4) { + pr_err("%s: CONTEXTIDR_EL2 isn't supported\n", + CORESIGHT_ETM_PMU_NAME); + err = -EINVAL; + goto out; + } } /* All good, let the kernel know */ - evsel->core.attr.config |= (1 << ETM_OPT_CTXTID); + evsel->core.attr.config |= contextid; err = 0; out: - return err; } @@ -485,7 +520,9 @@ static u64 cs_etmv4_get_config(struct auxtrace_record *itr) config |= BIT(ETM4_CFG_BIT_TS); if (config_opts & BIT(ETM_OPT_RETSTK)) config |= BIT(ETM4_CFG_BIT_RETSTK); - + if (config_opts & BIT(ETM_OPT_CTXTID2)) + config |= BIT(ETM4_CFG_BIT_VMID) | + BIT(ETM4_CFG_BIT_VMID_OPT); return config; } -- cgit v1.2.3 From 47f0d94c203751ddcfdb296fcf15df20fffcef0c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 24 Feb 2021 09:48:34 -0700 Subject: perf cs-etm: Add helper cs_etm__get_pid_fmt() This patch adds helper function cs_etm__get_pid_fmt(), by passing parameter "traceID", it returns the PID format. Signed-off-by: Leo Yan Reviewed-by: Mathieu Poirier Reviewed-by: Suzuki K Poulose Cc: Mike Leach Link: https://lore.kernel.org/r/20210213113220.292229-5-leo.yan@linaro.org Link: https://lore.kernel.org/r/20210224164835.3497311-6-mathieu.poirier@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm.c | 42 ++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/cs-etm.h | 1 + 2 files changed, 43 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index ee32d023e9bd..9ac80fc23c58 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -156,6 +157,47 @@ int cs_etm__get_cpu(u8 trace_chan_id, int *cpu) return 0; } +/* + * The returned PID format is presented by two bits: + * + * Bit ETM_OPT_CTXTID: CONTEXTIDR or CONTEXTIDR_EL1 is traced; + * Bit ETM_OPT_CTXTID2: CONTEXTIDR_EL2 is traced. + * + * It's possible that the two bits ETM_OPT_CTXTID and ETM_OPT_CTXTID2 + * are enabled at the same time when the session runs on an EL2 kernel. + * This means the CONTEXTIDR_EL1 and CONTEXTIDR_EL2 both will be + * recorded in the trace data, the tool will selectively use + * CONTEXTIDR_EL2 as PID. + */ +int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt) +{ + struct int_node *inode; + u64 *metadata, val; + + inode = intlist__find(traceid_list, trace_chan_id); + if (!inode) + return -EINVAL; + + metadata = inode->priv; + + if (metadata[CS_ETM_MAGIC] == __perf_cs_etmv3_magic) { + val = metadata[CS_ETM_ETMCR]; + /* CONTEXTIDR is traced */ + if (val & BIT(ETM_OPT_CTXTID)) + *pid_fmt = BIT(ETM_OPT_CTXTID); + } else { + val = metadata[CS_ETMV4_TRCCONFIGR]; + /* CONTEXTIDR_EL2 is traced */ + if (val & (BIT(ETM4_CFG_BIT_VMID) | BIT(ETM4_CFG_BIT_VMID_OPT))) + *pid_fmt = BIT(ETM_OPT_CTXTID2); + /* CONTEXTIDR_EL1 is traced */ + else if (val & BIT(ETM4_CFG_BIT_CTXTID)) + *pid_fmt = BIT(ETM_OPT_CTXTID); + } + + return 0; +} + void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq, u8 trace_chan_id) { diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index e153d02df0de..85ed11e9d2a7 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -193,6 +193,7 @@ struct cs_etm_packet_queue { int cs_etm__process_auxtrace_info(union perf_event *event, struct perf_session *session); int cs_etm__get_cpu(u8 trace_chan_id, int *cpu); +int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt); int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id); bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq); -- cgit v1.2.3 From 8e1488a46dcf73b1f1916d95421e303dbf773fb4 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 24 Feb 2021 09:48:35 -0700 Subject: perf cs-etm: Detect pid in VMID for kernel running at EL2 The PID of the task could be traced as VMID when the kernel is running at EL2. Teach the decoder to look for VMID when the CONTEXTIDR (Arm32) or CONTEXTIDR_EL1 (Arm64) is invalid but we have a valid VMID. Signed-off-by: Suzuki K Poulose Co-developed-by: Leo Yan Reviewed-by: Mathieu Poirier Cc: Al Grant Cc: Mike Leach Link: https://lore.kernel.org/r/20210213113220.292229-6-leo.yan@linaro.org Link: https://lore.kernel.org/r/20210224164835.3497311-7-mathieu.poirier@linaro.org Signed-off-by: Leo Yan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 38 ++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index 3f4bc4050477..4052c9ce6e2f 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -6,6 +6,7 @@ * Author: Mathieu Poirier */ +#include #include #include #include @@ -491,13 +492,42 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq, const ocsd_generic_trace_elem *elem, const uint8_t trace_chan_id) { - pid_t tid; + pid_t tid = -1; + static u64 pid_fmt; + int ret; - /* Ignore PE_CONTEXT packets that don't have a valid contextID */ - if (!elem->context.ctxt_id_valid) + /* + * As all the ETMs run at the same exception level, the system should + * have the same PID format crossing CPUs. So cache the PID format + * and reuse it for sequential decoding. + */ + if (!pid_fmt) { + ret = cs_etm__get_pid_fmt(trace_chan_id, &pid_fmt); + if (ret) + return OCSD_RESP_FATAL_SYS_ERR; + } + + /* + * Process the PE_CONTEXT packets if we have a valid contextID or VMID. + * If the kernel is running at EL2, the PID is traced in CONTEXTIDR_EL2 + * as VMID, Bit ETM_OPT_CTXTID2 is set in this case. + */ + switch (pid_fmt) { + case BIT(ETM_OPT_CTXTID): + if (elem->context.ctxt_id_valid) + tid = elem->context.context_id; + break; + case BIT(ETM_OPT_CTXTID2): + if (elem->context.vmid_valid) + tid = elem->context.vmid; + break; + default: + break; + } + + if (tid == -1) return OCSD_RESP_CONT; - tid = elem->context.context_id; if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id)) return OCSD_RESP_FATAL_SYS_ERR; -- cgit v1.2.3 From 303dcc25b5c782547eb13b9f29426de843dd6f34 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 2 Mar 2021 16:40:10 -0800 Subject: tools/runqslower: Allow substituting custom vmlinux.h for the build Just like was done for bpftool and selftests in ec23eb705620 ("tools/bpftool: Allow substituting custom vmlinux.h for the build") and ca4db6389d61 ("selftests/bpf: Allow substituting custom vmlinux.h for selftests build"), allow to provide pre-generated vmlinux.h for runqslower build. Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210303004010.653954-1-andrii@kernel.org --- tools/bpf/runqslower/Makefile | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index c96ba90c6f01..3818ec511fd2 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -69,12 +69,16 @@ $(OUTPUT) $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT): $(QUIET_MKDIR)mkdir -p $@ $(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) +ifeq ($(VMLINUX_H),) $(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \ echo "Couldn't find kernel BTF; set VMLINUX_BTF to" \ "specify its location." >&2; \ exit 1;\ fi $(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ +else + $(Q)cp "$(VMLINUX_H)" $@ +endif $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) $@ -- cgit v1.2.3 From 81db00a4ea625c88925b00df9673472bd1b8c77f Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 25 Feb 2021 14:07:00 +0800 Subject: perf metric: Remove unneeded semicolon Fix the following coccicheck warnings: ./tools/perf/tests/parse-metric.c:101:2-3: Unneeded semicolon. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/1614233220-67326-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-metric.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c index 6dc1db1626ad..55bf52e588be 100644 --- a/tools/perf/tests/parse-metric.c +++ b/tools/perf/tests/parse-metric.c @@ -98,7 +98,7 @@ static u64 find_value(const char *name, struct value *values) if (!strcmp(name, v->event)) return v->val; v++; - }; + } return 0; } -- cgit v1.2.3 From 2e989f82181cd414599c6afbc5a666356a3d1dd1 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Fri, 19 Feb 2021 15:00:05 +0800 Subject: perf report: Create option to disable raw event ordering Warning "dso not found" is reported when using "perf report -D". 66702781413407 0x32c0 [0x30]: PERF_RECORD_SAMPLE(IP, 0x2): 28177/28177: 0x55e493e00563 period: 106578 addr: 0 ... thread: perf:28177 ...... dso: 66702727832429 0x9dd8 [0x38]: PERF_RECORD_COMM exec: triad_loop:28177/28177 The PERF_RECORD_SAMPLE event (timestamp: 66702781413407) should be after the PERF_RECORD_COMM event (timestamp: 66702727832429), but it's early processed. So for most of cases, it makes sense to keep the event ordered even for dump mode. But it would be also useful to disable ordered_events for reporting raw dump to see events as they are stored in the perf.data file. So now, set ordered_events by default to true and add a new option 'disable-order' to disable it. For example, perf report -D --disable-order Fixes: 977f739b7126b ("perf report: Disable ordered_events for raw dump") Signed-off-by: Jin Yao Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jin Yao Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210219070005.12397-1-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 3 +++ tools/perf/builtin-report.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index f546b5e9db05..87112e8d904e 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -224,6 +224,9 @@ OPTIONS --dump-raw-trace:: Dump raw trace in ASCII. +--disable-order:: + Disable raw trace ordering. + -g:: --call-graph=:: Display call chains using type, min percent threshold, print limit, diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 2a845d6cac09..0d65c98794a8 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -84,6 +84,7 @@ struct report { bool nonany_branch_mode; bool group_set; bool stitch_lbr; + bool disable_order; int max_stack; struct perf_read_values show_threads_values; struct annotation_options annotation_opts; @@ -1296,6 +1297,8 @@ int cmd_report(int argc, const char **argv) OPTS_EVSWITCH(&report.evswitch), OPT_BOOLEAN(0, "total-cycles", &report.total_cycles_mode, "Sort all blocks by 'Sampled Cycles%'"), + OPT_BOOLEAN(0, "disable-order", &report.disable_order, + "Disable raw trace ordering"), OPT_END() }; struct perf_data data = { @@ -1329,7 +1332,7 @@ int cmd_report(int argc, const char **argv) if (report.mmaps_mode) report.tasks_mode = true; - if (dump_trace) + if (dump_trace && report.disable_order) report.tool.ordered_events = false; if (quiet) -- cgit v1.2.3 From 86a35af628e539f7ae9796f1984761e8d3232ac0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 14:38:10 -0800 Subject: selftests/bpf: Add a verifier scale test with unknown bounded loop The original bcc pull request https://github.com/iovisor/bcc/pull/3270 exposed a verifier failure with Clang 12/13 while Clang 4 works fine. Further investigation exposed two issues: Issue 1: LLVM may generate code which uses less refined value. The issue is fixed in LLVM patch: https://reviews.llvm.org/D97479 Issue 2: Spills with initial value 0 are marked as precise which makes later state pruning less effective. This is my rough initial analysis and further investigation is needed to find how to improve verifier pruning in such cases. With the above LLVM patch, for the new loop6.c test, which has smaller loop bound compared to original test, I got: $ test_progs -s -n 10/16 ... stack depth 64 processed 390735 insns (limit 1000000) max_states_per_insn 87 total_states 8658 peak_states 964 mark_read 6 #10/16 loop6.o:OK Use the original loop bound, i.e., commenting out "#define WORKAROUND", I got: $ test_progs -s -n 10/16 ... BPF program is too large. Processed 1000001 insn stack depth 64 processed 1000001 insns (limit 1000000) max_states_per_insn 91 total_states 23176 peak_states 5069 mark_read 6 ... #10/16 loop6.o:FAIL The purpose of this patch is to provide a regression test for the above LLVM fix and also provide a test case for further analyzing the verifier pruning issue. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Cc: Zhenwei Pi Link: https://lore.kernel.org/bpf/20210226223810.236472-1-yhs@fb.com --- tools/testing/selftests/bpf/README.rst | 39 +++++++++ .../selftests/bpf/prog_tests/bpf_verif_scale.c | 1 + tools/testing/selftests/bpf/progs/loop6.c | 99 ++++++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/loop6.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index fd148b8410fa..dbc8f6cc5c67 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -111,6 +111,45 @@ available in 10.0.1. The patch is available in llvm 11.0.0 trunk. __ https://reviews.llvm.org/D78466 +bpf_verif_scale/loop6.o test failure with Clang 12 +================================================== + +With Clang 12, the following bpf_verif_scale test failed: + * ``bpf_verif_scale/loop6.o`` + +The verifier output looks like + +.. code-block:: c + + R1 type=ctx expected=fp + The sequence of 8193 jumps is too complex. + +The reason is compiler generating the following code + +.. code-block:: c + + ; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) { + 14: 16 05 40 00 00 00 00 00 if w5 == 0 goto +64 + 15: bc 51 00 00 00 00 00 00 w1 = w5 + 16: 04 01 00 00 ff ff ff ff w1 += -1 + 17: 67 05 00 00 20 00 00 00 r5 <<= 32 + 18: 77 05 00 00 20 00 00 00 r5 >>= 32 + 19: a6 01 01 00 05 00 00 00 if w1 < 5 goto +1 + 20: b7 05 00 00 06 00 00 00 r5 = 6 + 00000000000000a8 : + 21: b7 02 00 00 00 00 00 00 r2 = 0 + 22: b7 01 00 00 00 00 00 00 r1 = 0 + ; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) { + 23: 7b 1a e0 ff 00 00 00 00 *(u64 *)(r10 - 32) = r1 + 24: 7b 5a c0 ff 00 00 00 00 *(u64 *)(r10 - 64) = r5 + +Note that insn #15 has w1 = w5 and w1 is refined later but +r5(w5) is eventually saved on stack at insn #24 for later use. +This cause later verifier failure. The bug has been `fixed`__ in +Clang 13. + +__ https://reviews.llvm.org/D97479 + BPF CO-RE-based tests and Clang version ======================================= diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index e698ee6bb6c2..3d002c245d2b 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -76,6 +76,7 @@ void test_bpf_verif_scale(void) { "loop2.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, { "loop4.o", BPF_PROG_TYPE_SCHED_CLS }, { "loop5.o", BPF_PROG_TYPE_SCHED_CLS }, + { "loop6.o", BPF_PROG_TYPE_KPROBE }, /* partial unroll. 19k insn in a loop. * Total program size 20.8k insn. diff --git a/tools/testing/selftests/bpf/progs/loop6.c b/tools/testing/selftests/bpf/progs/loop6.c new file mode 100644 index 000000000000..2a7141ac1656 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/loop6.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +/* typically virtio scsi has max SGs of 6 */ +#define VIRTIO_MAX_SGS 6 + +/* Verifier will fail with SG_MAX = 128. The failure can be + * workarounded with a smaller SG_MAX, e.g. 10. + */ +#define WORKAROUND +#ifdef WORKAROUND +#define SG_MAX 10 +#else +/* typically virtio blk has max SEG of 128 */ +#define SG_MAX 128 +#endif + +#define SG_CHAIN 0x01UL +#define SG_END 0x02UL + +struct scatterlist { + unsigned long page_link; + unsigned int offset; + unsigned int length; +}; + +#define sg_is_chain(sg) ((sg)->page_link & SG_CHAIN) +#define sg_is_last(sg) ((sg)->page_link & SG_END) +#define sg_chain_ptr(sg) \ + ((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END))) + +static inline struct scatterlist *__sg_next(struct scatterlist *sgp) +{ + struct scatterlist sg; + + bpf_probe_read_kernel(&sg, sizeof(sg), sgp); + if (sg_is_last(&sg)) + return NULL; + + sgp++; + + bpf_probe_read_kernel(&sg, sizeof(sg), sgp); + if (sg_is_chain(&sg)) + sgp = sg_chain_ptr(&sg); + + return sgp; +} + +static inline struct scatterlist *get_sgp(struct scatterlist **sgs, int i) +{ + struct scatterlist *sgp; + + bpf_probe_read_kernel(&sgp, sizeof(sgp), sgs + i); + return sgp; +} + +int config = 0; +int result = 0; + +SEC("kprobe/virtqueue_add_sgs") +BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, + unsigned int out_sgs, unsigned int in_sgs) +{ + struct scatterlist *sgp = NULL; + __u64 length1 = 0, length2 = 0; + unsigned int i, n, len; + + if (config != 0) + return 0; + + for (i = 0; (i < VIRTIO_MAX_SGS) && (i < out_sgs); i++) { + for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); + sgp = __sg_next(sgp)) { + bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + length1 += len; + n++; + } + } + + for (i = 0; (i < VIRTIO_MAX_SGS) && (i < in_sgs); i++) { + for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); + sgp = __sg_next(sgp)) { + bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + length2 += len; + n++; + } + } + + config = 1; + result = length2 - length1; + return 0; +} -- cgit v1.2.3 From 8fd886911a6a99acf4a8facf619a2e7b5225be78 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:47 +0100 Subject: bpf: Add BTF_KIND_FLOAT to uapi Add a new kind value and expand the kind bitfield. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-2-iii@linux.ibm.com --- include/uapi/linux/btf.h | 5 +++-- tools/include/uapi/linux/btf.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 5a667107ad2c..d27b1708efe9 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -52,7 +52,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) @@ -72,7 +72,8 @@ struct btf_type { #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_MAX BTF_KIND_DATASEC +#define BTF_KIND_FLOAT 16 /* Floating point */ +#define BTF_KIND_MAX BTF_KIND_FLOAT #define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 5a667107ad2c..d27b1708efe9 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -52,7 +52,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) @@ -72,7 +72,8 @@ struct btf_type { #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_MAX BTF_KIND_DATASEC +#define BTF_KIND_FLOAT 16 /* Floating point */ +#define BTF_KIND_MAX BTF_KIND_FLOAT #define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately -- cgit v1.2.3 From 1b1ce92b24331b569a444858fc487a1ca19dc778 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:48 +0100 Subject: libbpf: Fix whitespace in btf_add_composite() comment Remove trailing space. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-3-iii@linux.ibm.com --- tools/lib/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index d9c10830d749..0797ab714830 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1883,7 +1883,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 * - *byte_sz* - size of the struct, in bytes; * * Struct initially has no fields in it. Fields can be added by - * btf__add_field() right after btf__add_struct() succeeds. + * btf__add_field() right after btf__add_struct() succeeds. * * Returns: * - >0, type ID of newly added BTF type; -- cgit v1.2.3 From 22541a9eeb0d968c133aaebd95fa59da3208e705 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:49 +0100 Subject: libbpf: Add BTF_KIND_FLOAT support The logic follows that of BTF_KIND_INT most of the time. Sanitization replaces BTF_KIND_FLOATs with equally-sized empty BTF_KIND_STRUCTs on older kernels, for example, the following: [4] FLOAT 'float' size=4 becomes the following: [4] STRUCT '(anon)' size=4 vlen=0 With dwarves patch [1] and this patch, the older kernels, which were failing with the floating-point-related errors, will now start working correctly. [1] https://github.com/iii-i/dwarves/commit/btf-kind-float-v2 Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226202256.116518-4-iii@linux.ibm.com --- tools/lib/bpf/btf.c | 49 +++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/btf.h | 6 +++++ tools/lib/bpf/btf_dump.c | 4 ++++ tools/lib/bpf/libbpf.c | 29 +++++++++++++++++++++++- tools/lib/bpf/libbpf.map | 5 +++++ tools/lib/bpf/libbpf_internal.h | 2 ++ 6 files changed, 94 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 0797ab714830..3aa58f2ac183 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -291,6 +291,7 @@ static int btf_type_size(const struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return base_size; case BTF_KIND_INT: return base_size + sizeof(__u32); @@ -338,6 +339,7 @@ static int btf_bswap_type_rest(struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return 0; case BTF_KIND_INT: *(__u32 *)(t + 1) = bswap_32(*(__u32 *)(t + 1)); @@ -578,6 +580,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id) case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_DATASEC: + case BTF_KIND_FLOAT: size = t->size; goto done; case BTF_KIND_PTR: @@ -621,6 +624,7 @@ int btf__align_of(const struct btf *btf, __u32 id) switch (kind) { case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_FLOAT: return min(btf_ptr_sz(btf), (size_t)t->size); case BTF_KIND_PTR: return btf_ptr_sz(btf); @@ -1756,6 +1760,47 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding return btf_commit_type(btf, sz); } +/* + * Append new BTF_KIND_FLOAT type with: + * - *name* - non-empty, non-NULL type name; + * - *sz* - size of the type, in bytes; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_float(struct btf *btf, const char *name, size_t byte_sz) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return -EINVAL; + + /* byte_sz must be one of the explicitly allowed values */ + if (byte_sz != 2 && byte_sz != 4 && byte_sz != 8 && byte_sz != 12 && + byte_sz != 16) + return -EINVAL; + + if (btf_ensure_modifiable(btf)) + return -ENOMEM; + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return -ENOMEM; + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_FLOAT, 0, 0); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + /* it's completely legal to append BTF types with type IDs pointing forward to * types that haven't been appended yet, so we only make sure that id looks * sane, we can't guarantee that ID will always be valid @@ -3626,6 +3671,7 @@ static int btf_dedup_prep(struct btf_dedup *d) case BTF_KIND_FWD: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: h = btf_hash_common(t); break; case BTF_KIND_INT: @@ -3722,6 +3768,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) break; case BTF_KIND_FWD: + case BTF_KIND_FLOAT: h = btf_hash_common(t); for_each_dedup_cand(d, hash_entry, h) { cand_id = (__u32)(long)hash_entry->value; @@ -3983,6 +4030,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, return btf_compat_enum(cand_type, canon_type); case BTF_KIND_FWD: + case BTF_KIND_FLOAT: return btf_equal_common(cand_type, canon_type); case BTF_KIND_CONST: @@ -4479,6 +4527,7 @@ static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id) switch (btf_kind(t)) { case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_FLOAT: break; case BTF_KIND_FWD: diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 1237bcd1dd17..029a9cfc8c2d 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -95,6 +95,7 @@ LIBBPF_API int btf__find_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding); +LIBBPF_API int btf__add_float(struct btf *btf, const char *name, size_t byte_sz); LIBBPF_API int btf__add_ptr(struct btf *btf, int ref_type_id); LIBBPF_API int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 nr_elems); @@ -294,6 +295,11 @@ static inline bool btf_is_datasec(const struct btf_type *t) return btf_kind(t) == BTF_KIND_DATASEC; } +static inline bool btf_is_float(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FLOAT; +} + static inline __u8 btf_int_encoding(const struct btf_type *t) { return BTF_INT_ENCODING(*(__u32 *)(t + 1)); diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 2f9d685bd522..5e957fcceee6 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -279,6 +279,7 @@ static int btf_dump_mark_referenced(struct btf_dump *d) case BTF_KIND_INT: case BTF_KIND_ENUM: case BTF_KIND_FWD: + case BTF_KIND_FLOAT: break; case BTF_KIND_VOLATILE: @@ -453,6 +454,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) switch (btf_kind(t)) { case BTF_KIND_INT: + case BTF_KIND_FLOAT: tstate->order_state = ORDERED; return 0; @@ -1133,6 +1135,7 @@ skip_mod: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: goto done; default: pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n", @@ -1247,6 +1250,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, switch (kind) { case BTF_KIND_INT: + case BTF_KIND_FLOAT: btf_dump_emit_mods(d, decls); name = btf_name_of(d, t->name_off); btf_dump_printf(d, "%s", name); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 62d9ed56b081..2f351d3ad3e7 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -178,6 +178,8 @@ enum kern_feature_id { FEAT_PROG_BIND_MAP, /* Kernel support for module BTFs */ FEAT_MODULE_BTF, + /* BTF_KIND_FLOAT support */ + FEAT_BTF_FLOAT, __FEAT_CNT, }; @@ -1946,6 +1948,7 @@ static const char *btf_kind_str(const struct btf_type *t) case BTF_KIND_FUNC_PROTO: return "func_proto"; case BTF_KIND_VAR: return "var"; case BTF_KIND_DATASEC: return "datasec"; + case BTF_KIND_FLOAT: return "float"; default: return "unknown"; } } @@ -2395,15 +2398,17 @@ static bool btf_needs_sanitization(struct bpf_object *obj) { bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC); bool has_datasec = kernel_supports(FEAT_BTF_DATASEC); + bool has_float = kernel_supports(FEAT_BTF_FLOAT); bool has_func = kernel_supports(FEAT_BTF_FUNC); - return !has_func || !has_datasec || !has_func_global; + return !has_func || !has_datasec || !has_func_global || !has_float; } static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) { bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC); bool has_datasec = kernel_supports(FEAT_BTF_DATASEC); + bool has_float = kernel_supports(FEAT_BTF_FLOAT); bool has_func = kernel_supports(FEAT_BTF_FUNC); struct btf_type *t; int i, j, vlen; @@ -2456,6 +2461,13 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) } else if (!has_func_global && btf_is_func(t)) { /* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */ t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0); + } else if (!has_float && btf_is_float(t)) { + /* replace FLOAT with an equally-sized empty STRUCT; + * since C compilers do not accept e.g. "float" as a + * valid struct name, make it anonymous + */ + t->name_off = 0; + t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0); } } } @@ -3927,6 +3939,18 @@ static int probe_kern_btf_datasec(void) strs, sizeof(strs))); } +static int probe_kern_btf_float(void) +{ + static const char strs[] = "\0float"; + __u32 types[] = { + /* float */ + BTF_TYPE_FLOAT_ENC(1, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + static int probe_kern_array_mmap(void) { struct bpf_create_map_attr attr = { @@ -4106,6 +4130,9 @@ static struct kern_feature_desc { [FEAT_MODULE_BTF] = { "module BTF support", probe_module_btf, }, + [FEAT_BTF_FLOAT] = { + "BTF_KIND_FLOAT support", probe_kern_btf_float, + }, }; static bool kernel_supports(enum kern_feature_id feat_id) diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 1c0fd2dd233a..ec898f464ab9 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -350,3 +350,8 @@ LIBBPF_0.3.0 { xsk_setup_xdp_prog; xsk_socket__update_xskmap; } LIBBPF_0.2.0; + +LIBBPF_0.4.0 { + global: + btf__add_float; +} LIBBPF_0.3.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 969d0ac592ba..343f6eb05637 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -31,6 +31,8 @@ #define BTF_MEMBER_ENC(name, type, bits_offset) (name), (type), (bits_offset) #define BTF_PARAM_ENC(name, type) (name), (type) #define BTF_VAR_SECINFO_ENC(type, offset, size) (type), (offset), (size) +#define BTF_TYPE_FLOAT_ENC(name, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz) #ifndef likely #define likely(x) __builtin_expect(!!(x), 1) -- cgit v1.2.3 From 737e0f919a8d2a313618d8ac67d50e8223bc5d74 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:50 +0100 Subject: tools/bpftool: Add BTF_KIND_FLOAT support Only dumping support needs to be adjusted, the code structure follows that of BTF_KIND_INT. Example plain and JSON outputs: [4] FLOAT 'float' size=4 {"id":4,"kind":"FLOAT","name":"float","size":4} Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-5-iii@linux.ibm.com --- tools/bpf/bpftool/btf.c | 8 ++++++++ tools/bpf/bpftool/btf_dumper.c | 1 + 2 files changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index fe9e7b3a4b50..985610c3f193 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -36,6 +36,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; struct btf_attach_table { @@ -327,6 +328,13 @@ static int dump_btf_type(const struct btf *btf, __u32 id, jsonw_end_array(w); break; } + case BTF_KIND_FLOAT: { + if (json_output) + jsonw_uint_field(w, "size", t->size); + else + printf(" size=%u", t->size); + break; + } default: break; } diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 0e9310727281..7ca54d046362 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -596,6 +596,7 @@ static int __btf_dumper_type_only(const struct btf *btf, __u32 type_id, switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_INT: case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: BTF_PRINT_ARG("%s ", btf__name_by_offset(btf, t->name_off)); break; case BTF_KIND_STRUCT: -- cgit v1.2.3 From eea154a852e827c003215f7beed3e10f05471a86 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:51 +0100 Subject: selftests/bpf: Use the 25th bit in the "invalid BTF_INFO" test The bit being checked by this test is no longer reserved after introducing BTF_KIND_FLOAT, so use the next one instead. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-6-iii@linux.ibm.com --- tools/testing/selftests/bpf/prog_tests/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 6a7ee7420701..c29406736138 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -1903,7 +1903,7 @@ static struct btf_raw_test raw_tests[] = { .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), - BTF_TYPE_ENC(0, 0x10000000, 4), + BTF_TYPE_ENC(0, 0x20000000, 4), BTF_END_RAW, }, .str_sec = "", -- cgit v1.2.3 From 7e72aad3a15c06e40e3ccd2352e5010e978f1acf Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:53 +0100 Subject: selftest/bpf: Add BTF_KIND_FLOAT tests Test the good variants as well as the potential malformed ones. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226202256.116518-8-iii@linux.ibm.com --- tools/testing/selftests/bpf/btf_helpers.c | 4 + tools/testing/selftests/bpf/prog_tests/btf.c | 131 +++++++++++++++++++++++++++ tools/testing/selftests/bpf/test_btf.h | 3 + 3 files changed, 138 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/btf_helpers.c b/tools/testing/selftests/bpf/btf_helpers.c index 48f90490f922..b692e6ead9b5 100644 --- a/tools/testing/selftests/bpf/btf_helpers.c +++ b/tools/testing/selftests/bpf/btf_helpers.c @@ -23,6 +23,7 @@ static const char * const btf_kind_str_mapping[] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; static const char *btf_kind_str(__u16 kind) @@ -173,6 +174,9 @@ int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id) } break; } + case BTF_KIND_FLOAT: + fprintf(out, " size=%u", t->size); + break; default: break; } diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index c29406736138..11d98d3cf949 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -3531,6 +3531,136 @@ static struct btf_raw_test raw_tests[] = { .max_entries = 1, }, +{ + .descr = "float test #1, well-formed", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [2] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [3] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 8), /* [4] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 12), /* [5] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 16), /* [6] */ + BTF_STRUCT_ENC(NAME_TBD, 5, 48), /* [7] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_MEMBER_ENC(NAME_TBD, 3, 32), + BTF_MEMBER_ENC(NAME_TBD, 4, 64), + BTF_MEMBER_ENC(NAME_TBD, 5, 128), + BTF_MEMBER_ENC(NAME_TBD, 6, 256), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0_Float16\0float\0double\0_Float80\0long_double" + "\0floats\0a\0b\0c\0d\0e"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 48, + .key_type_id = 1, + .value_type_id = 7, + .max_entries = 1, +}, +{ + .descr = "float test #2, invalid vlen", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 1), 4), + /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "vlen != 0", +}, +{ + .descr = "float test #3, invalid kind_flag", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FLOAT, 1, 0), 4), + /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Invalid btf_info kind_flag", +}, +{ + .descr = "float test #4, member does not fit", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [2] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 2), /* [3] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float\0floats\0x"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 3, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Member exceeds struct_size", +}, +{ + .descr = "float test #5, member is not properly aligned", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [2] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [3] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 8), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float\0floats\0x"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 3, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Member is not properly aligned", +}, +{ + .descr = "float test #6, invalid size", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 6), /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 6, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Invalid type_size", +}, + }; /* struct btf_raw_test raw_tests[] */ static const char *get_next_str(const char *start, const char *end) @@ -6630,6 +6760,7 @@ static int btf_type_size(const struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return base_size; case BTF_KIND_INT: return base_size + sizeof(__u32); diff --git a/tools/testing/selftests/bpf/test_btf.h b/tools/testing/selftests/bpf/test_btf.h index 2023725f1962..e2394eea4b7f 100644 --- a/tools/testing/selftests/bpf/test_btf.h +++ b/tools/testing/selftests/bpf/test_btf.h @@ -66,4 +66,7 @@ #define BTF_FUNC_ENC(name, func_proto) \ BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), func_proto) +#define BTF_TYPE_FLOAT_ENC(name, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz) + #endif /* _TEST_BTF_H */ -- cgit v1.2.3 From 7999cf7df899caf244236dcc11cce844347dab4a Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:54 +0100 Subject: selftests/bpf: Add BTF_KIND_FLOAT to the existing deduplication tests Check that floats don't interfere with struct deduplication, that they are not merged with another kinds and that floats of different sizes are not merged with each other. Suggested-by: Andrii Nakryiko Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226202256.116518-9-iii@linux.ibm.com --- tools/testing/selftests/bpf/prog_tests/btf.c | 43 ++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 11d98d3cf949..0457ae32b270 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -6411,11 +6411,12 @@ const struct btf_dedup_test dedup_tests[] = { /* int[16] */ BTF_TYPE_ARRAY_ENC(1, 1, 16), /* [2] */ /* struct s { */ - BTF_STRUCT_ENC(NAME_NTH(2), 4, 84), /* [3] */ + BTF_STRUCT_ENC(NAME_NTH(2), 5, 88), /* [3] */ BTF_MEMBER_ENC(NAME_NTH(3), 4, 0), /* struct s *next; */ BTF_MEMBER_ENC(NAME_NTH(4), 5, 64), /* const int *a; */ BTF_MEMBER_ENC(NAME_NTH(5), 2, 128), /* int b[16]; */ BTF_MEMBER_ENC(NAME_NTH(6), 1, 640), /* int c; */ + BTF_MEMBER_ENC(NAME_NTH(8), 13, 672), /* float d; */ /* ptr -> [3] struct s */ BTF_PTR_ENC(3), /* [4] */ /* ptr -> [6] const int */ @@ -6426,39 +6427,43 @@ const struct btf_dedup_test dedup_tests[] = { /* full copy of the above */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), /* [7] */ BTF_TYPE_ARRAY_ENC(7, 7, 16), /* [8] */ - BTF_STRUCT_ENC(NAME_NTH(2), 4, 84), /* [9] */ + BTF_STRUCT_ENC(NAME_NTH(2), 5, 88), /* [9] */ BTF_MEMBER_ENC(NAME_NTH(3), 10, 0), BTF_MEMBER_ENC(NAME_NTH(4), 11, 64), BTF_MEMBER_ENC(NAME_NTH(5), 8, 128), BTF_MEMBER_ENC(NAME_NTH(6), 7, 640), + BTF_MEMBER_ENC(NAME_NTH(8), 13, 672), BTF_PTR_ENC(9), /* [10] */ BTF_PTR_ENC(12), /* [11] */ BTF_CONST_ENC(7), /* [12] */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(7), 4), /* [13] */ BTF_END_RAW, }, - BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0"), + BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0float\0d"), }, .expect = { .raw_types = { /* int */ - BTF_TYPE_INT_ENC(NAME_NTH(4), BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_INT_ENC(NAME_NTH(5), BTF_INT_SIGNED, 0, 32, 4), /* [1] */ /* int[16] */ BTF_TYPE_ARRAY_ENC(1, 1, 16), /* [2] */ /* struct s { */ - BTF_STRUCT_ENC(NAME_NTH(6), 4, 84), /* [3] */ - BTF_MEMBER_ENC(NAME_NTH(5), 4, 0), /* struct s *next; */ + BTF_STRUCT_ENC(NAME_NTH(8), 5, 88), /* [3] */ + BTF_MEMBER_ENC(NAME_NTH(7), 4, 0), /* struct s *next; */ BTF_MEMBER_ENC(NAME_NTH(1), 5, 64), /* const int *a; */ BTF_MEMBER_ENC(NAME_NTH(2), 2, 128), /* int b[16]; */ BTF_MEMBER_ENC(NAME_NTH(3), 1, 640), /* int c; */ + BTF_MEMBER_ENC(NAME_NTH(4), 7, 672), /* float d; */ /* ptr -> [3] struct s */ BTF_PTR_ENC(3), /* [4] */ /* ptr -> [6] const int */ BTF_PTR_ENC(6), /* [5] */ /* const -> [1] int */ BTF_CONST_ENC(1), /* [6] */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(7), 4), /* [7] */ BTF_END_RAW, }, - BTF_STR_SEC("\0a\0b\0c\0int\0next\0s"), + BTF_STR_SEC("\0a\0b\0c\0d\0int\0float\0next\0s"), }, .opts = { .dont_resolve_fwds = false, @@ -6579,9 +6584,10 @@ const struct btf_dedup_test dedup_tests[] = { BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1), BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8), BTF_FUNC_ENC(NAME_TBD, 12), /* [13] func */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [14] float */ BTF_END_RAW, }, - BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"), + BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N"), }, .expect = { .raw_types = { @@ -6604,16 +6610,17 @@ const struct btf_dedup_test dedup_tests[] = { BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1), BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8), BTF_FUNC_ENC(NAME_TBD, 12), /* [13] func */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [14] float */ BTF_END_RAW, }, - BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"), + BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N"), }, .opts = { .dont_resolve_fwds = false, }, }, { - .descr = "dedup: no int duplicates", + .descr = "dedup: no int/float duplicates", .input = { .raw_types = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 8), @@ -6628,9 +6635,15 @@ const struct btf_dedup_test dedup_tests[] = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8), /* different byte size */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), + /* all allowed sizes */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 2), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 4), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 8), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 12), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 16), BTF_END_RAW, }, - BTF_STR_SEC("\0int\0some other int"), + BTF_STR_SEC("\0int\0some other int\0float"), }, .expect = { .raw_types = { @@ -6646,9 +6659,15 @@ const struct btf_dedup_test dedup_tests[] = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8), /* different byte size */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), + /* all allowed sizes */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 2), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 4), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 8), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 12), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 16), BTF_END_RAW, }, - BTF_STR_SEC("\0int\0some other int"), + BTF_STR_SEC("\0int\0some other int\0float"), }, .opts = { .dont_resolve_fwds = false, -- cgit v1.2.3 From 923a932c982fd71856f80dbeaaa3ca41a75e89e0 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:41 -0800 Subject: scripts/bpf: Abstract eBPF API target parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Abstract out the target parameter so that upcoming commits, more than just the existing "helpers" target can be called to generate specific portions of docs from the eBPF UAPI headers. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-10-joe@cilium.io --- MAINTAINERS | 1 + include/uapi/linux/bpf.h | 2 +- scripts/bpf_doc.py | 650 +++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 615 -------------------------------------- tools/bpf/Makefile.helpers | 2 +- tools/include/uapi/linux/bpf.h | 2 +- tools/lib/bpf/Makefile | 2 +- tools/perf/MANIFEST | 2 +- 8 files changed, 656 insertions(+), 620 deletions(-) create mode 100755 scripts/bpf_doc.py delete mode 100755 scripts/bpf_helpers_doc.py (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index a50a543e3c81..8d56c7044067 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3223,6 +3223,7 @@ F: net/core/filter.c F: net/sched/act_bpf.c F: net/sched/cls_bpf.c F: samples/bpf/ +F: scripts/bpf_doc.py F: tools/bpf/ F: tools/lib/bpf/ F: tools/testing/selftests/bpf/ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8b9d19fce22..63a56ed6a785 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1439,7 +1439,7 @@ union bpf_attr { * parsed and used to produce a manual page. The workflow is the following, * and requires the rst2man utility: * - * $ ./scripts/bpf_helpers_doc.py \ + * $ ./scripts/bpf_doc.py \ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 * $ man /tmp/bpf-helpers.7 diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py new file mode 100755 index 000000000000..5a4f68aab335 --- /dev/null +++ b/scripts/bpf_doc.py @@ -0,0 +1,650 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright (C) 2018-2019 Netronome Systems, Inc. +# Copyright (C) 2021 Isovalent, Inc. + +# In case user attempts to run with Python 2. +from __future__ import print_function + +import argparse +import re +import sys, os + +class NoHelperFound(BaseException): + pass + +class ParsingError(BaseException): + def __init__(self, line='', reader=None): + if reader: + BaseException.__init__(self, + 'Error at file offset %d, parsing line: %s' % + (reader.tell(), line)) + else: + BaseException.__init__(self, 'Error parsing line: %s' % line) + +class Helper(object): + """ + An object representing the description of an eBPF helper function. + @proto: function prototype of the helper function + @desc: textual description of the helper function + @ret: description of the return value of the helper function + """ + def __init__(self, proto='', desc='', ret=''): + self.proto = proto + self.desc = desc + self.ret = ret + + def proto_break_down(self): + """ + Break down helper function protocol into smaller chunks: return type, + name, distincts arguments. + """ + arg_re = re.compile('((\w+ )*?(\w+|...))( (\**)(\w+))?$') + res = {} + proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$') + + capture = proto_re.match(self.proto) + res['ret_type'] = capture.group(1) + res['ret_star'] = capture.group(2) + res['name'] = capture.group(3) + res['args'] = [] + + args = capture.group(4).split(', ') + for a in args: + capture = arg_re.match(a) + res['args'].append({ + 'type' : capture.group(1), + 'star' : capture.group(5), + 'name' : capture.group(6) + }) + + return res + +class HeaderParser(object): + """ + An object used to parse a file in order to extract the documentation of a + list of eBPF helper functions. All the helpers that can be retrieved are + stored as Helper object, in the self.helpers() array. + @filename: name of file to parse, usually include/uapi/linux/bpf.h in the + kernel tree + """ + def __init__(self, filename): + self.reader = open(filename, 'r') + self.line = '' + self.helpers = [] + + def parse_helper(self): + proto = self.parse_proto() + desc = self.parse_desc() + ret = self.parse_ret() + return Helper(proto=proto, desc=desc, ret=ret) + + def parse_proto(self): + # Argument can be of shape: + # - "void" + # - "type name" + # - "type *name" + # - Same as above, with "const" and/or "struct" in front of type + # - "..." (undefined number of arguments, for bpf_trace_printk()) + # There is at least one term ("void"), and at most five arguments. + p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$') + capture = p.match(self.line) + if not capture: + raise NoHelperFound + self.line = self.reader.readline() + return capture.group(1) + + def parse_desc(self): + p = re.compile(' \* ?(?:\t| {5,8})Description$') + capture = p.match(self.line) + if not capture: + # Helper can have empty description and we might be parsing another + # attribute: return but do not consume. + return '' + # Description can be several lines, some of them possibly empty, and it + # stops when another subsection title is met. + desc = '' + while True: + self.line = self.reader.readline() + if self.line == ' *\n': + desc += '\n' + else: + p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') + capture = p.match(self.line) + if capture: + desc += capture.group(1) + '\n' + else: + break + return desc + + def parse_ret(self): + p = re.compile(' \* ?(?:\t| {5,8})Return$') + capture = p.match(self.line) + if not capture: + # Helper can have empty retval and we might be parsing another + # attribute: return but do not consume. + return '' + # Return value description can be several lines, some of them possibly + # empty, and it stops when another subsection title is met. + ret = '' + while True: + self.line = self.reader.readline() + if self.line == ' *\n': + ret += '\n' + else: + p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') + capture = p.match(self.line) + if capture: + ret += capture.group(1) + '\n' + else: + break + return ret + + def run(self): + # Advance to start of helper function descriptions. + offset = self.reader.read().find('* Start of BPF helper function descriptions:') + if offset == -1: + raise Exception('Could not find start of eBPF helper descriptions list') + self.reader.seek(offset) + self.reader.readline() + self.reader.readline() + self.line = self.reader.readline() + + while True: + try: + helper = self.parse_helper() + self.helpers.append(helper) + except NoHelperFound: + break + + self.reader.close() + +############################################################################### + +class Printer(object): + """ + A generic class for printers. Printers should be created with an array of + Helper objects, and implement a way to print them in the desired fashion. + @parser: A HeaderParser with objects to print to standard output + """ + def __init__(self, parser): + self.parser = parser + self.elements = [] + + def print_header(self): + pass + + def print_footer(self): + pass + + def print_one(self, helper): + pass + + def print_all(self): + self.print_header() + for elem in self.elements: + self.print_one(elem) + self.print_footer() + + +class PrinterRST(Printer): + """ + A generic class for printers that print ReStructured Text. Printers should + be created with a HeaderParser object, and implement a way to print API + elements in the desired fashion. + @parser: A HeaderParser with objects to print to standard output + """ + def __init__(self, parser): + self.parser = parser + + def print_license(self): + license = '''\ +.. Copyright (C) All BPF authors and contributors from 2014 to present. +.. See git log include/uapi/linux/bpf.h in kernel tree for details. +.. +.. %%%LICENSE_START(VERBATIM) +.. Permission is granted to make and distribute verbatim copies of this +.. manual provided the copyright notice and this permission notice are +.. preserved on all copies. +.. +.. Permission is granted to copy and distribute modified versions of this +.. manual under the conditions for verbatim copying, provided that the +.. entire resulting derived work is distributed under the terms of a +.. permission notice identical to this one. +.. +.. Since the Linux kernel and libraries are constantly changing, this +.. manual page may be incorrect or out-of-date. The author(s) assume no +.. responsibility for errors or omissions, or for damages resulting from +.. the use of the information contained herein. The author(s) may not +.. have taken the same level of care in the production of this manual, +.. which is licensed free of charge, as they might when working +.. professionally. +.. +.. Formatted or processed versions of this manual, if unaccompanied by +.. the source, must acknowledge the copyright and authors of this work. +.. %%%LICENSE_END +.. +.. Please do not edit this file. It was generated from the documentation +.. located in file include/uapi/linux/bpf.h of the Linux kernel sources +.. (helpers description), and from scripts/bpf_doc.py in the same +.. repository (header and footer). +''' + print(license) + + def print_elem(self, elem): + if (elem.desc): + print('\tDescription') + # Do not strip all newline characters: formatted code at the end of + # a section must be followed by a blank line. + for line in re.sub('\n$', '', elem.desc, count=1).split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + if (elem.ret): + print('\tReturn') + for line in elem.ret.rstrip().split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + print('') + + +class PrinterHelpersRST(PrinterRST): + """ + A printer for dumping collected information about helpers as a ReStructured + Text page compatible with the rst2man program, which can be used to + generate a manual page for the helpers. + @parser: A HeaderParser with Helper objects to print to standard output + """ + def __init__(self, parser): + self.elements = parser.helpers + + def print_header(self): + header = '''\ +=========== +BPF-HELPERS +=========== +------------------------------------------------------------------------------- +list of eBPF helper functions +------------------------------------------------------------------------------- + +:Manual section: 7 + +DESCRIPTION +=========== + +The extended Berkeley Packet Filter (eBPF) subsystem consists in programs +written in a pseudo-assembly language, then attached to one of the several +kernel hooks and run in reaction of specific events. This framework differs +from the older, "classic" BPF (or "cBPF") in several aspects, one of them being +the ability to call special functions (or "helpers") from within a program. +These functions are restricted to a white-list of helpers defined in the +kernel. + +These helpers are used by eBPF programs to interact with the system, or with +the context in which they work. For instance, they can be used to print +debugging messages, to get the time since the system was booted, to interact +with eBPF maps, or to manipulate network packets. Since there are several eBPF +program types, and that they do not run in the same context, each program type +can only call a subset of those helpers. + +Due to eBPF conventions, a helper can not have more than five arguments. + +Internally, eBPF programs call directly into the compiled helper functions +without requiring any foreign-function interface. As a result, calling helpers +introduces no overhead, thus offering excellent performance. + +This document is an attempt to list and document the helpers available to eBPF +developers. They are sorted by chronological order (the oldest helpers in the +kernel at the top). + +HELPERS +======= +''' + PrinterRST.print_license(self) + print(header) + + def print_footer(self): + footer = ''' +EXAMPLES +======== + +Example usage for most of the eBPF helpers listed in this manual page are +available within the Linux kernel sources, at the following locations: + +* *samples/bpf/* +* *tools/testing/selftests/bpf/* + +LICENSE +======= + +eBPF programs can have an associated license, passed along with the bytecode +instructions to the kernel when the programs are loaded. The format for that +string is identical to the one in use for kernel modules (Dual licenses, such +as "Dual BSD/GPL", may be used). Some helper functions are only accessible to +programs that are compatible with the GNU Privacy License (GPL). + +In order to use such helpers, the eBPF program must be loaded with the correct +license string passed (via **attr**) to the **bpf**\ () system call, and this +generally translates into the C source code of the program containing a line +similar to the following: + +:: + + char ____license[] __attribute__((section("license"), used)) = "GPL"; + +IMPLEMENTATION +============== + +This manual page is an effort to document the existing eBPF helper functions. +But as of this writing, the BPF sub-system is under heavy development. New eBPF +program or map types are added, along with new helper functions. Some helpers +are occasionally made available for additional program types. So in spite of +the efforts of the community, this page might not be up-to-date. If you want to +check by yourself what helper functions exist in your kernel, or what types of +programs they can support, here are some files among the kernel tree that you +may be interested in: + +* *include/uapi/linux/bpf.h* is the main BPF header. It contains the full list + of all helper functions, as well as many other BPF definitions including most + of the flags, structs or constants used by the helpers. +* *net/core/filter.c* contains the definition of most network-related helper + functions, and the list of program types from which they can be used. +* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related + helpers. +* *kernel/bpf/verifier.c* contains the functions used to check that valid types + of eBPF maps are used with a given helper function. +* *kernel/bpf/* directory contains other files in which additional helpers are + defined (for cgroups, sockmaps, etc.). +* The bpftool utility can be used to probe the availability of helper functions + on the system (as well as supported program and map types, and a number of + other parameters). To do so, run **bpftool feature probe** (see + **bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to + list features available to unprivileged users. + +Compatibility between helper functions and program types can generally be found +in the files where helper functions are defined. Look for the **struct +bpf_func_proto** objects and for functions returning them: these functions +contain a list of helpers that a given program type can call. Note that the +**default:** label of the **switch ... case** used to filter helpers can call +other functions, themselves allowing access to additional helpers. The +requirement for GPL license is also in those **struct bpf_func_proto**. + +Compatibility between helper functions and map types can be found in the +**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*. + +Helper functions that invalidate the checks on **data** and **data_end** +pointers for network processing are listed in function +**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*. + +SEE ALSO +======== + +**bpf**\ (2), +**bpftool**\ (8), +**cgroups**\ (7), +**ip**\ (8), +**perf_event_open**\ (2), +**sendmsg**\ (2), +**socket**\ (7), +**tc-bpf**\ (8)''' + print(footer) + + def print_proto(self, helper): + """ + Format function protocol with bold and italics markers. This makes RST + file less readable, but gives nice results in the manual page. + """ + proto = helper.proto_break_down() + + print('**%s %s%s(' % (proto['ret_type'], + proto['ret_star'].replace('*', '\\*'), + proto['name']), + end='') + + comma = '' + for a in proto['args']: + one_arg = '{}{}'.format(comma, a['type']) + if a['name']: + if a['star']: + one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*')) + else: + one_arg += '** ' + one_arg += '*{}*\\ **'.format(a['name']) + comma = ', ' + print(one_arg, end='') + + print(')**') + + def print_one(self, helper): + self.print_proto(helper) + self.print_elem(helper) + + + + +class PrinterHelpers(Printer): + """ + A printer for dumping collected information about helpers as C header to + be included from BPF program. + @parser: A HeaderParser with Helper objects to print to standard output + """ + def __init__(self, parser): + self.elements = parser.helpers + + type_fwds = [ + 'struct bpf_fib_lookup', + 'struct bpf_sk_lookup', + 'struct bpf_perf_event_data', + 'struct bpf_perf_event_value', + 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', + 'struct bpf_sock', + 'struct bpf_sock_addr', + 'struct bpf_sock_ops', + 'struct bpf_sock_tuple', + 'struct bpf_spin_lock', + 'struct bpf_sysctl', + 'struct bpf_tcp_sock', + 'struct bpf_tunnel_key', + 'struct bpf_xfrm_state', + 'struct linux_binprm', + 'struct pt_regs', + 'struct sk_reuseport_md', + 'struct sockaddr', + 'struct tcphdr', + 'struct seq_file', + 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', + 'struct udp6_sock', + 'struct task_struct', + + 'struct __sk_buff', + 'struct sk_msg_md', + 'struct xdp_md', + 'struct path', + 'struct btf_ptr', + 'struct inode', + 'struct socket', + 'struct file', + ] + known_types = { + '...', + 'void', + 'const void', + 'char', + 'const char', + 'int', + 'long', + 'unsigned long', + + '__be16', + '__be32', + '__wsum', + + 'struct bpf_fib_lookup', + 'struct bpf_perf_event_data', + 'struct bpf_perf_event_value', + 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', + 'struct bpf_sk_lookup', + 'struct bpf_sock', + 'struct bpf_sock_addr', + 'struct bpf_sock_ops', + 'struct bpf_sock_tuple', + 'struct bpf_spin_lock', + 'struct bpf_sysctl', + 'struct bpf_tcp_sock', + 'struct bpf_tunnel_key', + 'struct bpf_xfrm_state', + 'struct linux_binprm', + 'struct pt_regs', + 'struct sk_reuseport_md', + 'struct sockaddr', + 'struct tcphdr', + 'struct seq_file', + 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', + 'struct udp6_sock', + 'struct task_struct', + 'struct path', + 'struct btf_ptr', + 'struct inode', + 'struct socket', + 'struct file', + } + mapped_types = { + 'u8': '__u8', + 'u16': '__u16', + 'u32': '__u32', + 'u64': '__u64', + 's8': '__s8', + 's16': '__s16', + 's32': '__s32', + 's64': '__s64', + 'size_t': 'unsigned long', + 'struct bpf_map': 'void', + 'struct sk_buff': 'struct __sk_buff', + 'const struct sk_buff': 'const struct __sk_buff', + 'struct sk_msg_buff': 'struct sk_msg_md', + 'struct xdp_buff': 'struct xdp_md', + } + # Helpers overloaded for different context types. + overloaded_helpers = [ + 'bpf_get_socket_cookie', + 'bpf_sk_assign', + ] + + def print_header(self): + header = '''\ +/* This is auto-generated file. See bpf_doc.py for details. */ + +/* Forward declarations of BPF structs */''' + + print(header) + for fwd in self.type_fwds: + print('%s;' % fwd) + print('') + + def print_footer(self): + footer = '' + print(footer) + + def map_type(self, t): + if t in self.known_types: + return t + if t in self.mapped_types: + return self.mapped_types[t] + print("Unrecognized type '%s', please add it to known types!" % t, + file=sys.stderr) + sys.exit(1) + + seen_helpers = set() + + def print_one(self, helper): + proto = helper.proto_break_down() + + if proto['name'] in self.seen_helpers: + return + self.seen_helpers.add(proto['name']) + + print('/*') + print(" * %s" % proto['name']) + print(" *") + if (helper.desc): + # Do not strip all newline characters: formatted code at the end of + # a section must be followed by a blank line. + for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): + print(' *{}{}'.format(' \t' if line else '', line)) + + if (helper.ret): + print(' *') + print(' * Returns') + for line in helper.ret.rstrip().split('\n'): + print(' *{}{}'.format(' \t' if line else '', line)) + + print(' */') + print('static %s %s(*%s)(' % (self.map_type(proto['ret_type']), + proto['ret_star'], proto['name']), end='') + comma = '' + for i, a in enumerate(proto['args']): + t = a['type'] + n = a['name'] + if proto['name'] in self.overloaded_helpers and i == 0: + t = 'void' + n = 'ctx' + one_arg = '{}{}'.format(comma, self.map_type(t)) + if n: + if a['star']: + one_arg += ' {}'.format(a['star']) + else: + one_arg += ' ' + one_arg += '{}'.format(n) + comma = ', ' + print(one_arg, end='') + + print(') = (void *) %d;' % len(self.seen_helpers)) + print('') + +############################################################################### + +# If script is launched from scripts/ from kernel tree and can access +# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse, +# otherwise the --filename argument will be required from the command line. +script = os.path.abspath(sys.argv[0]) +linuxRoot = os.path.dirname(os.path.dirname(script)) +bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h') + +printers = { + 'helpers': PrinterHelpersRST, +} + +argParser = argparse.ArgumentParser(description=""" +Parse eBPF header file and generate documentation for the eBPF API. +The RST-formatted output produced can be turned into a manual page with the +rst2man utility. +""") +argParser.add_argument('--header', action='store_true', + help='generate C header file') +if (os.path.isfile(bpfh)): + argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h', + default=bpfh) +else: + argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h') +argParser.add_argument('target', nargs='?', default='helpers', + choices=printers.keys(), help='eBPF API target') +args = argParser.parse_args() + +# Parse file. +headerParser = HeaderParser(args.filename) +headerParser.run() + +# Print formatted output to standard output. +if args.header: + printer = PrinterHelpers(headerParser) +else: + printer = printers[args.target](headerParser) +printer.print_all() diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py deleted file mode 100755 index 867ada23281c..000000000000 --- a/scripts/bpf_helpers_doc.py +++ /dev/null @@ -1,615 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0-only -# -# Copyright (C) 2018-2019 Netronome Systems, Inc. - -# In case user attempts to run with Python 2. -from __future__ import print_function - -import argparse -import re -import sys, os - -class NoHelperFound(BaseException): - pass - -class ParsingError(BaseException): - def __init__(self, line='', reader=None): - if reader: - BaseException.__init__(self, - 'Error at file offset %d, parsing line: %s' % - (reader.tell(), line)) - else: - BaseException.__init__(self, 'Error parsing line: %s' % line) - -class Helper(object): - """ - An object representing the description of an eBPF helper function. - @proto: function prototype of the helper function - @desc: textual description of the helper function - @ret: description of the return value of the helper function - """ - def __init__(self, proto='', desc='', ret=''): - self.proto = proto - self.desc = desc - self.ret = ret - - def proto_break_down(self): - """ - Break down helper function protocol into smaller chunks: return type, - name, distincts arguments. - """ - arg_re = re.compile('((\w+ )*?(\w+|...))( (\**)(\w+))?$') - res = {} - proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$') - - capture = proto_re.match(self.proto) - res['ret_type'] = capture.group(1) - res['ret_star'] = capture.group(2) - res['name'] = capture.group(3) - res['args'] = [] - - args = capture.group(4).split(', ') - for a in args: - capture = arg_re.match(a) - res['args'].append({ - 'type' : capture.group(1), - 'star' : capture.group(5), - 'name' : capture.group(6) - }) - - return res - -class HeaderParser(object): - """ - An object used to parse a file in order to extract the documentation of a - list of eBPF helper functions. All the helpers that can be retrieved are - stored as Helper object, in the self.helpers() array. - @filename: name of file to parse, usually include/uapi/linux/bpf.h in the - kernel tree - """ - def __init__(self, filename): - self.reader = open(filename, 'r') - self.line = '' - self.helpers = [] - - def parse_helper(self): - proto = self.parse_proto() - desc = self.parse_desc() - ret = self.parse_ret() - return Helper(proto=proto, desc=desc, ret=ret) - - def parse_proto(self): - # Argument can be of shape: - # - "void" - # - "type name" - # - "type *name" - # - Same as above, with "const" and/or "struct" in front of type - # - "..." (undefined number of arguments, for bpf_trace_printk()) - # There is at least one term ("void"), and at most five arguments. - p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$') - capture = p.match(self.line) - if not capture: - raise NoHelperFound - self.line = self.reader.readline() - return capture.group(1) - - def parse_desc(self): - p = re.compile(' \* ?(?:\t| {5,8})Description$') - capture = p.match(self.line) - if not capture: - # Helper can have empty description and we might be parsing another - # attribute: return but do not consume. - return '' - # Description can be several lines, some of them possibly empty, and it - # stops when another subsection title is met. - desc = '' - while True: - self.line = self.reader.readline() - if self.line == ' *\n': - desc += '\n' - else: - p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') - capture = p.match(self.line) - if capture: - desc += capture.group(1) + '\n' - else: - break - return desc - - def parse_ret(self): - p = re.compile(' \* ?(?:\t| {5,8})Return$') - capture = p.match(self.line) - if not capture: - # Helper can have empty retval and we might be parsing another - # attribute: return but do not consume. - return '' - # Return value description can be several lines, some of them possibly - # empty, and it stops when another subsection title is met. - ret = '' - while True: - self.line = self.reader.readline() - if self.line == ' *\n': - ret += '\n' - else: - p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') - capture = p.match(self.line) - if capture: - ret += capture.group(1) + '\n' - else: - break - return ret - - def run(self): - # Advance to start of helper function descriptions. - offset = self.reader.read().find('* Start of BPF helper function descriptions:') - if offset == -1: - raise Exception('Could not find start of eBPF helper descriptions list') - self.reader.seek(offset) - self.reader.readline() - self.reader.readline() - self.line = self.reader.readline() - - while True: - try: - helper = self.parse_helper() - self.helpers.append(helper) - except NoHelperFound: - break - - self.reader.close() - -############################################################################### - -class Printer(object): - """ - A generic class for printers. Printers should be created with an array of - Helper objects, and implement a way to print them in the desired fashion. - @helpers: array of Helper objects to print to standard output - """ - def __init__(self, helpers): - self.helpers = helpers - - def print_header(self): - pass - - def print_footer(self): - pass - - def print_one(self, helper): - pass - - def print_all(self): - self.print_header() - for helper in self.helpers: - self.print_one(helper) - self.print_footer() - -class PrinterRST(Printer): - """ - A printer for dumping collected information about helpers as a ReStructured - Text page compatible with the rst2man program, which can be used to - generate a manual page for the helpers. - @helpers: array of Helper objects to print to standard output - """ - def print_header(self): - header = '''\ -.. Copyright (C) All BPF authors and contributors from 2014 to present. -.. See git log include/uapi/linux/bpf.h in kernel tree for details. -.. -.. %%%LICENSE_START(VERBATIM) -.. Permission is granted to make and distribute verbatim copies of this -.. manual provided the copyright notice and this permission notice are -.. preserved on all copies. -.. -.. Permission is granted to copy and distribute modified versions of this -.. manual under the conditions for verbatim copying, provided that the -.. entire resulting derived work is distributed under the terms of a -.. permission notice identical to this one. -.. -.. Since the Linux kernel and libraries are constantly changing, this -.. manual page may be incorrect or out-of-date. The author(s) assume no -.. responsibility for errors or omissions, or for damages resulting from -.. the use of the information contained herein. The author(s) may not -.. have taken the same level of care in the production of this manual, -.. which is licensed free of charge, as they might when working -.. professionally. -.. -.. Formatted or processed versions of this manual, if unaccompanied by -.. the source, must acknowledge the copyright and authors of this work. -.. %%%LICENSE_END -.. -.. Please do not edit this file. It was generated from the documentation -.. located in file include/uapi/linux/bpf.h of the Linux kernel sources -.. (helpers description), and from scripts/bpf_helpers_doc.py in the same -.. repository (header and footer). - -=========== -BPF-HELPERS -=========== -------------------------------------------------------------------------------- -list of eBPF helper functions -------------------------------------------------------------------------------- - -:Manual section: 7 - -DESCRIPTION -=========== - -The extended Berkeley Packet Filter (eBPF) subsystem consists in programs -written in a pseudo-assembly language, then attached to one of the several -kernel hooks and run in reaction of specific events. This framework differs -from the older, "classic" BPF (or "cBPF") in several aspects, one of them being -the ability to call special functions (or "helpers") from within a program. -These functions are restricted to a white-list of helpers defined in the -kernel. - -These helpers are used by eBPF programs to interact with the system, or with -the context in which they work. For instance, they can be used to print -debugging messages, to get the time since the system was booted, to interact -with eBPF maps, or to manipulate network packets. Since there are several eBPF -program types, and that they do not run in the same context, each program type -can only call a subset of those helpers. - -Due to eBPF conventions, a helper can not have more than five arguments. - -Internally, eBPF programs call directly into the compiled helper functions -without requiring any foreign-function interface. As a result, calling helpers -introduces no overhead, thus offering excellent performance. - -This document is an attempt to list and document the helpers available to eBPF -developers. They are sorted by chronological order (the oldest helpers in the -kernel at the top). - -HELPERS -======= -''' - print(header) - - def print_footer(self): - footer = ''' -EXAMPLES -======== - -Example usage for most of the eBPF helpers listed in this manual page are -available within the Linux kernel sources, at the following locations: - -* *samples/bpf/* -* *tools/testing/selftests/bpf/* - -LICENSE -======= - -eBPF programs can have an associated license, passed along with the bytecode -instructions to the kernel when the programs are loaded. The format for that -string is identical to the one in use for kernel modules (Dual licenses, such -as "Dual BSD/GPL", may be used). Some helper functions are only accessible to -programs that are compatible with the GNU Privacy License (GPL). - -In order to use such helpers, the eBPF program must be loaded with the correct -license string passed (via **attr**) to the **bpf**\ () system call, and this -generally translates into the C source code of the program containing a line -similar to the following: - -:: - - char ____license[] __attribute__((section("license"), used)) = "GPL"; - -IMPLEMENTATION -============== - -This manual page is an effort to document the existing eBPF helper functions. -But as of this writing, the BPF sub-system is under heavy development. New eBPF -program or map types are added, along with new helper functions. Some helpers -are occasionally made available for additional program types. So in spite of -the efforts of the community, this page might not be up-to-date. If you want to -check by yourself what helper functions exist in your kernel, or what types of -programs they can support, here are some files among the kernel tree that you -may be interested in: - -* *include/uapi/linux/bpf.h* is the main BPF header. It contains the full list - of all helper functions, as well as many other BPF definitions including most - of the flags, structs or constants used by the helpers. -* *net/core/filter.c* contains the definition of most network-related helper - functions, and the list of program types from which they can be used. -* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related - helpers. -* *kernel/bpf/verifier.c* contains the functions used to check that valid types - of eBPF maps are used with a given helper function. -* *kernel/bpf/* directory contains other files in which additional helpers are - defined (for cgroups, sockmaps, etc.). -* The bpftool utility can be used to probe the availability of helper functions - on the system (as well as supported program and map types, and a number of - other parameters). To do so, run **bpftool feature probe** (see - **bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to - list features available to unprivileged users. - -Compatibility between helper functions and program types can generally be found -in the files where helper functions are defined. Look for the **struct -bpf_func_proto** objects and for functions returning them: these functions -contain a list of helpers that a given program type can call. Note that the -**default:** label of the **switch ... case** used to filter helpers can call -other functions, themselves allowing access to additional helpers. The -requirement for GPL license is also in those **struct bpf_func_proto**. - -Compatibility between helper functions and map types can be found in the -**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*. - -Helper functions that invalidate the checks on **data** and **data_end** -pointers for network processing are listed in function -**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*. - -SEE ALSO -======== - -**bpf**\ (2), -**bpftool**\ (8), -**cgroups**\ (7), -**ip**\ (8), -**perf_event_open**\ (2), -**sendmsg**\ (2), -**socket**\ (7), -**tc-bpf**\ (8)''' - print(footer) - - def print_proto(self, helper): - """ - Format function protocol with bold and italics markers. This makes RST - file less readable, but gives nice results in the manual page. - """ - proto = helper.proto_break_down() - - print('**%s %s%s(' % (proto['ret_type'], - proto['ret_star'].replace('*', '\\*'), - proto['name']), - end='') - - comma = '' - for a in proto['args']: - one_arg = '{}{}'.format(comma, a['type']) - if a['name']: - if a['star']: - one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*')) - else: - one_arg += '** ' - one_arg += '*{}*\\ **'.format(a['name']) - comma = ', ' - print(one_arg, end='') - - print(')**') - - def print_one(self, helper): - self.print_proto(helper) - - if (helper.desc): - print('\tDescription') - # Do not strip all newline characters: formatted code at the end of - # a section must be followed by a blank line. - for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): - print('{}{}'.format('\t\t' if line else '', line)) - - if (helper.ret): - print('\tReturn') - for line in helper.ret.rstrip().split('\n'): - print('{}{}'.format('\t\t' if line else '', line)) - - print('') - -class PrinterHelpers(Printer): - """ - A printer for dumping collected information about helpers as C header to - be included from BPF program. - @helpers: array of Helper objects to print to standard output - """ - - type_fwds = [ - 'struct bpf_fib_lookup', - 'struct bpf_sk_lookup', - 'struct bpf_perf_event_data', - 'struct bpf_perf_event_value', - 'struct bpf_pidns_info', - 'struct bpf_redir_neigh', - 'struct bpf_sock', - 'struct bpf_sock_addr', - 'struct bpf_sock_ops', - 'struct bpf_sock_tuple', - 'struct bpf_spin_lock', - 'struct bpf_sysctl', - 'struct bpf_tcp_sock', - 'struct bpf_tunnel_key', - 'struct bpf_xfrm_state', - 'struct linux_binprm', - 'struct pt_regs', - 'struct sk_reuseport_md', - 'struct sockaddr', - 'struct tcphdr', - 'struct seq_file', - 'struct tcp6_sock', - 'struct tcp_sock', - 'struct tcp_timewait_sock', - 'struct tcp_request_sock', - 'struct udp6_sock', - 'struct task_struct', - - 'struct __sk_buff', - 'struct sk_msg_md', - 'struct xdp_md', - 'struct path', - 'struct btf_ptr', - 'struct inode', - 'struct socket', - 'struct file', - ] - known_types = { - '...', - 'void', - 'const void', - 'char', - 'const char', - 'int', - 'long', - 'unsigned long', - - '__be16', - '__be32', - '__wsum', - - 'struct bpf_fib_lookup', - 'struct bpf_perf_event_data', - 'struct bpf_perf_event_value', - 'struct bpf_pidns_info', - 'struct bpf_redir_neigh', - 'struct bpf_sk_lookup', - 'struct bpf_sock', - 'struct bpf_sock_addr', - 'struct bpf_sock_ops', - 'struct bpf_sock_tuple', - 'struct bpf_spin_lock', - 'struct bpf_sysctl', - 'struct bpf_tcp_sock', - 'struct bpf_tunnel_key', - 'struct bpf_xfrm_state', - 'struct linux_binprm', - 'struct pt_regs', - 'struct sk_reuseport_md', - 'struct sockaddr', - 'struct tcphdr', - 'struct seq_file', - 'struct tcp6_sock', - 'struct tcp_sock', - 'struct tcp_timewait_sock', - 'struct tcp_request_sock', - 'struct udp6_sock', - 'struct task_struct', - 'struct path', - 'struct btf_ptr', - 'struct inode', - 'struct socket', - 'struct file', - } - mapped_types = { - 'u8': '__u8', - 'u16': '__u16', - 'u32': '__u32', - 'u64': '__u64', - 's8': '__s8', - 's16': '__s16', - 's32': '__s32', - 's64': '__s64', - 'size_t': 'unsigned long', - 'struct bpf_map': 'void', - 'struct sk_buff': 'struct __sk_buff', - 'const struct sk_buff': 'const struct __sk_buff', - 'struct sk_msg_buff': 'struct sk_msg_md', - 'struct xdp_buff': 'struct xdp_md', - } - # Helpers overloaded for different context types. - overloaded_helpers = [ - 'bpf_get_socket_cookie', - 'bpf_sk_assign', - ] - - def print_header(self): - header = '''\ -/* This is auto-generated file. See bpf_helpers_doc.py for details. */ - -/* Forward declarations of BPF structs */''' - - print(header) - for fwd in self.type_fwds: - print('%s;' % fwd) - print('') - - def print_footer(self): - footer = '' - print(footer) - - def map_type(self, t): - if t in self.known_types: - return t - if t in self.mapped_types: - return self.mapped_types[t] - print("Unrecognized type '%s', please add it to known types!" % t, - file=sys.stderr) - sys.exit(1) - - seen_helpers = set() - - def print_one(self, helper): - proto = helper.proto_break_down() - - if proto['name'] in self.seen_helpers: - return - self.seen_helpers.add(proto['name']) - - print('/*') - print(" * %s" % proto['name']) - print(" *") - if (helper.desc): - # Do not strip all newline characters: formatted code at the end of - # a section must be followed by a blank line. - for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): - print(' *{}{}'.format(' \t' if line else '', line)) - - if (helper.ret): - print(' *') - print(' * Returns') - for line in helper.ret.rstrip().split('\n'): - print(' *{}{}'.format(' \t' if line else '', line)) - - print(' */') - print('static %s %s(*%s)(' % (self.map_type(proto['ret_type']), - proto['ret_star'], proto['name']), end='') - comma = '' - for i, a in enumerate(proto['args']): - t = a['type'] - n = a['name'] - if proto['name'] in self.overloaded_helpers and i == 0: - t = 'void' - n = 'ctx' - one_arg = '{}{}'.format(comma, self.map_type(t)) - if n: - if a['star']: - one_arg += ' {}'.format(a['star']) - else: - one_arg += ' ' - one_arg += '{}'.format(n) - comma = ', ' - print(one_arg, end='') - - print(') = (void *) %d;' % len(self.seen_helpers)) - print('') - -############################################################################### - -# If script is launched from scripts/ from kernel tree and can access -# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse, -# otherwise the --filename argument will be required from the command line. -script = os.path.abspath(sys.argv[0]) -linuxRoot = os.path.dirname(os.path.dirname(script)) -bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h') - -argParser = argparse.ArgumentParser(description=""" -Parse eBPF header file and generate documentation for eBPF helper functions. -The RST-formatted output produced can be turned into a manual page with the -rst2man utility. -""") -argParser.add_argument('--header', action='store_true', - help='generate C header file') -if (os.path.isfile(bpfh)): - argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h', - default=bpfh) -else: - argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h') -args = argParser.parse_args() - -# Parse file. -headerParser = HeaderParser(args.filename) -headerParser.run() - -# Print formatted output to standard output. -if args.header: - printer = PrinterHelpers(headerParser.helpers) -else: - printer = PrinterRST(headerParser.helpers) -printer.print_all() diff --git a/tools/bpf/Makefile.helpers b/tools/bpf/Makefile.helpers index 854d084026dd..a26599022fd6 100644 --- a/tools/bpf/Makefile.helpers +++ b/tools/bpf/Makefile.helpers @@ -35,7 +35,7 @@ man7: $(DOC_MAN7) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) $(OUTPUT)$(HELPERS_RST): $(UP2DIR)../../include/uapi/linux/bpf.h - $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_helpers_doc.py --filename $< > $@ + $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_doc.py --filename $< > $@ $(OUTPUT)%.7: $(OUTPUT)%.rst ifndef RST2MAN_DEP diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b89af20cfa19..b4c5c529ad17 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -729,7 +729,7 @@ union bpf_attr { * parsed and used to produce a manual page. The workflow is the following, * and requires the rst2man utility: * - * $ ./scripts/bpf_helpers_doc.py \ + * $ ./scripts/bpf_doc.py \ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 * $ man /tmp/bpf-helpers.7 diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 887a494ad5fc..8170f88e8ea6 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -158,7 +158,7 @@ $(BPF_IN_STATIC): force $(BPF_HELPER_DEFS) $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h - $(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \ + $(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \ --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS) $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 5d7b947320fb..f05c4d48fd7e 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -20,4 +20,4 @@ tools/lib/bitmap.c tools/lib/str_error_r.c tools/lib/vsprintf.c tools/lib/zalloc.c -scripts/bpf_helpers_doc.py +scripts/bpf_doc.py -- cgit v1.2.3 From a01d935b2e0916d32cb567f90c32a0c4eb46993c Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:43 -0800 Subject: tools/bpf: Remove bpf-helpers from bpftool docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This logic is used for validating the manual pages from selftests, so move the infra under tools/testing/selftests/bpf/ and rely on selftests for validation rather than tying it into the bpftool build. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-12-joe@cilium.io --- tools/bpf/Makefile.helpers | 60 ---------------------- tools/bpf/bpftool/.gitignore | 1 - tools/bpf/bpftool/Documentation/Makefile | 11 ++-- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 20 ++++++-- tools/testing/selftests/bpf/Makefile.docs | 62 +++++++++++++++++++++++ tools/testing/selftests/bpf/test_bpftool_build.sh | 21 -------- tools/testing/selftests/bpf/test_doc_build.sh | 13 +++++ 8 files changed, 95 insertions(+), 94 deletions(-) delete mode 100644 tools/bpf/Makefile.helpers create mode 100644 tools/testing/selftests/bpf/Makefile.docs create mode 100755 tools/testing/selftests/bpf/test_doc_build.sh (limited to 'tools') diff --git a/tools/bpf/Makefile.helpers b/tools/bpf/Makefile.helpers deleted file mode 100644 index a26599022fd6..000000000000 --- a/tools/bpf/Makefile.helpers +++ /dev/null @@ -1,60 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -ifndef allow-override - include ../scripts/Makefile.include - include ../scripts/utilities.mak -else - # Assume Makefile.helpers is being run from bpftool/Documentation - # subdirectory. Go up two more directories to fetch bpf.h header and - # associated script. - UP2DIR := ../../ -endif - -INSTALL ?= install -RM ?= rm -f -RMDIR ?= rmdir --ignore-fail-on-non-empty - -ifeq ($(V),1) - Q = -else - Q = @ -endif - -prefix ?= /usr/local -mandir ?= $(prefix)/man -man7dir = $(mandir)/man7 - -HELPERS_RST = bpf-helpers.rst -MAN7_RST = $(HELPERS_RST) - -_DOC_MAN7 = $(patsubst %.rst,%.7,$(MAN7_RST)) -DOC_MAN7 = $(addprefix $(OUTPUT),$(_DOC_MAN7)) - -helpers: man7 -man7: $(DOC_MAN7) - -RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) - -$(OUTPUT)$(HELPERS_RST): $(UP2DIR)../../include/uapi/linux/bpf.h - $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_doc.py --filename $< > $@ - -$(OUTPUT)%.7: $(OUTPUT)%.rst -ifndef RST2MAN_DEP - $(error "rst2man not found, but required to generate man pages") -endif - $(QUIET_GEN)rst2man $< > $@ - -helpers-clean: - $(call QUIET_CLEAN, eBPF_helpers-manpage) - $(Q)$(RM) $(DOC_MAN7) $(OUTPUT)$(HELPERS_RST) - -helpers-install: helpers - $(call QUIET_INSTALL, eBPF_helpers-manpage) - $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir) - $(Q)$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir) - -helpers-uninstall: - $(call QUIET_UNINST, eBPF_helpers-manpage) - $(Q)$(RM) $(addprefix $(DESTDIR)$(man7dir)/,$(_DOC_MAN7)) - $(Q)$(RMDIR) $(DESTDIR)$(man7dir) - -.PHONY: helpers helpers-clean helpers-install helpers-uninstall diff --git a/tools/bpf/bpftool/.gitignore b/tools/bpf/bpftool/.gitignore index 944cb4b7c95d..05ce4446b780 100644 --- a/tools/bpf/bpftool/.gitignore +++ b/tools/bpf/bpftool/.gitignore @@ -3,7 +3,6 @@ /bootstrap/ /bpftool bpftool*.8 -bpf-helpers.* FEATURE-DUMP.bpftool feature libbpf diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index f33cb02de95c..c49487905ceb 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -16,15 +16,12 @@ prefix ?= /usr/local mandir ?= $(prefix)/man man8dir = $(mandir)/man8 -# Load targets for building eBPF helpers man page. -include ../../Makefile.helpers - MAN8_RST = $(wildcard bpftool*.rst) _DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST)) DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8)) -man: man8 helpers +man: man8 man8: $(DOC_MAN8) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) @@ -46,16 +43,16 @@ ifndef RST2MAN_DEP endif $(QUIET_GEN)( cat $< ; printf "%b" $(call see_also,$<) ) | rst2man $(RST2MAN_OPTS) > $@ -clean: helpers-clean +clean: $(call QUIET_CLEAN, Documentation) $(Q)$(RM) $(DOC_MAN8) -install: man helpers-install +install: man $(call QUIET_INSTALL, Documentation-man) $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man8dir) $(Q)$(INSTALL) -m 644 $(DOC_MAN8) $(DESTDIR)$(man8dir) -uninstall: helpers-uninstall +uninstall: $(call QUIET_UNINST, Documentation-man) $(Q)$(RM) $(addprefix $(DESTDIR)$(man8dir)/,$(_DOC_MAN8)) $(Q)$(RMDIR) $(DESTDIR)$(man8dir) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index c0c48fdb9ac1..a0d5ec3cfc24 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +bpf-helpers* test_verifier test_maps test_lru_map diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index a81af15e4ded..b5827464c6b5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -68,6 +68,7 @@ TEST_PROGS := test_kmod.sh \ test_bpftool_build.sh \ test_bpftool.sh \ test_bpftool_metadata.sh \ + test_docs_build.sh \ test_xsk.sh TEST_PROGS_EXTENDED := with_addr.sh \ @@ -103,6 +104,7 @@ override define CLEAN $(call msg,CLEAN) $(Q)$(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) $(Q)$(MAKE) -C bpf_testmod clean + $(Q)$(MAKE) docs-clean endef include ../lib.mk @@ -180,6 +182,7 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) cp $(SCRATCH_DIR)/runqslower $@ $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) +$(TEST_GEN_FILES): docs $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c @@ -200,11 +203,16 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ CC=$(HOSTCC) LD=$(HOSTLD) \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install - $(Q)mkdir -p $(BUILD_DIR)/bpftool/Documentation - $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ - -C $(BPFTOOLDIR)/Documentation \ - OUTPUT=$(BUILD_DIR)/bpftool/Documentation/ \ - prefix= DESTDIR=$(SCRATCH_DIR)/ install + +docs: + $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ + -f Makefile.docs \ + prefix= OUTPUT=$(OUTPUT)/ DESTDIR=$(OUTPUT)/ $@ + +docs-clean: + $(Q)$(MAKE) $(submake_extras) \ + -f Makefile.docs \ + prefix= OUTPUT=$(OUTPUT)/ DESTDIR=$(OUTPUT)/ $@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ @@ -477,3 +485,5 @@ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature \ $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko) + +.PHONY: docs docs-clean diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs new file mode 100644 index 000000000000..546c4a763b46 --- /dev/null +++ b/tools/testing/selftests/bpf/Makefile.docs @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: GPL-2.0-only + +include ../../../scripts/Makefile.include +include ../../../scripts/utilities.mak + +INSTALL ?= install +RM ?= rm -f +RMDIR ?= rmdir --ignore-fail-on-non-empty + +ifeq ($(V),1) + Q = +else + Q = @ +endif + +prefix ?= /usr/local +mandir ?= $(prefix)/man +man7dir = $(mandir)/man7 + +HELPERS_RST = bpf-helpers.rst +MAN7_RST = $(HELPERS_RST) + +_DOC_MAN7 = $(patsubst %.rst,%.7,$(MAN7_RST)) +DOC_MAN7 = $(addprefix $(OUTPUT),$(_DOC_MAN7)) + +DOCTARGETS := helpers + +docs: $(DOCTARGETS) +helpers: man7 +man7: $(DOC_MAN7) + +RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) + +# Configure make rules for the man page bpf-$1.$2. +# $1 - target for scripts/bpf_doc.py +# $2 - man page section to generate the troff file +define DOCS_RULES = +$(OUTPUT)bpf-$1.rst: ../../../../include/uapi/linux/bpf.h + $$(QUIET_GEN)../../../../scripts/bpf_doc.py $1 \ + --filename $$< > $$@ + +$(OUTPUT)%.7: $(OUTPUT)%.rst +ifndef RST2MAN_DEP + $(error "rst2man not found, but required to generate man pages") +endif + $(QUIET_GEN)rst2man $< > $@ + +docs-clean: + $(call QUIET_CLEAN, eBPF_helpers-manpage) + $(Q)$(RM) $(DOC_MAN7) $(OUTPUT)$(HELPERS_RST) + +docs-install: helpers + $(call QUIET_INSTALL, eBPF_helpers-manpage) + $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir) + $(Q)$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir) + +docs-uninstall: + $(call QUIET_UNINST, eBPF_helpers-manpage) + $(Q)$(RM) $(addprefix $(DESTDIR)$(man7dir)/,$(_DOC_MAN7)) + $(Q)$(RMDIR) $(DESTDIR)$(man7dir) + +.PHONY: docs docs-clean docs-install docs-uninstall diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index 2db3c60e1e61..ac349a5cea7e 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -85,23 +85,6 @@ make_with_tmpdir() { echo } -make_doc_and_clean() { - echo -e "\$PWD: $PWD" - echo -e "command: make -s $* doc >/dev/null" - RST2MAN_OPTS="--exit-status=1" make $J -s $* doc - if [ $? -ne 0 ] ; then - ERROR=1 - printf "FAILURE: Errors or warnings when building documentation\n" - fi - ( - if [ $# -ge 1 ] ; then - cd ${@: -1} - fi - make -s doc-clean - ) - echo -} - echo "Trying to build bpftool" echo -e "... through kbuild\n" @@ -162,7 +145,3 @@ make_and_clean make_with_tmpdir OUTPUT make_with_tmpdir O - -echo -e "Checking documentation build\n" -# From tools/bpf/bpftool -make_doc_and_clean diff --git a/tools/testing/selftests/bpf/test_doc_build.sh b/tools/testing/selftests/bpf/test_doc_build.sh new file mode 100755 index 000000000000..7eb940a7b2eb --- /dev/null +++ b/tools/testing/selftests/bpf/test_doc_build.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +# Assume script is located under tools/testing/selftests/bpf/. We want to start +# build attempts from the top of kernel repository. +SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0) +SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH) +KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../) +cd $KDIR_ROOT_DIR + +for tgt in docs docs-clean; do + make -s -C $PWD/$SCRIPT_REL_DIR $tgt; +done -- cgit v1.2.3 From 62b379a233a79e6f4d2e8b14750ae8fa13b8caf8 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:44 -0800 Subject: selftests/bpf: Templatize man page generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the Makefile here was only targeting a single manual page so it just hardcoded a bunch of individual rules to specifically handle build, clean, install, uninstall for that particular page. Upcoming commits will generate manual pages for an additional section, so this commit prepares the makefile first by converting the existing targets into an evaluated set of targets based on the manual page name and section. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-13-joe@cilium.io --- tools/testing/selftests/bpf/Makefile.docs | 40 +++++++++++++++++++------------ 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs index 546c4a763b46..f39ad19317c8 100644 --- a/tools/testing/selftests/bpf/Makefile.docs +++ b/tools/testing/selftests/bpf/Makefile.docs @@ -39,24 +39,34 @@ $(OUTPUT)bpf-$1.rst: ../../../../include/uapi/linux/bpf.h $$(QUIET_GEN)../../../../scripts/bpf_doc.py $1 \ --filename $$< > $$@ -$(OUTPUT)%.7: $(OUTPUT)%.rst +$(OUTPUT)%.$2: $(OUTPUT)%.rst ifndef RST2MAN_DEP - $(error "rst2man not found, but required to generate man pages") + $$(error "rst2man not found, but required to generate man pages") endif - $(QUIET_GEN)rst2man $< > $@ + $$(QUIET_GEN)rst2man $$< > $$@ -docs-clean: - $(call QUIET_CLEAN, eBPF_helpers-manpage) - $(Q)$(RM) $(DOC_MAN7) $(OUTPUT)$(HELPERS_RST) +docs-clean-$1: + $$(call QUIET_CLEAN, eBPF_$1-manpage) + $(Q)$(RM) $$(DOC_MAN$2) $(OUTPUT)bpf-$1.rst -docs-install: helpers - $(call QUIET_INSTALL, eBPF_helpers-manpage) - $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir) - $(Q)$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir) +docs-install-$1: docs + $$(call QUIET_INSTALL, eBPF_$1-manpage) + $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$$(man$2dir) + $(Q)$(INSTALL) -m 644 $$(DOC_MAN$2) $(DESTDIR)$$(man$2dir) -docs-uninstall: - $(call QUIET_UNINST, eBPF_helpers-manpage) - $(Q)$(RM) $(addprefix $(DESTDIR)$(man7dir)/,$(_DOC_MAN7)) - $(Q)$(RMDIR) $(DESTDIR)$(man7dir) +docs-uninstall-$1: + $$(call QUIET_UNINST, eBPF_$1-manpage) + $(Q)$(RM) $$(addprefix $(DESTDIR)$$(man$2dir)/,$$(_DOC_MAN$2)) + $(Q)$(RMDIR) $(DESTDIR)$$(man$2dir) -.PHONY: docs docs-clean docs-install docs-uninstall +.PHONY: $1 docs-clean-$1 docs-install-$1 docs-uninstall-$1 +endef + +# Create the make targets to generate manual pages by name and section +$(eval $(call DOCS_RULES,helpers,7)) + +docs-clean: $(foreach doctarget,$(DOCTARGETS), docs-clean-$(doctarget)) +docs-install: $(foreach doctarget,$(DOCTARGETS), docs-install-$(doctarget)) +docs-uninstall: $(foreach doctarget,$(DOCTARGETS), docs-uninstall-$(doctarget)) + +.PHONY: docs docs-clean docs-install docs-uninstall man7 -- cgit v1.2.3 From accbd33a9b0328777899a85d148040e4d8921d87 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:45 -0800 Subject: selftests/bpf: Test syscall command parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add building of the bpf(2) syscall commands documentation as part of the docs building step in the build. This allows us to pick up on potential parse errors from the docs generator script as part of selftests. The generated manual pages here are not intended for distribution, they are just a fragment that can be integrated into the other static text of bpf(2) to form the full manual page. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-14-joe@cilium.io --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile.docs | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index a0d5ec3cfc24..4866f6a21901 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only bpf-helpers* +bpf-syscall* test_verifier test_maps test_lru_map diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs index f39ad19317c8..ccf260021e83 100644 --- a/tools/testing/selftests/bpf/Makefile.docs +++ b/tools/testing/selftests/bpf/Makefile.docs @@ -15,18 +15,27 @@ endif prefix ?= /usr/local mandir ?= $(prefix)/man +man2dir = $(mandir)/man2 man7dir = $(mandir)/man7 +SYSCALL_RST = bpf-syscall.rst +MAN2_RST = $(SYSCALL_RST) + HELPERS_RST = bpf-helpers.rst MAN7_RST = $(HELPERS_RST) +_DOC_MAN2 = $(patsubst %.rst,%.2,$(MAN2_RST)) +DOC_MAN2 = $(addprefix $(OUTPUT),$(_DOC_MAN2)) + _DOC_MAN7 = $(patsubst %.rst,%.7,$(MAN7_RST)) DOC_MAN7 = $(addprefix $(OUTPUT),$(_DOC_MAN7)) -DOCTARGETS := helpers +DOCTARGETS := helpers syscall docs: $(DOCTARGETS) +syscall: man2 helpers: man7 +man2: $(DOC_MAN2) man7: $(DOC_MAN7) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) @@ -64,9 +73,10 @@ endef # Create the make targets to generate manual pages by name and section $(eval $(call DOCS_RULES,helpers,7)) +$(eval $(call DOCS_RULES,syscall,2)) docs-clean: $(foreach doctarget,$(DOCTARGETS), docs-clean-$(doctarget)) docs-install: $(foreach doctarget,$(DOCTARGETS), docs-install-$(doctarget)) docs-uninstall: $(foreach doctarget,$(DOCTARGETS), docs-uninstall-$(doctarget)) -.PHONY: docs docs-clean docs-install docs-uninstall man7 +.PHONY: docs docs-clean docs-install docs-uninstall man2 man7 -- cgit v1.2.3 From 242029f42691e05ac09b31b98221421bd564375e Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:47 -0800 Subject: tools: Sync uapi bpf.h header with latest changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Synchronize the header after all of the recent changes. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-16-joe@cilium.io --- tools/include/uapi/linux/bpf.h | 712 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 711 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b4c5c529ad17..63a56ed6a785 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -93,7 +93,717 @@ union bpf_iter_link_info { } map; }; -/* BPF syscall commands, see bpf(2) man-page for details. */ +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. + */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). + * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * The *pathname* argument must not contain a dot ("."). + * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allow the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). + * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_TEST_RUN + * Description + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. + * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. + * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_BATCH + * Description + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update a existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_BATCH + * Description + * Delete multiple elements in a map by *key*. + * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may be been + * deleted. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. + * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). + */ enum bpf_cmd { BPF_MAP_CREATE, BPF_MAP_LOOKUP_ELEM, -- cgit v1.2.3 From 7c32e8f8bc33a5f4b113a630857e46634e3e143b Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:13 +0000 Subject: bpf: Add PROG_TEST_RUN support for sk_lookup programs Allow to pass sk_lookup programs to PROG_TEST_RUN. User space provides the full bpf_sk_lookup struct as context. Since the context includes a socket pointer that can't be exposed to user space we define that PROG_TEST_RUN returns the cookie of the selected socket or zero in place of the socket pointer. We don't support testing programs that select a reuseport socket, since this would mean running another (unrelated) BPF program from the sk_lookup test handler. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-3-lmb@cloudflare.com --- include/linux/bpf.h | 10 ++++ include/uapi/linux/bpf.h | 5 +- net/bpf/test_run.c | 105 +++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 1 + tools/include/uapi/linux/bpf.h | 5 +- 5 files changed, 124 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c730863fa77..c931bc97019d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1491,6 +1491,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1692,6 +1695,13 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, return -ENOTSUPP; } +static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + static inline void bpf_map_put(struct bpf_map *map) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63a56ed6a785..7f530e349aff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5953,7 +5953,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index eb3c78cd4d7c..0abdd67f44b1 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,8 +10,10 @@ #include #include #include +#include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -781,3 +783,106 @@ out: kfree(data); return ret; } + +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_prog_array *progs = NULL; + struct bpf_sk_lookup_kern ctx = {}; + u32 repeat = kattr->test.repeat; + struct bpf_sk_lookup *user_ctx; + u32 retval, duration; + int ret = -EINVAL; + + if (prog->type != BPF_PROG_TYPE_SK_LOOKUP) + return -EINVAL; + + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + + if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || + kattr->test.data_size_out) + return -EINVAL; + + if (!repeat) + repeat = 1; + + user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); + if (IS_ERR(user_ctx)) + return PTR_ERR(user_ctx); + + if (!user_ctx) + return -EINVAL; + + if (user_ctx->sk) + goto out; + + if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) + goto out; + + if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) { + ret = -ERANGE; + goto out; + } + + ctx.family = (u16)user_ctx->family; + ctx.protocol = (u16)user_ctx->protocol; + ctx.dport = (u16)user_ctx->local_port; + ctx.sport = (__force __be16)user_ctx->remote_port; + + switch (ctx.family) { + case AF_INET: + ctx.v4.daddr = (__force __be32)user_ctx->local_ip4; + ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4; + break; + +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; + ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; + break; +#endif + + default: + ret = -EAFNOSUPPORT; + goto out; + } + + progs = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!progs) { + ret = -ENOMEM; + goto out; + } + + progs->items[0].prog = prog; + + bpf_test_timer_enter(&t); + do { + ctx.selected_sk = NULL; + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); + + if (ret < 0) + goto out; + + user_ctx->cookie = 0; + if (ctx.selected_sk) { + if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { + ret = -EOPNOTSUPP; + goto out; + } + + user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); + } + + ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); + +out: + bpf_prog_array_free(progs); + kfree(user_ctx); + return ret; +} diff --git a/net/core/filter.c b/net/core/filter.c index 13bcf248ee7b..a526db494c62 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10457,6 +10457,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, } const struct bpf_prog_ops sk_lookup_prog_ops = { + .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 63a56ed6a785..7f530e349aff 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5953,7 +5953,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ -- cgit v1.2.3 From 509b2937bce90089fd2785db9f27951a3d850c34 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:14 +0000 Subject: selftests: bpf: Convert sk_lookup ctx access tests to PROG_TEST_RUN Convert the selftests for sk_lookup narrow context access to use PROG_TEST_RUN instead of creating actual sockets. This ensures that ctx is populated correctly when using PROG_TEST_RUN. Assert concrete values since we now control remote_ip and remote_port. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-4-lmb@cloudflare.com --- tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 83 ++++++++++++++++++---- tools/testing/selftests/bpf/progs/test_sk_lookup.c | 62 ++++++++++------ 2 files changed, 109 insertions(+), 36 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 9ff0412e1fd3..45c82db3c58c 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -241,6 +241,48 @@ fail: return -1; } +static __u64 socket_cookie(int fd) +{ + __u64 cookie; + socklen_t cookie_len = sizeof(cookie); + + if (CHECK(getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len) < 0, + "getsockopt(SO_COOKIE)", "%s\n", strerror(errno))) + return 0; + return cookie; +} + +static int fill_sk_lookup_ctx(struct bpf_sk_lookup *ctx, const char *local_ip, __u16 local_port, + const char *remote_ip, __u16 remote_port) +{ + void *local, *remote; + int err; + + memset(ctx, 0, sizeof(*ctx)); + ctx->local_port = local_port; + ctx->remote_port = htons(remote_port); + + if (is_ipv6(local_ip)) { + ctx->family = AF_INET6; + local = &ctx->local_ip6[0]; + remote = &ctx->remote_ip6[0]; + } else { + ctx->family = AF_INET; + local = &ctx->local_ip4; + remote = &ctx->remote_ip4; + } + + err = inet_pton(ctx->family, local_ip, local); + if (CHECK(err != 1, "inet_pton", "local_ip failed\n")) + return 1; + + err = inet_pton(ctx->family, remote_ip, remote); + if (CHECK(err != 1, "inet_pton", "remote_ip failed\n")) + return 1; + + return 0; +} + static int send_byte(int fd) { ssize_t n; @@ -1009,18 +1051,27 @@ static void test_drop_on_reuseport(struct test_sk_lookup *skel) static void run_sk_assign(struct test_sk_lookup *skel, struct bpf_program *lookup_prog, - const char *listen_ip, const char *connect_ip) + const char *remote_ip, const char *local_ip) { - int client_fd, peer_fd, server_fds[MAX_SERVERS] = { -1 }; - struct bpf_link *lookup_link; + int server_fds[MAX_SERVERS] = { -1 }; + struct bpf_sk_lookup ctx; + __u64 server_cookie; int i, err; - lookup_link = attach_lookup_prog(lookup_prog); - if (!lookup_link) + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, + .ctx_in = &ctx, + .ctx_size_in = sizeof(ctx), + .ctx_out = &ctx, + .ctx_size_out = sizeof(ctx), + ); + + if (fill_sk_lookup_ctx(&ctx, local_ip, EXT_PORT, remote_ip, INT_PORT)) return; + ctx.protocol = IPPROTO_TCP; + for (i = 0; i < ARRAY_SIZE(server_fds); i++) { - server_fds[i] = make_server(SOCK_STREAM, listen_ip, 0, NULL); + server_fds[i] = make_server(SOCK_STREAM, local_ip, 0, NULL); if (server_fds[i] < 0) goto close_servers; @@ -1030,23 +1081,25 @@ static void run_sk_assign(struct test_sk_lookup *skel, goto close_servers; } - client_fd = make_client(SOCK_STREAM, connect_ip, EXT_PORT); - if (client_fd < 0) + server_cookie = socket_cookie(server_fds[SERVER_B]); + if (!server_cookie) + return; + + err = bpf_prog_test_run_opts(bpf_program__fd(lookup_prog), &opts); + if (CHECK(err, "test_run", "failed with error %d\n", errno)) + goto close_servers; + + if (CHECK(ctx.cookie == 0, "ctx.cookie", "no socket selected\n")) goto close_servers; - peer_fd = accept(server_fds[SERVER_B], NULL, NULL); - if (CHECK(peer_fd < 0, "accept", "failed\n")) - goto close_client; + CHECK(ctx.cookie != server_cookie, "ctx.cookie", + "selected sk %llu instead of %llu\n", ctx.cookie, server_cookie); - close(peer_fd); -close_client: - close(client_fd); close_servers: for (i = 0; i < ARRAY_SIZE(server_fds); i++) { if (server_fds[i] != -1) close(server_fds[i]); } - bpf_link__destroy(lookup_link); } static void run_sk_assign_v4(struct test_sk_lookup *skel, diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup.c b/tools/testing/selftests/bpf/progs/test_sk_lookup.c index 1032b292af5b..ac6f7f205e25 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup.c @@ -64,6 +64,10 @@ static const int PROG_DONE = 1; static const __u32 KEY_SERVER_A = SERVER_A; static const __u32 KEY_SERVER_B = SERVER_B; +static const __u16 SRC_PORT = bpf_htons(8008); +static const __u32 SRC_IP4 = IP4(127, 0, 0, 2); +static const __u32 SRC_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000002); + static const __u16 DST_PORT = 7007; /* Host byte order */ static const __u32 DST_IP4 = IP4(127, 0, 0, 1); static const __u32 DST_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000001); @@ -398,11 +402,12 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) if (LSW(ctx->protocol, 0) != IPPROTO_TCP) return SK_DROP; - /* Narrow loads from remote_port field. Expect non-0 value. */ - if (LSB(ctx->remote_port, 0) == 0 && LSB(ctx->remote_port, 1) == 0 && - LSB(ctx->remote_port, 2) == 0 && LSB(ctx->remote_port, 3) == 0) + /* Narrow loads from remote_port field. Expect SRC_PORT. */ + if (LSB(ctx->remote_port, 0) != ((SRC_PORT >> 0) & 0xff) || + LSB(ctx->remote_port, 1) != ((SRC_PORT >> 8) & 0xff) || + LSB(ctx->remote_port, 2) != 0 || LSB(ctx->remote_port, 3) != 0) return SK_DROP; - if (LSW(ctx->remote_port, 0) == 0) + if (LSW(ctx->remote_port, 0) != SRC_PORT) return SK_DROP; /* Narrow loads from local_port field. Expect DST_PORT. */ @@ -415,11 +420,14 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) /* Narrow loads from IPv4 fields */ if (v4) { - /* Expect non-0.0.0.0 in remote_ip4 */ - if (LSB(ctx->remote_ip4, 0) == 0 && LSB(ctx->remote_ip4, 1) == 0 && - LSB(ctx->remote_ip4, 2) == 0 && LSB(ctx->remote_ip4, 3) == 0) + /* Expect SRC_IP4 in remote_ip4 */ + if (LSB(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xff) || + LSB(ctx->remote_ip4, 1) != ((SRC_IP4 >> 8) & 0xff) || + LSB(ctx->remote_ip4, 2) != ((SRC_IP4 >> 16) & 0xff) || + LSB(ctx->remote_ip4, 3) != ((SRC_IP4 >> 24) & 0xff)) return SK_DROP; - if (LSW(ctx->remote_ip4, 0) == 0 && LSW(ctx->remote_ip4, 1) == 0) + if (LSW(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xffff) || + LSW(ctx->remote_ip4, 1) != ((SRC_IP4 >> 16) & 0xffff)) return SK_DROP; /* Expect DST_IP4 in local_ip4 */ @@ -448,20 +456,32 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) /* Narrow loads from IPv6 fields */ if (!v4) { - /* Expect non-:: IP in remote_ip6 */ - if (LSB(ctx->remote_ip6[0], 0) == 0 && LSB(ctx->remote_ip6[0], 1) == 0 && - LSB(ctx->remote_ip6[0], 2) == 0 && LSB(ctx->remote_ip6[0], 3) == 0 && - LSB(ctx->remote_ip6[1], 0) == 0 && LSB(ctx->remote_ip6[1], 1) == 0 && - LSB(ctx->remote_ip6[1], 2) == 0 && LSB(ctx->remote_ip6[1], 3) == 0 && - LSB(ctx->remote_ip6[2], 0) == 0 && LSB(ctx->remote_ip6[2], 1) == 0 && - LSB(ctx->remote_ip6[2], 2) == 0 && LSB(ctx->remote_ip6[2], 3) == 0 && - LSB(ctx->remote_ip6[3], 0) == 0 && LSB(ctx->remote_ip6[3], 1) == 0 && - LSB(ctx->remote_ip6[3], 2) == 0 && LSB(ctx->remote_ip6[3], 3) == 0) + /* Expect SRC_IP6 in remote_ip6 */ + if (LSB(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xff) || + LSB(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 8) & 0xff) || + LSB(ctx->remote_ip6[0], 2) != ((SRC_IP6[0] >> 16) & 0xff) || + LSB(ctx->remote_ip6[0], 3) != ((SRC_IP6[0] >> 24) & 0xff) || + LSB(ctx->remote_ip6[1], 0) != ((SRC_IP6[1] >> 0) & 0xff) || + LSB(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 8) & 0xff) || + LSB(ctx->remote_ip6[1], 2) != ((SRC_IP6[1] >> 16) & 0xff) || + LSB(ctx->remote_ip6[1], 3) != ((SRC_IP6[1] >> 24) & 0xff) || + LSB(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xff) || + LSB(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 8) & 0xff) || + LSB(ctx->remote_ip6[2], 2) != ((SRC_IP6[2] >> 16) & 0xff) || + LSB(ctx->remote_ip6[2], 3) != ((SRC_IP6[2] >> 24) & 0xff) || + LSB(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xff) || + LSB(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 8) & 0xff) || + LSB(ctx->remote_ip6[3], 2) != ((SRC_IP6[3] >> 16) & 0xff) || + LSB(ctx->remote_ip6[3], 3) != ((SRC_IP6[3] >> 24) & 0xff)) return SK_DROP; - if (LSW(ctx->remote_ip6[0], 0) == 0 && LSW(ctx->remote_ip6[0], 1) == 0 && - LSW(ctx->remote_ip6[1], 0) == 0 && LSW(ctx->remote_ip6[1], 1) == 0 && - LSW(ctx->remote_ip6[2], 0) == 0 && LSW(ctx->remote_ip6[2], 1) == 0 && - LSW(ctx->remote_ip6[3], 0) == 0 && LSW(ctx->remote_ip6[3], 1) == 0) + if (LSW(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 16) & 0xffff) || + LSW(ctx->remote_ip6[1], 0) != ((SRC_IP6[1] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 16) & 0xffff) || + LSW(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 16) & 0xffff) || + LSW(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 16) & 0xffff)) return SK_DROP; /* Expect DST_IP6 in local_ip6 */ if (LSB(ctx->local_ip6[0], 0) != ((DST_IP6[0] >> 0) & 0xff) || -- cgit v1.2.3 From abab306ff04b570e817be3f026068fe9a5d99fbb Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:15 +0000 Subject: selftests: bpf: Check that PROG_TEST_RUN repeats as requested Extend a simple prog_run test to check that PROG_TEST_RUN adheres to the requested repetitions. Convert it to use BPF skeleton. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-5-lmb@cloudflare.com --- .../selftests/bpf/prog_tests/prog_run_xattr.c | 51 ++++++++++++++++++---- 1 file changed, 42 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c index 935a294f049a..131d7f7eeb42 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c @@ -2,12 +2,31 @@ #include #include -void test_prog_run_xattr(void) +#include "test_pkt_access.skel.h" + +static const __u32 duration; + +static void check_run_cnt(int prog_fd, __u64 run_cnt) { - const char *file = "./test_pkt_access.o"; - struct bpf_object *obj; - char buf[10]; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); int err; + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (CHECK(err, "get_prog_info", "failed to get bpf_prog_info for fd %d\n", prog_fd)) + return; + + CHECK(run_cnt != info.run_cnt, "run_cnt", + "incorrect number of repetitions, want %llu have %llu\n", run_cnt, info.run_cnt); +} + +void test_prog_run_xattr(void) +{ + struct test_pkt_access *skel; + int err, stats_fd = -1; + char buf[10] = {}; + __u64 run_cnt = 0; + struct bpf_prog_test_run_attr tattr = { .repeat = 1, .data_in = &pkt_v4, @@ -16,12 +35,15 @@ void test_prog_run_xattr(void) .data_size_out = 5, }; - err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, - &tattr.prog_fd); - if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME); + if (CHECK_ATTR(stats_fd < 0, "enable_stats", "failed %d\n", errno)) return; - memset(buf, 0, sizeof(buf)); + skel = test_pkt_access__open_and_load(); + if (CHECK_ATTR(!skel, "open_and_load", "failed\n")) + goto cleanup; + + tattr.prog_fd = bpf_program__fd(skel->progs.test_pkt_access); err = bpf_prog_test_run_xattr(&tattr); CHECK_ATTR(err != -1 || errno != ENOSPC || tattr.retval, "run", @@ -34,8 +56,12 @@ void test_prog_run_xattr(void) CHECK_ATTR(buf[5] != 0, "overflow", "BPF_PROG_TEST_RUN ignored size hint\n"); + run_cnt += tattr.repeat; + check_run_cnt(tattr.prog_fd, run_cnt); + tattr.data_out = NULL; tattr.data_size_out = 0; + tattr.repeat = 2; errno = 0; err = bpf_prog_test_run_xattr(&tattr); @@ -46,5 +72,12 @@ void test_prog_run_xattr(void) err = bpf_prog_test_run_xattr(&tattr); CHECK_ATTR(err != -EINVAL, "run_wrong_size_out", "err %d\n", err); - bpf_object__close(obj); + run_cnt += tattr.repeat; + check_run_cnt(tattr.prog_fd, run_cnt); + +cleanup: + if (skel) + test_pkt_access__destroy(skel); + if (stats_fd != -1) + close(stats_fd); } -- cgit v1.2.3 From b4f894633fa14d7d46ba7676f950b90a401504bb Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:16 +0000 Subject: selftests: bpf: Don't run sk_lookup in verifier tests sk_lookup doesn't allow setting data_in for bpf_prog_run. This doesn't play well with the verifier tests, since they always set a 64 byte input buffer. Allow not running verifier tests by setting bpf_test.runs to a negative value and don't run the ctx access case for sk_lookup. We have dedicated ctx access tests so skipping here doesn't reduce coverage. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-6-lmb@cloudflare.com --- tools/testing/selftests/bpf/test_verifier.c | 4 ++-- tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 58b5a349d3ba..1512092e1e68 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -105,7 +105,7 @@ struct bpf_test { enum bpf_prog_type prog_type; uint8_t flags; void (*fill_helper)(struct bpf_test *self); - uint8_t runs; + int runs; #define bpf_testdata_struct_t \ struct { \ uint32_t retval, retval_unpriv; \ @@ -1165,7 +1165,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, run_errs = 0; run_successes = 0; - if (!alignment_prevented_execution && fd_prog >= 0) { + if (!alignment_prevented_execution && fd_prog >= 0 && test->runs >= 0) { uint32_t expected_val; int i; diff --git a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c index fb13ca2d5606..d78627be060f 100644 --- a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c +++ b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c @@ -239,6 +239,7 @@ .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .runs = -1, }, /* invalid 8-byte reads from a 4-byte fields in bpf_sk_lookup */ { -- cgit v1.2.3 From 46ac034f769fcd50d3d554041a3879a0cdf2ee57 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 3 Mar 2021 15:20:35 +0800 Subject: bpf: Simplify the calculation of variables Fix the following coccicheck warnings: ./tools/bpf/bpf_dbg.c:1201:55-57: WARNING !A || A && B is equivalent to !A || B. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/1614756035-111280-1-git-send-email-jiapeng.chong@linux.alibaba.com --- tools/bpf/bpf_dbg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpf_dbg.c b/tools/bpf/bpf_dbg.c index a07dfc479270..00e560a17baf 100644 --- a/tools/bpf/bpf_dbg.c +++ b/tools/bpf/bpf_dbg.c @@ -1198,7 +1198,7 @@ static int cmd_run(char *num) else return CMD_OK; bpf_reset(); - } while (pcap_next_pkt() && (!has_limit || (has_limit && ++i < pkts))); + } while (pcap_next_pkt() && (!has_limit || (++i < pkts))); rl_printf("bpf passes:%u fails:%u\n", pass, fail); -- cgit v1.2.3 From bce8623135fbe54bd86797df72cb85bfe4118b6e Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 3 Mar 2021 15:52:10 +0800 Subject: selftests/bpf: Simplify the calculation of variables Fix the following coccicheck warnings: ./tools/testing/selftests/bpf/test_sockmap.c:735:35-37: WARNING !A || A && B is equivalent to !A || B. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/1614757930-17197-1-git-send-email-jiapeng.chong@linux.alibaba.com --- tools/testing/selftests/bpf/test_sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 427ca00a3217..eefd445b96fc 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -732,7 +732,7 @@ static int sendmsg_test(struct sockmap_options *opt) * socket is not a valid test. So in this case lets not * enable kTLS but still run the test. */ - if (!txmsg_redir || (txmsg_redir && txmsg_ingress)) { + if (!txmsg_redir || txmsg_ingress) { err = sockmap_init_ktls(opt->verbose, rx_fd); if (err) return err; -- cgit v1.2.3 From d01b59c9ae94560fbcceaafeef39784d72765033 Mon Sep 17 00:00:00 2001 From: Xuesen Huang Date: Thu, 4 Mar 2021 14:40:46 +0800 Subject: bpf: Add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets encapsulation. But that is not appropriate when pushing Ethernet header. Add an option to further specify encap L2 type and set the inner_protocol as ETH_P_TEB. Suggested-by: Willem de Bruijn Signed-off-by: Xuesen Huang Signed-off-by: Zhiyong Cheng Signed-off-by: Li Wang Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210304064046.6232-1-hxseverything@gmail.com --- include/uapi/linux/bpf.h | 5 +++++ net/core/filter.c | 11 ++++++++++- tools/include/uapi/linux/bpf.h | 5 +++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7f530e349aff..2d3036e292a9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2484,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -4916,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index a526db494c62..588b19ba0da8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3409,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK)) @@ -3445,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && + inner_mac_len < ETH_HLEN) + return -EINVAL; + if (skb->encapsulation) return -EALREADY; @@ -3463,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; - skb_set_inner_protocol(skb, skb->protocol); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + else + skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7f530e349aff..2d3036e292a9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2484,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -4916,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { -- cgit v1.2.3 From 256becd450172eec74566f1aa7819ce80181d7e1 Mon Sep 17 00:00:00 2001 From: Xuesen Huang Date: Fri, 5 Mar 2021 20:33:47 +0800 Subject: selftests, bpf: Extend test_tc_tunnel test with vxlan Add BPF_F_ADJ_ROOM_ENCAP_L2_ETH flag to the existing tests which encapsulates the ethernet as the inner l2 header. Update a vxlan encapsulation test case. Signed-off-by: Xuesen Huang Signed-off-by: Li Wang Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210305123347.15311-1-hxseverything@gmail.com --- tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 113 ++++++++++++++++++--- tools/testing/selftests/bpf/test_tc_tunnel.sh | 15 ++- 2 files changed, 111 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 37bce7a7c394..84cd63259554 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -24,14 +24,29 @@ static const int cfg_port = 8000; static const int cfg_udp_src = 20000; +#define L2_PAD_SZ (sizeof(struct vxlanhdr) + ETH_HLEN) + #define UDP_PORT 5555 #define MPLS_OVER_UDP_PORT 6635 #define ETH_OVER_UDP_PORT 7777 +#define VXLAN_UDP_PORT 8472 + +#define EXTPROTO_VXLAN 0x1 + +#define VXLAN_N_VID (1u << 24) +#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8) +#define VXLAN_FLAGS 0x8 +#define VXLAN_VNI 1 /* MPLS label 1000 with S bit (last label) set and ttl of 255. */ static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | MPLS_LS_S_MASK | 0xff); +struct vxlanhdr { + __be32 vx_flags; + __be32 vx_vni; +} __attribute__((packed)); + struct gre_hdr { __be16 flags; __be16 protocol; @@ -45,13 +60,13 @@ union l4hdr { struct v4hdr { struct iphdr ip; union l4hdr l4hdr; - __u8 pad[16]; /* enough space for L2 header */ + __u8 pad[L2_PAD_SZ]; /* space for L2 header / vxlan header ... */ } __attribute__((packed)); struct v6hdr { struct ipv6hdr ip; union l4hdr l4hdr; - __u8 pad[16]; /* enough space for L2 header */ + __u8 pad[L2_PAD_SZ]; /* space for L2 header / vxlan header ... */ } __attribute__((packed)); static __always_inline void set_ipv4_csum(struct iphdr *iph) @@ -69,14 +84,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) iph->check = ~((csum & 0xffff) + (csum >> 16)); } -static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, - __u16 l2_proto) +static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto, __u16 ext_proto) { __u16 udp_dst = UDP_PORT; struct iphdr iph_inner; struct v4hdr h_outer; struct tcphdr tcph; int olen, l2_len; + __u8 *l2_hdr = NULL; int tcp_off; __u64 flags; @@ -141,7 +157,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, break; case ETH_P_TEB: l2_len = ETH_HLEN; - udp_dst = ETH_OVER_UDP_PORT; + if (ext_proto & EXTPROTO_VXLAN) { + udp_dst = VXLAN_UDP_PORT; + l2_len += sizeof(struct vxlanhdr); + } else + udp_dst = ETH_OVER_UDP_PORT; break; } flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); @@ -171,14 +191,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, } /* add L2 encap (if specified) */ + l2_hdr = (__u8 *)&h_outer + olen; switch (l2_proto) { case ETH_P_MPLS_UC: - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; + *(__u32 *)l2_hdr = mpls_label; break; case ETH_P_TEB: - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, - ETH_HLEN)) + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; + + if (ext_proto & EXTPROTO_VXLAN) { + struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + + vxlan_hdr->vx_flags = VXLAN_FLAGS; + vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); + + l2_hdr += sizeof(struct vxlanhdr); + } + + if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) return TC_ACT_SHOT; + break; } olen += l2_len; @@ -214,14 +246,21 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; } -static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, +static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, __u16 l2_proto) +{ + return __encap_ipv4(skb, encap_proto, l2_proto, 0); +} + +static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto, __u16 ext_proto) { __u16 udp_dst = UDP_PORT; struct ipv6hdr iph_inner; struct v6hdr h_outer; struct tcphdr tcph; int olen, l2_len; + __u8 *l2_hdr = NULL; __u16 tot_len; __u64 flags; @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, break; case ETH_P_TEB: l2_len = ETH_HLEN; - udp_dst = ETH_OVER_UDP_PORT; + if (ext_proto & EXTPROTO_VXLAN) { + udp_dst = VXLAN_UDP_PORT; + l2_len += sizeof(struct vxlanhdr); + } else + udp_dst = ETH_OVER_UDP_PORT; break; } flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src); h_outer.l4hdr.udp.dest = bpf_htons(udp_dst); tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) + - sizeof(h_outer.l4hdr.udp); + sizeof(h_outer.l4hdr.udp) + l2_len; h_outer.l4hdr.udp.check = 0; h_outer.l4hdr.udp.len = bpf_htons(tot_len); break; @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, } /* add L2 encap (if specified) */ + l2_hdr = (__u8 *)&h_outer + olen; switch (l2_proto) { case ETH_P_MPLS_UC: - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; + *(__u32 *)l2_hdr = mpls_label; break; case ETH_P_TEB: - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, - ETH_HLEN)) + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; + + if (ext_proto & EXTPROTO_VXLAN) { + struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + + vxlan_hdr->vx_flags = VXLAN_FLAGS; + vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); + + l2_hdr += sizeof(struct vxlanhdr); + } + + if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) return TC_ACT_SHOT; break; } @@ -309,6 +363,12 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; } +static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto) +{ + return __encap_ipv6(skb, encap_proto, l2_proto, 0); +} + SEC("encap_ipip_none") int __encap_ipip_none(struct __sk_buff *skb) { @@ -372,6 +432,17 @@ int __encap_udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("encap_vxlan_eth") +int __encap_vxlan_eth(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return __encap_ipv4(skb, IPPROTO_UDP, + ETH_P_TEB, + EXTPROTO_VXLAN); + else + return TC_ACT_OK; +} + SEC("encap_sit_none") int __encap_sit_none(struct __sk_buff *skb) { @@ -444,6 +515,17 @@ int __encap_ip6udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("encap_ip6vxlan_eth") +int __encap_ip6vxlan_eth(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return __encap_ipv6(skb, IPPROTO_UDP, + ETH_P_TEB, + EXTPROTO_VXLAN); + else + return TC_ACT_OK; +} + static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) { char buf[sizeof(struct v6hdr)]; @@ -479,6 +561,9 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) case ETH_OVER_UDP_PORT: olen += ETH_HLEN; break; + case VXLAN_UDP_PORT: + olen += ETH_HLEN + sizeof(struct vxlanhdr); + break; } break; default: diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 7c76b841b17b..c9dde9b9d987 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -44,8 +44,8 @@ setup() { # clamp route to reserve room for tunnel headers ip -netns "${ns1}" -4 route flush table main ip -netns "${ns1}" -6 route flush table main - ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1 - ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1 + ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1 + ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1 sleep 1 @@ -105,6 +105,12 @@ if [[ "$#" -eq "0" ]]; then echo "sit" $0 ipv6 sit none 100 + echo "ip4 vxlan" + $0 ipv4 vxlan eth 2000 + + echo "ip6 vxlan" + $0 ipv6 ip6vxlan eth 2000 + for mac in none mpls eth ; do echo "ip gre $mac" $0 ipv4 gre $mac 100 @@ -214,6 +220,9 @@ if [[ "$tuntype" =~ "udp" ]]; then targs="encap fou encap-sport auto encap-dport $dport" elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then ttype=$gretaptype +elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then + ttype="vxlan" + targs="id 1 dstport 8472 udp6zerocsumrx" else ttype=$tuntype targs="" @@ -242,7 +251,7 @@ if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then # No support for TEB fou tunnel; expect failure. expect_tun_fail=1 -elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then +elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then # Share ethernet address between tunnel/veth2 so L2 decap works. ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \ awk '/ether/ { print $2 }') -- cgit v1.2.3 From d54dba41999498b38a40940e1123019d50b26496 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Feb 2021 13:03:28 +0100 Subject: objtool: Allow UNWIND_HINT to suppress dodgy stack modifications rewind_stack_do_exit() UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp movq PER_CPU_VAR(cpu_current_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS call do_exit Does unspeakable things to the stack, which objtool currently fails to detect due to a limitation in instruction decoding. This will be rectified after which the above will result in: arch/x86/entry/entry_64.o: warning: objtool: .text+0xab: unsupported stack register modification Allow the UNWIND_HINT on the next instruction to suppress this, it will overwrite the state anyway. Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173626.918498579@infradead.org --- tools/objtool/check.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 068cdb41f76f..12b8f0f01176 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1959,8 +1959,9 @@ static void restore_reg(struct cfi_state *cfi, unsigned char reg) * 41 5d pop %r13 * c3 retq */ -static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, - struct stack_op *op) +static int update_cfi_state(struct instruction *insn, + struct instruction *next_insn, + struct cfi_state *cfi, struct stack_op *op) { struct cfi_reg *cfa = &cfi->cfa; struct cfi_reg *regs = cfi->regs; @@ -2161,7 +2162,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, break; } - if (op->dest.reg == cfi->cfa.base) { + if (op->dest.reg == cfi->cfa.base && !(next_insn && next_insn->hint)) { WARN_FUNC("unsupported stack register modification", insn->sec, insn->offset); return -1; @@ -2433,13 +2434,15 @@ static int propagate_alt_cfi(struct objtool_file *file, struct instruction *insn return 0; } -static int handle_insn_ops(struct instruction *insn, struct insn_state *state) +static int handle_insn_ops(struct instruction *insn, + struct instruction *next_insn, + struct insn_state *state) { struct stack_op *op; list_for_each_entry(op, &insn->stack_ops, list) { - if (update_cfi_state(insn, &state->cfi, op)) + if (update_cfi_state(insn, next_insn, &state->cfi, op)) return 1; if (op->dest.type == OP_DEST_PUSHF) { @@ -2719,7 +2722,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 0; } - if (handle_insn_ops(insn, &state)) + if (handle_insn_ops(insn, next_insn, &state)) return 1; switch (insn->type) { -- cgit v1.2.3 From d473b18b2ef62563fb874f9cae6e123f99129e3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Feb 2021 20:18:21 +0100 Subject: objtool,x86: Renumber CFI_reg Make them match the instruction encoding numbering. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173627.033720313@infradead.org --- tools/objtool/arch/x86/include/arch/cfi_regs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch/x86/include/arch/cfi_regs.h b/tools/objtool/arch/x86/include/arch/cfi_regs.h index 79bc517efba8..0579d22c433c 100644 --- a/tools/objtool/arch/x86/include/arch/cfi_regs.h +++ b/tools/objtool/arch/x86/include/arch/cfi_regs.h @@ -4,13 +4,13 @@ #define _OBJTOOL_CFI_REGS_H #define CFI_AX 0 -#define CFI_DX 1 -#define CFI_CX 2 +#define CFI_CX 1 +#define CFI_DX 2 #define CFI_BX 3 -#define CFI_SI 4 -#define CFI_DI 5 -#define CFI_BP 6 -#define CFI_SP 7 +#define CFI_SP 4 +#define CFI_BP 5 +#define CFI_SI 6 +#define CFI_DI 7 #define CFI_R8 8 #define CFI_R9 9 #define CFI_R10 10 -- cgit v1.2.3 From 2ee0c363492f1acc1082125218e6a80c0d7d502b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Feb 2021 21:29:16 +0100 Subject: objtool,x86: Rewrite LEA decode Current LEA decoding is a bunch of special cases, properly decode the instruction, with exception of full SIB and RIP-relative modes. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173627.143250641@infradead.org --- tools/objtool/arch/x86/decode.c | 86 ++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 58 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 549813cff8ab..d8f01387d671 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -91,9 +91,10 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, { struct insn insn; int x86_64, sign; - unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, - rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, - modrm_reg = 0, sib = 0; + unsigned char op1, op2, + rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, + modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, + sib = 0; struct stack_op *op = NULL; struct symbol *sym; @@ -328,68 +329,37 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x8d: - if (sib == 0x24 && rex_w && !rex_b && !rex_x) { - - ADD_OP(op) { - if (!insn.displacement.value) { - /* lea (%rsp), reg */ - op->src.type = OP_SRC_REG; - } else { - /* lea disp(%rsp), reg */ - op->src.type = OP_SRC_ADD; - op->src.offset = insn.displacement.value; - } - op->src.reg = CFI_SP; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; - } - - } else if (rex == 0x48 && modrm == 0x65) { - - /* lea disp(%rbp), %rsp */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_BP; - op->src.offset = insn.displacement.value; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; - } + if (modrm_mod == 3) { + WARN("invalid LEA encoding at %s:0x%lx", sec->name, offset); + break; + } - } else if (rex == 0x49 && modrm == 0x62 && - insn.displacement.value == -8) { + /* skip non 64bit ops */ + if (!rex_w) + break; - /* - * lea -0x8(%r10), %rsp - * - * Restoring rsp back to its original value after a - * stack realignment. - */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_R10; - op->src.offset = -8; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; - } + /* skip nontrivial SIB */ + if (modrm_rm == 4 && !(sib == 0x24 && rex_b == rex_x)) + break; - } else if (rex == 0x49 && modrm == 0x65 && - insn.displacement.value == -16) { + /* skip RIP relative displacement */ + if (modrm_rm == 5 && modrm_mod == 0) + break; - /* - * lea -0x10(%r13), %rsp - * - * Restoring rsp back to its original value after a - * stack realignment. - */ - ADD_OP(op) { + /* lea disp(%src), %dst */ + ADD_OP(op) { + op->src.offset = insn.displacement.value; + if (!op->src.offset) { + /* lea (%src), %dst */ + op->src.type = OP_SRC_REG; + } else { + /* lea disp(%src), %dst */ op->src.type = OP_SRC_ADD; - op->src.reg = CFI_R13; - op->src.offset = -16; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; } + op->src.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->dest.type = OP_DEST_REG; + op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; } - break; case 0x8f: -- cgit v1.2.3 From ffc7e74f36a2c7424da262a32a0bbe59669677ef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Feb 2021 21:41:13 +0100 Subject: objtool,x86: Rewrite LEAVE Since we can now have multiple stack-ops per instruction, we don't need to special case LEAVE and can simply emit the composite operations. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173627.253273977@infradead.org --- tools/objtool/arch/x86/decode.c | 14 +++++++++++--- tools/objtool/check.c | 24 ++---------------------- tools/objtool/include/objtool/arch.h | 1 - 3 files changed, 13 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index d8f01387d671..47b9acfc6a4c 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -446,9 +446,17 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, * mov bp, sp * pop bp */ - ADD_OP(op) - op->dest.type = OP_DEST_LEAVE; - + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = CFI_BP; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } + ADD_OP(op) { + op->src.type = OP_SRC_POP; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_BP; + } break; case 0xe3: diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 12b8f0f01176..a0f762a15ad5 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2020,7 +2020,7 @@ static int update_cfi_state(struct instruction *insn, } else if (op->src.reg == CFI_BP && op->dest.reg == CFI_SP && - cfa->base == CFI_BP) { + (cfa->base == CFI_BP || cfa->base == cfi->drap_reg)) { /* * mov %rbp, %rsp @@ -2217,7 +2217,7 @@ static int update_cfi_state(struct instruction *insn, cfa->offset = 0; cfi->drap_offset = -1; - } else if (regs[op->dest.reg].offset == -cfi->stack_size) { + } else if (cfi->stack_size == -regs[op->dest.reg].offset) { /* pop %reg */ restore_reg(cfi, op->dest.reg); @@ -2358,26 +2358,6 @@ static int update_cfi_state(struct instruction *insn, break; - case OP_DEST_LEAVE: - if ((!cfi->drap && cfa->base != CFI_BP) || - (cfi->drap && cfa->base != cfi->drap_reg)) { - WARN_FUNC("leave instruction with modified stack frame", - insn->sec, insn->offset); - return -1; - } - - /* leave (mov %rbp, %rsp; pop %rbp) */ - - cfi->stack_size = -cfi->regs[CFI_BP].offset - 8; - restore_reg(cfi, CFI_BP); - - if (!cfi->drap) { - cfa->base = CFI_SP; - cfa->offset -= 8; - } - - break; - case OP_DEST_MEM: if (op->src.type != OP_SRC_POP && op->src.type != OP_SRC_POPF) { WARN_FUNC("unknown stack-related memory operation", diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 6ff0685f5cc5..ff21f387712d 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -35,7 +35,6 @@ enum op_dest_type { OP_DEST_MEM, OP_DEST_PUSH, OP_DEST_PUSHF, - OP_DEST_LEAVE, }; struct op_dest { -- cgit v1.2.3 From 16ef7f159c503c7befec7018ee0e82fdc311721e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Feb 2021 19:59:43 +0100 Subject: objtool,x86: Simplify register decode Since the CFI_reg number now matches the instruction encoding order do away with the op_to_cfi_reg[] and use direct assignment. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173627.362004522@infradead.org --- tools/objtool/arch/x86/decode.c | 79 ++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 47b9acfc6a4c..5ce7dc4d8a0a 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -17,17 +17,6 @@ #include #include -static unsigned char op_to_cfi_reg[][2] = { - {CFI_AX, CFI_R8}, - {CFI_CX, CFI_R9}, - {CFI_DX, CFI_R10}, - {CFI_BX, CFI_R11}, - {CFI_SP, CFI_R12}, - {CFI_BP, CFI_R13}, - {CFI_SI, CFI_R14}, - {CFI_DI, CFI_R15}, -}; - static int is_x86_64(const struct elf *elf) { switch (elf->ehdr.e_machine) { @@ -94,7 +83,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, - sib = 0; + sib = 0 /* , sib_scale = 0, sib_index = 0, sib_base = 0 */; struct stack_op *op = NULL; struct symbol *sym; @@ -130,23 +119,29 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (insn.modrm.nbytes) { modrm = insn.modrm.bytes[0]; modrm_mod = X86_MODRM_MOD(modrm); - modrm_reg = X86_MODRM_REG(modrm); - modrm_rm = X86_MODRM_RM(modrm); + modrm_reg = X86_MODRM_REG(modrm) + 8*rex_r; + modrm_rm = X86_MODRM_RM(modrm) + 8*rex_b; } - if (insn.sib.nbytes) + if (insn.sib.nbytes) { sib = insn.sib.bytes[0]; + /* + sib_scale = X86_SIB_SCALE(sib); + sib_index = X86_SIB_INDEX(sib) + 8*rex_x; + sib_base = X86_SIB_BASE(sib) + 8*rex_b; + */ + } switch (op1) { case 0x1: case 0x29: - if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) { + if (rex_w && modrm_mod == 3 && modrm_rm == CFI_SP) { /* add/sub reg, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_ADD; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG; op->dest.reg = CFI_SP; } @@ -158,7 +153,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* push reg */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; + op->src.reg = (op1 & 0x7) + 8*rex_b; op->dest.type = OP_DEST_PUSH; } @@ -170,7 +165,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, ADD_OP(op) { op->src.type = OP_SRC_POP; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[op1 & 0x7][rex_b]; + op->dest.reg = (op1 & 0x7) + 8*rex_b; } break; @@ -223,7 +218,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x89: - if (rex_w && !rex_r && modrm_reg == 4) { + if (rex_w && modrm_reg == CFI_SP) { if (modrm_mod == 3) { /* mov %rsp, reg */ @@ -231,17 +226,17 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.type = OP_SRC_REG; op->src.reg = CFI_SP; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->dest.reg = modrm_rm; } break; } else { /* skip nontrivial SIB */ - if (modrm_rm == 4 && !(sib == 0x24 && rex_b == rex_x)) + if ((modrm_rm & 7) == 4 && !(sib == 0x24 && rex_b == rex_x)) break; /* skip RIP relative displacement */ - if (modrm_rm == 5 && modrm_mod == 0) + if ((modrm_rm & 7) == 5 && modrm_mod == 0) break; /* mov %rsp, disp(%reg) */ @@ -249,7 +244,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.type = OP_SRC_REG; op->src.reg = CFI_SP; op->dest.type = OP_DEST_REG_INDIRECT; - op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->dest.reg = modrm_rm; op->dest.offset = insn.displacement.value; } break; @@ -258,12 +253,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) { + if (rex_w && modrm_mod == 3 && modrm_rm == CFI_SP) { /* mov reg, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG; op->dest.reg = CFI_SP; } @@ -272,13 +267,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* fallthrough */ case 0x88: - if (!rex_b && - (modrm_mod == 1 || modrm_mod == 2) && modrm_rm == 5) { + if ((modrm_mod == 1 || modrm_mod == 2) && modrm_rm == CFI_BP) { /* mov reg, disp(%rbp) */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG_INDIRECT; op->dest.reg = CFI_BP; op->dest.offset = insn.displacement.value; @@ -286,12 +280,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) { + if (rex_w && modrm_rm == CFI_SP && sib == 0x24) { /* mov reg, disp(%rsp) */ ADD_OP(op) { op->src.type = OP_SRC_REG; - op->src.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->src.reg = modrm_reg; op->dest.type = OP_DEST_REG_INDIRECT; op->dest.reg = CFI_SP; op->dest.offset = insn.displacement.value; @@ -302,7 +296,10 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x8b: - if (rex_w && !rex_b && modrm_mod == 1 && modrm_rm == 5) { + if (!rex_w) + break; + + if (modrm_mod == 1 && modrm_rm == CFI_BP) { /* mov disp(%rbp), reg */ ADD_OP(op) { @@ -310,11 +307,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.reg = CFI_BP; op->src.offset = insn.displacement.value; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.reg = modrm_reg; } + break; + } - } else if (rex_w && !rex_b && sib == 0x24 && - modrm_mod != 3 && modrm_rm == 4) { + if (modrm_mod != 3 && modrm_rm == CFI_SP && sib == 0x24) { /* mov disp(%rsp), reg */ ADD_OP(op) { @@ -322,8 +320,9 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->src.reg = CFI_SP; op->src.offset = insn.displacement.value; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.reg = modrm_reg; } + break; } break; @@ -339,11 +338,11 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; /* skip nontrivial SIB */ - if (modrm_rm == 4 && !(sib == 0x24 && rex_b == rex_x)) + if ((modrm_rm & 7) == 4 && !(sib == 0x24 && rex_b == rex_x)) break; /* skip RIP relative displacement */ - if (modrm_rm == 5 && modrm_mod == 0) + if ((modrm_rm & 7) == 5 && modrm_mod == 0) break; /* lea disp(%src), %dst */ @@ -356,9 +355,9 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* lea disp(%src), %dst */ op->src.type = OP_SRC_ADD; } - op->src.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->src.reg = modrm_rm; op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; + op->dest.reg = modrm_reg; } break; -- cgit v1.2.3 From 78df6245c3c82484200b9f8e306dc86fb19e9c02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Feb 2021 11:47:35 +0100 Subject: objtool,x86: Support %riz encodings When there's a SIB byte, the register otherwise denoted by r/m will then be denoted by SIB.base REX.b will now extend this. SIB.index == SP is magic and notes an index value zero. This means that there's a bunch of alternative (longer) encodings for the same thing. Eg. 'ModRM.mod != 3, ModRM.r/m = AX' can be encoded as 'ModRM.mod != 3, ModRM.r/m = SP, SIB.base = AX, SIB.index = SP' which is actually 4 different encodings because the value of SIB.scale is irrelevant, giving rise to 5 different but equal encodings. Support these encodings and clean up the SIB handling in general. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173627.472967498@infradead.org --- tools/objtool/arch/x86/decode.c | 67 +++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 5ce7dc4d8a0a..78ae5be2fb70 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -72,6 +72,25 @@ unsigned long arch_jump_destination(struct instruction *insn) return -1; \ else for (list_add_tail(&op->list, ops_list); op; op = NULL) +/* + * Helpers to decode ModRM/SIB: + * + * r/m| AX CX DX BX | SP | BP | SI DI | + * | R8 R9 R10 R11 | R12 | R13 | R14 R15 | + * Mod+----------------+-----+-----+---------+ + * 00 | [r/m] |[SIB]|[IP+]| [r/m] | + * 01 | [r/m + d8] |[S+d]| [r/m + d8] | + * 10 | [r/m + d32] |[S+D]| [r/m + d32] | + * 11 | r/ m | + * + */ +#define is_RIP() ((modrm_rm & 7) == CFI_BP && modrm_mod == 0) +#define have_SIB() ((modrm_rm & 7) == CFI_SP && modrm_mod != 3) + +#define rm_is(reg) (have_SIB() ? \ + sib_base == (reg) && sib_index == CFI_SP : \ + modrm_rm == (reg)) + int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, @@ -83,7 +102,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, - sib = 0 /* , sib_scale = 0, sib_index = 0, sib_base = 0 */; + sib = 0, /* sib_scale = 0, */ sib_index = 0, sib_base = 0; struct stack_op *op = NULL; struct symbol *sym; @@ -125,11 +144,9 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (insn.sib.nbytes) { sib = insn.sib.bytes[0]; - /* - sib_scale = X86_SIB_SCALE(sib); + /* sib_scale = X86_SIB_SCALE(sib); */ sib_index = X86_SIB_INDEX(sib) + 8*rex_x; sib_base = X86_SIB_BASE(sib) + 8*rex_b; - */ } switch (op1) { @@ -218,7 +235,10 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x89: - if (rex_w && modrm_reg == CFI_SP) { + if (!rex_w) + break; + + if (modrm_reg == CFI_SP) { if (modrm_mod == 3) { /* mov %rsp, reg */ @@ -231,14 +251,17 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } else { - /* skip nontrivial SIB */ - if ((modrm_rm & 7) == 4 && !(sib == 0x24 && rex_b == rex_x)) - break; - /* skip RIP relative displacement */ - if ((modrm_rm & 7) == 5 && modrm_mod == 0) + if (is_RIP()) break; + /* skip nontrivial SIB */ + if (have_SIB()) { + modrm_rm = sib_base; + if (sib_index != CFI_SP) + break; + } + /* mov %rsp, disp(%reg) */ ADD_OP(op) { op->src.type = OP_SRC_REG; @@ -253,7 +276,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (rex_w && modrm_mod == 3 && modrm_rm == CFI_SP) { + if (modrm_mod == 3 && modrm_rm == CFI_SP) { /* mov reg, %rsp */ ADD_OP(op) { @@ -267,6 +290,9 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, /* fallthrough */ case 0x88: + if (!rex_w) + break; + if ((modrm_mod == 1 || modrm_mod == 2) && modrm_rm == CFI_BP) { /* mov reg, disp(%rbp) */ @@ -280,7 +306,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (rex_w && modrm_rm == CFI_SP && sib == 0x24) { + if (modrm_mod != 3 && rm_is(CFI_SP)) { /* mov reg, disp(%rsp) */ ADD_OP(op) { @@ -299,7 +325,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (!rex_w) break; - if (modrm_mod == 1 && modrm_rm == CFI_BP) { + if ((modrm_mod == 1 || modrm_mod == 2) && modrm_rm == CFI_BP) { /* mov disp(%rbp), reg */ ADD_OP(op) { @@ -312,7 +338,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (modrm_mod != 3 && modrm_rm == CFI_SP && sib == 0x24) { + if (modrm_mod != 3 && rm_is(CFI_SP)) { /* mov disp(%rsp), reg */ ADD_OP(op) { @@ -337,14 +363,17 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (!rex_w) break; - /* skip nontrivial SIB */ - if ((modrm_rm & 7) == 4 && !(sib == 0x24 && rex_b == rex_x)) - break; - /* skip RIP relative displacement */ - if ((modrm_rm & 7) == 5 && modrm_mod == 0) + if (is_RIP()) break; + /* skip nontrivial SIB */ + if (have_SIB()) { + modrm_rm = sib_base; + if (sib_index != CFI_SP) + break; + } + /* lea disp(%src), %dst */ ADD_OP(op) { op->src.offset = insn.displacement.value; -- cgit v1.2.3 From 961d83b9073b1ce5834af50d3c69e5e2461c6fd3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Feb 2021 14:11:30 +0100 Subject: objtool,x86: Rewrite ADD/SUB/AND Support sign extending and imm8 forms. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/20210211173627.588366777@infradead.org --- tools/objtool/arch/x86/decode.c | 70 ++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 78ae5be2fb70..b42e5ec083a0 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -98,13 +98,14 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, struct list_head *ops_list) { struct insn insn; - int x86_64, sign; + int x86_64; unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, sib = 0, /* sib_scale = 0, */ sib_index = 0, sib_base = 0; struct stack_op *op = NULL; struct symbol *sym; + u64 imm; x86_64 = is_x86_64(elf); if (x86_64 == -1) @@ -200,12 +201,54 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, *type = INSN_JUMP_CONDITIONAL; break; - case 0x81: - case 0x83: - if (rex != 0x48) + case 0x80 ... 0x83: + /* + * 1000 00sw : mod OP r/m : immediate + * + * s - sign extend immediate + * w - imm8 / imm32 + * + * OP: 000 ADD 100 AND + * 001 OR 101 SUB + * 010 ADC 110 XOR + * 011 SBB 111 CMP + */ + + /* 64bit only */ + if (!rex_w) break; - if (modrm == 0xe4) { + /* %rsp target only */ + if (!(modrm_mod == 3 && modrm_rm == CFI_SP)) + break; + + imm = insn.immediate.value; + if (op1 & 2) { /* sign extend */ + if (op1 & 1) { /* imm32 */ + imm <<= 32; + imm = (s64)imm >> 32; + } else { /* imm8 */ + imm <<= 56; + imm = (s64)imm >> 56; + } + } + + switch (modrm_reg & 7) { + case 5: + imm = -imm; + /* fallthrough */ + case 0: + /* add/sub imm, %rsp */ + ADD_OP(op) { + op->src.type = OP_SRC_ADD; + op->src.reg = CFI_SP; + op->src.offset = imm; + op->dest.type = OP_DEST_REG; + op->dest.reg = CFI_SP; + } + break; + + case 4: /* and imm, %rsp */ ADD_OP(op) { op->src.type = OP_SRC_AND; @@ -215,23 +258,12 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->dest.reg = CFI_SP; } break; - } - if (modrm == 0xc4) - sign = 1; - else if (modrm == 0xec) - sign = -1; - else + default: + /* WARN ? */ break; - - /* add/sub imm, %rsp */ - ADD_OP(op) { - op->src.type = OP_SRC_ADD; - op->src.reg = CFI_SP; - op->src.offset = insn.immediate.value * sign; - op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_SP; } + break; case 0x89: -- cgit v1.2.3 From 36d92e43d01cbeeec99abdf405362243051d6b3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 Feb 2021 09:13:00 +0100 Subject: objtool,x86: More ModRM sugar Better helpers to decode ModRM. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/YCZB/ljatFXqQbm8@hirez.programming.kicks-ass.net --- tools/objtool/arch/x86/decode.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index b42e5ec083a0..431bafb881d4 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -82,15 +82,21 @@ unsigned long arch_jump_destination(struct instruction *insn) * 01 | [r/m + d8] |[S+d]| [r/m + d8] | * 10 | [r/m + d32] |[S+D]| [r/m + d32] | * 11 | r/ m | - * */ + +#define mod_is_mem() (modrm_mod != 3) +#define mod_is_reg() (modrm_mod == 3) + #define is_RIP() ((modrm_rm & 7) == CFI_BP && modrm_mod == 0) -#define have_SIB() ((modrm_rm & 7) == CFI_SP && modrm_mod != 3) +#define have_SIB() ((modrm_rm & 7) == CFI_SP && mod_is_mem()) #define rm_is(reg) (have_SIB() ? \ sib_base == (reg) && sib_index == CFI_SP : \ modrm_rm == (reg)) +#define rm_is_mem(reg) (mod_is_mem() && !is_RIP() && rm_is(reg)) +#define rm_is_reg(reg) (mod_is_reg() && modrm_rm == (reg)) + int arch_decode_instruction(const struct elf *elf, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, @@ -154,7 +160,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, case 0x1: case 0x29: - if (rex_w && modrm_mod == 3 && modrm_rm == CFI_SP) { + if (rex_w && rm_is_reg(CFI_SP)) { /* add/sub reg, %rsp */ ADD_OP(op) { @@ -219,7 +225,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; /* %rsp target only */ - if (!(modrm_mod == 3 && modrm_rm == CFI_SP)) + if (!rm_is_reg(CFI_SP)) break; imm = insn.immediate.value; @@ -272,7 +278,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (modrm_reg == CFI_SP) { - if (modrm_mod == 3) { + if (mod_is_reg()) { /* mov %rsp, reg */ ADD_OP(op) { op->src.type = OP_SRC_REG; @@ -308,7 +314,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (modrm_mod == 3 && modrm_rm == CFI_SP) { + if (rm_is_reg(CFI_SP)) { /* mov reg, %rsp */ ADD_OP(op) { @@ -325,7 +331,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (!rex_w) break; - if ((modrm_mod == 1 || modrm_mod == 2) && modrm_rm == CFI_BP) { + if (rm_is_mem(CFI_BP)) { /* mov reg, disp(%rbp) */ ADD_OP(op) { @@ -338,7 +344,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (modrm_mod != 3 && rm_is(CFI_SP)) { + if (rm_is_mem(CFI_SP)) { /* mov reg, disp(%rsp) */ ADD_OP(op) { @@ -357,7 +363,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (!rex_w) break; - if ((modrm_mod == 1 || modrm_mod == 2) && modrm_rm == CFI_BP) { + if (rm_is_mem(CFI_BP)) { /* mov disp(%rbp), reg */ ADD_OP(op) { @@ -370,7 +376,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; } - if (modrm_mod != 3 && rm_is(CFI_SP)) { + if (rm_is_mem(CFI_SP)) { /* mov disp(%rsp), reg */ ADD_OP(op) { @@ -386,7 +392,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x8d: - if (modrm_mod == 3) { + if (mod_is_reg()) { WARN("invalid LEA encoding at %s:0x%lx", sec->name, offset); break; } -- cgit v1.2.3 From 8ad15c6900840e8a2163012f4581c52127622e02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2021 10:59:59 +0100 Subject: objtool: Add --backup Teach objtool to write backups files, such that it becomes easier to see what objtool did to the object file. Backup files will be ${name}.orig. Suggested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/YD4obT3aoXPWl7Ax@hirez.programming.kicks-ass.net --- tools/objtool/builtin-check.c | 4 ++- tools/objtool/include/objtool/builtin.h | 3 +- tools/objtool/objtool.c | 64 +++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index c3a85d8f6c5c..97f063d55b9f 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -18,7 +18,8 @@ #include #include -bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux, mcount, noinstr; +bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, + validate_dup, vmlinux, mcount, noinstr, backup; static const char * const check_usage[] = { "objtool check [] file.o", @@ -37,6 +38,7 @@ const struct option check_options[] = { OPT_BOOLEAN('n', "noinstr", &noinstr, "noinstr validation for vmlinux.o"), OPT_BOOLEAN('l', "vmlinux", &vmlinux, "vmlinux.o validation"), OPT_BOOLEAN('M', "mcount", &mcount, "generate __mcount_loc"), + OPT_BOOLEAN('B', "backup", &backup, "create .orig files before modification"), OPT_END(), }; diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 2502bb27de17..d019210d74b1 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -8,7 +8,8 @@ #include extern const struct option check_options[]; -extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux, mcount, noinstr; +extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, + validate_dup, vmlinux, mcount, noinstr, backup; extern int cmd_check(int argc, const char **argv); extern int cmd_orc(int argc, const char **argv); diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 7b97ce499405..43c1836a06b4 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,64 @@ bool help; const char *objname; static struct objtool_file file; +static bool objtool_create_backup(const char *_objname) +{ + int len = strlen(_objname); + char *buf, *base, *name = malloc(len+6); + int s, d, l, t; + + if (!name) { + perror("failed backup name malloc"); + return false; + } + + strcpy(name, _objname); + strcpy(name + len, ".orig"); + + d = open(name, O_CREAT|O_WRONLY|O_TRUNC, 0644); + if (d < 0) { + perror("failed to create backup file"); + return false; + } + + s = open(_objname, O_RDONLY); + if (s < 0) { + perror("failed to open orig file"); + return false; + } + + buf = malloc(4096); + if (!buf) { + perror("failed backup data malloc"); + return false; + } + + while ((l = read(s, buf, 4096)) > 0) { + base = buf; + do { + t = write(d, base, l); + if (t < 0) { + perror("failed backup write"); + return false; + } + base += t; + l -= t; + } while (l); + } + + if (l < 0) { + perror("failed backup read"); + return false; + } + + free(name); + free(buf); + close(d); + close(s); + + return true; +} + struct objtool_file *objtool_open_read(const char *_objname) { if (objname) { @@ -59,6 +118,11 @@ struct objtool_file *objtool_open_read(const char *_objname) if (!file.elf) return NULL; + if (backup && !objtool_create_backup(objname)) { + WARN("can't create backup file"); + return NULL; + } + INIT_LIST_HEAD(&file.insn_list); hash_init(file.insn_hash); INIT_LIST_HEAD(&file.static_call_list); -- cgit v1.2.3 From a2f605f9ff57397d05a8e2f282b78a69f574d305 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2021 11:18:24 +0100 Subject: objtool: Collate parse_options() users Ensure there's a single place that parses check_options, in preparation for extending where to get options from. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20210226110004.193108106@infradead.org --- tools/objtool/builtin-check.c | 14 +++++++++----- tools/objtool/builtin-orc.c | 5 +---- tools/objtool/include/objtool/builtin.h | 2 ++ 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 97f063d55b9f..03997524b93f 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -42,17 +42,21 @@ const struct option check_options[] = { OPT_END(), }; +int cmd_parse_options(int argc, const char **argv, const char * const usage[]) +{ + argc = parse_options(argc, argv, check_options, usage, 0); + if (argc != 1) + usage_with_options(usage, check_options); + return argc; +} + int cmd_check(int argc, const char **argv) { const char *objname; struct objtool_file *file; int ret; - argc = parse_options(argc, argv, check_options, check_usage, 0); - - if (argc != 1) - usage_with_options(check_usage, check_options); - + argc = cmd_parse_options(argc, argv, check_usage); objname = argv[0]; file = objtool_open_read(objname); diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c index 8273bbf7cebb..17f8b9307738 100644 --- a/tools/objtool/builtin-orc.c +++ b/tools/objtool/builtin-orc.c @@ -34,10 +34,7 @@ int cmd_orc(int argc, const char **argv) struct objtool_file *file; int ret; - argc = parse_options(argc, argv, check_options, orc_usage, 0); - if (argc != 1) - usage_with_options(orc_usage, check_options); - + argc = cmd_parse_options(argc, argv, orc_usage); objname = argv[0]; file = objtool_open_read(objname); diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index d019210d74b1..15ac0b7d3d6a 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -11,6 +11,8 @@ extern const struct option check_options[]; extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux, mcount, noinstr, backup; +extern int cmd_parse_options(int argc, const char **argv, const char * const usage[]); + extern int cmd_check(int argc, const char **argv); extern int cmd_orc(int argc, const char **argv); -- cgit v1.2.3 From 900b4df347bbac4874149a226143a556909faba8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2021 11:32:30 +0100 Subject: objtool: Parse options from OBJTOOL_ARGS Teach objtool to parse options from the OBJTOOL_ARGS environment variable. This enables things like: $ OBJTOOL_ARGS="--backup" make O=defconfig-build/ kernel/ponies.o to obtain both defconfig-build/kernel/ponies.o{,.orig} and easily inspect what objtool actually did. Suggested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20210226110004.252553847@infradead.org --- tools/objtool/builtin-check.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'tools') diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 03997524b93f..8b38b5d6fec7 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -26,6 +27,11 @@ static const char * const check_usage[] = { NULL, }; +static const char * const env_usage[] = { + "OBJTOOL_ARGS=\"\"", + NULL, +}; + const struct option check_options[] = { OPT_BOOLEAN('f', "no-fp", &no_fp, "Skip frame pointer validation"), OPT_BOOLEAN('u', "no-unreachable", &no_unreachable, "Skip 'unreachable instruction' warnings"), @@ -44,6 +50,25 @@ const struct option check_options[] = { int cmd_parse_options(int argc, const char **argv, const char * const usage[]) { + const char *envv[16] = { }; + char *env; + int envc; + + env = getenv("OBJTOOL_ARGS"); + if (env) { + envv[0] = "OBJTOOL_ARGS"; + for (envc = 1; envc < ARRAY_SIZE(envv); ) { + envv[envc++] = env; + env = strchr(env, ' '); + if (!env) + break; + *env = '\0'; + env++; + } + + parse_options(envc, envv, check_options, env_usage, 0); + } + argc = parse_options(argc, argv, check_options, usage, 0); if (argc != 1) usage_with_options(usage, check_options); -- cgit v1.2.3 From 35276a4f058df23507987ef6d2426ea9673f5e35 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 6 Mar 2021 00:08:38 -0800 Subject: perf skel: Remove some unused variables. Fixes -Wall warnings. Signed-off-by: Ian Rogers Acked-by: Song Liu Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210306080840.3785816-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c index c7cec92d0236..ab12b4c4ece2 100644 --- a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c +++ b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c @@ -52,7 +52,7 @@ int BPF_PROG(fentry_XXX) static inline void fexit_update_maps(struct bpf_perf_event_value *after) { - struct bpf_perf_event_value *before, diff, *accum; + struct bpf_perf_event_value *before, diff; __u32 zero = 0; before = bpf_map_lookup_elem(&fentry_readings, &zero); @@ -78,7 +78,6 @@ int BPF_PROG(fexit_XXX) { struct bpf_perf_event_value reading; __u32 cpu = bpf_get_smp_processor_id(); - __u32 one = 1, zero = 0; int err; /* read all events before updating the maps, to reduce error */ -- cgit v1.2.3 From 7e1df64edeb294767f0976a7784fe1dbbe9f4394 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 6 Mar 2021 00:08:39 -0800 Subject: perf tools: Enable warnings when compiling BPF programs Add -Wall -Werror when compiling BPF skeletons. Signed-off-by: Ian Rogers Acked-by: Song Liu Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210306080840.3785816-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 5345ac70cd83..f43d2551f3de 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -1029,7 +1029,7 @@ $(BPFTOOL): | $(SKEL_TMP_OUT) OUTPUT=$(SKEL_TMP_OUT)/ bootstrap $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT) - $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf $(BPF_INCLUDE) \ + $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) \ -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@ $(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o | $(BPFTOOL) -- cgit v1.2.3 From 509bbd75f7ff878b5e3f78b9e14ba4b6e16dc3b0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 6 Mar 2021 00:08:40 -0800 Subject: perf bpf: Minor whitespace cleanup. Missed space after #include. Signed-off-by: Ian Rogers Acked-by: Song Liu Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210306080840.3785816-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_counter.h b/tools/perf/util/bpf_counter.h index 2eca210e5dc1..cb9c532e0a07 100644 --- a/tools/perf/util/bpf_counter.h +++ b/tools/perf/util/bpf_counter.h @@ -38,7 +38,7 @@ int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd); #else /* HAVE_BPF_SKEL */ -#include +#include static inline int bpf_counter__load(struct evsel *evsel __maybe_unused, struct target *target __maybe_unused) -- cgit v1.2.3 From 44e176501c557460de954572435baa4ae34d2a35 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Fri, 26 Feb 2021 11:01:24 +0100 Subject: perf config: Add annotate.demangle{,_kernel} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Committer notes: This allows setting this in from the command line: $ perf config annotate.demangle $ perf config annotate.demangle=yes $ perf config annotate.demangle annotate.demangle=yes $ cat ~/.perfconfig # this file is auto-generated. [report] sort-order = srcline [annotate] demangle = yes $ $ $ perf config annotate.demangle_kernel $ perf config annotate.demangle_kernel=yes $ perf config annotate.demangle_kernel annotate.demangle_kernel=yes $ cat ~/.perfconfig # this file is auto-generated. [report] sort-order = srcline [annotate] demangle = yes demangle_kernel = yes $ Signed-off-by: Martin Liška Tested-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/c96aabe7-791f-9503-295f-3147a9d19b60@suse.cz Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-config.txt | 6 ++++++ tools/perf/util/annotate.c | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 153bde14bbe0..154a1ced72b2 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -393,6 +393,12 @@ annotate.*:: This option works with tui, stdio2 browsers. + annotate.demangle:: + Demangle symbol names to human readable form. Default is 'true'. + + annotate.demangle_kernel:: + Demangle kernel symbol names to human readable form. Default is 'true'. + hist.*:: hist.percentage:: This option control the way to calculate overhead of filtered entries - diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index e60841b86d27..f54224586a3d 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -3144,6 +3144,10 @@ static int annotation__config(const char *var, const char *value, void *data) opt->use_offset = perf_config_bool("use_offset", value); } else if (!strcmp(var, "annotate.disassembler_style")) { opt->disassembler_style = value; + } else if (!strcmp(var, "annotate.demangle")) { + symbol_conf.demangle = perf_config_bool("demangle", value); + } else if (!strcmp(var, "annotate.demangle_kernel")) { + symbol_conf.demangle_kernel = perf_config_bool("demangle_kernel", value); } else { pr_debug("%s variable unknown, ignoring...", var); } -- cgit v1.2.3 From a78e724f4eb487517f03a6044d7a554c8823fe49 Mon Sep 17 00:00:00 2001 From: Xiong Zhenwu Date: Fri, 5 Mar 2021 01:22:12 -0800 Subject: perf bench: Fix misspellings using codespell $ codespell ./tool/perf/bench tools/perf/bench/inject-buildid.c:375: tihs ==> this Fix a typo found by codespell. Signed-off-by: Xiong Zhenwu Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210305092212.204923-1-xiong.zhenwu@zte.com.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/inject-buildid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c index 280227e3ffd7..55d373b75791 100644 --- a/tools/perf/bench/inject-buildid.c +++ b/tools/perf/bench/inject-buildid.c @@ -372,7 +372,7 @@ static int inject_build_id(struct bench_data *data, u64 *max_rss) len += synthesize_flush(data); } - /* tihs makes the child to finish */ + /* this makes the child to finish */ close(data->input_pipe[1]); wait4(data->pid, &status, 0, &rusage); -- cgit v1.2.3 From 2777b81b379df772defd654bc4d3fa82dca17a4b Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Mon, 15 Feb 2021 12:34:46 +0100 Subject: perf annotate: Show full source location with 'l' hotkey MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Right now, when Line numbers are displayed, one can't easily find a source file that the line corresponds to. When a source line is selected and 'l' is pressed, full source file location is displayed in perf UI footer line. The hotkey works only for source code lines. Signed-off-by: Martin Liška Tested-by: Arnaldo Carvalho de Melo Link: http://lore.kernel.org/lkml/25a6384f-d862-5dda-4fec-8f0555599c75@suse.cz Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/annotate.c | 25 +++++++++++++++++++++++-- tools/perf/util/annotate.c | 12 ++++++++++-- tools/perf/util/annotate.h | 2 ++ 3 files changed, 35 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 35b82caf8090..9f75d767b6da 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -381,6 +381,25 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser) return true; } +#define SYM_TITLE_MAX_SIZE (PATH_MAX + 64) + +static void annotate_browser__show_full_location(struct ui_browser *browser) +{ + struct annotate_browser *ab = container_of(browser, struct annotate_browser, b); + struct disasm_line *cursor = disasm_line(ab->selection); + struct annotation_line *al = &cursor->al; + + if (al->offset != -1) + ui_helpline__puts("Only available for source code lines."); + else if (al->fileloc == NULL) + ui_helpline__puts("No source file location."); + else { + char help_line[SYM_TITLE_MAX_SIZE]; + sprintf (help_line, "Source file location: %s", al->fileloc); + ui_helpline__puts(help_line); + } +} + static void ui_browser__init_asm_mode(struct ui_browser *browser) { struct annotation *notes = browser__annotation(browser); @@ -388,8 +407,6 @@ static void ui_browser__init_asm_mode(struct ui_browser *browser) browser->nr_entries = notes->nr_asm_entries; } -#define SYM_TITLE_MAX_SIZE (PATH_MAX + 64) - static int sym_title(struct symbol *sym, struct map *map, char *title, size_t sz, int percent_type) { @@ -747,6 +764,7 @@ static int annotate_browser__run(struct annotate_browser *browser, "c Show min/max cycle\n" "/ Search string\n" "k Toggle line numbers\n" + "l Show full source file location\n" "P Print to [symbol_name].annotation file.\n" "r Run available scripts\n" "p Toggle percent type [local/global]\n" @@ -760,6 +778,9 @@ static int annotate_browser__run(struct annotate_browser *browser, case 'k': notes->options->show_linenr = !notes->options->show_linenr; continue; + case 'l': + annotate_browser__show_full_location (&browser->b); + continue; case 'H': nd = browser->curr_hot; break; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index f54224586a3d..18eee25b4976 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1161,6 +1161,7 @@ struct annotate_args { s64 offset; char *line; int line_nr; + char *fileloc; }; static void annotation_line__init(struct annotation_line *al, @@ -1170,6 +1171,7 @@ static void annotation_line__init(struct annotation_line *al, al->offset = args->offset; al->line = strdup(args->line); al->line_nr = args->line_nr; + al->fileloc = args->fileloc; al->data_nr = nr; } @@ -1482,7 +1484,7 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start */ static int symbol__parse_objdump_line(struct symbol *sym, struct annotate_args *args, - char *parsed_line, int *line_nr) + char *parsed_line, int *line_nr, char **fileloc) { struct map *map = args->ms.map; struct annotation *notes = symbol__annotation(sym); @@ -1494,6 +1496,7 @@ static int symbol__parse_objdump_line(struct symbol *sym, /* /filename:linenr ? Save line number and ignore. */ if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) { *line_nr = atoi(parsed_line + match[1].rm_so); + *fileloc = strdup(parsed_line); return 0; } @@ -1513,6 +1516,7 @@ static int symbol__parse_objdump_line(struct symbol *sym, args->offset = offset; args->line = parsed_line; args->line_nr = *line_nr; + args->fileloc = *fileloc; args->ms.sym = sym; dl = disasm_line__new(args); @@ -1807,6 +1811,7 @@ static int symbol__disassemble_bpf(struct symbol *sym, args->offset = -1; args->line = strdup(srcline); args->line_nr = 0; + args->fileloc = NULL; args->ms.sym = sym; dl = disasm_line__new(args); if (dl) { @@ -1818,6 +1823,7 @@ static int symbol__disassemble_bpf(struct symbol *sym, args->offset = pc; args->line = buf + prev_buf_size; args->line_nr = 0; + args->fileloc = NULL; args->ms.sym = sym; dl = disasm_line__new(args); if (dl) @@ -1852,6 +1858,7 @@ symbol__disassemble_bpf_image(struct symbol *sym, args->offset = -1; args->line = strdup("to be implemented"); args->line_nr = 0; + args->fileloc = NULL; dl = disasm_line__new(args); if (dl) annotation_line__add(&dl->al, ¬es->src->source); @@ -1933,6 +1940,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) bool delete_extract = false; bool decomp = false; int lineno = 0; + char *fileloc = NULL; int nline; char *line; size_t line_len; @@ -2060,7 +2068,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) * See disasm_line__new() and struct disasm_line::line_nr. */ if (symbol__parse_objdump_line(sym, args, expanded_line, - &lineno) < 0) + &lineno, &fileloc) < 0) break; nline++; } diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 096cdaf21b01..3757416bcf46 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -84,6 +84,7 @@ struct annotation_options { print_lines, full_path, show_linenr, + show_fileloc, show_nr_jumps, show_minmax_cycle, show_asm_raw, @@ -136,6 +137,7 @@ struct annotation_line { s64 offset; char *line; int line_nr; + char *fileloc; int jump_sources; float ipc; u64 cycles; -- cgit v1.2.3 From c9491aad97e67e45216d5181a2dc56ebf1a0bc04 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Feb 2021 15:40:05 -0800 Subject: Documentation: Replace more lkml.org links with lore As started by commit 05a5f51ca566 ("Documentation: Replace lkml.org links with lore"), replace a few more scattered lkml.org links with lore to better use a single source that's more likely to stay available long-term. Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210210234005.2236201-1-keescook@chromium.org Signed-off-by: Jonathan Corbet --- CREDITS | 2 +- tools/scripts/Makefile.include | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/CREDITS b/CREDITS index cef83b958cbe..cf8e23498a34 100644 --- a/CREDITS +++ b/CREDITS @@ -550,7 +550,7 @@ D: gadget layers, SPI subsystem, GPIO subsystem, and more than a few D: device drivers. His encouragement also helped many engineers get D: started working on the Linux kernel. David passed away in early D: 2011, and will be greatly missed. -W: https://lkml.org/lkml/2011/4/5/36 +W: https://lore.kernel.org/lkml/20110405034819.GA7872@kroah.com N: Gary Brubaker E: xavyer@ix.netcom.com diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index a402f32a145c..84dbf61a7eca 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -86,7 +86,8 @@ endif # in newer systems. # Needed for the __raw_cmpxchg in tools/arch/x86/include/asm/cmpxchg.h # -# See https://lkml.org/lkml/2006/11/28/253 and https://gcc.gnu.org/gcc-4.8/changes.html, +# See https://lore.kernel.org/lkml/9a8748490611281710g78402fbeh8ff7fcc162dbcbca@mail.gmail.com/ +# and https://gcc.gnu.org/gcc-4.8/changes.html, # that takes into account Linus's comments (search for Wshadow) for the reasoning about # -Wshadow not being interesting before gcc 4.8. -- cgit v1.2.3 From 09141ec0e4efede4fb5e2aa68cb819fba974325c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 5 Mar 2020 09:47:06 -0800 Subject: x86: Remove duplicate TSC DEADLINE MSR definitions There are two definitions for the TSC deadline MSR in msr-index.h, one with an underscore and one without. Axe one of them and move all the references over to the other one. [ bp: Fixup the MSR define in handle_fastpath_set_msr_irqoff() too. ] Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200305174706.0D6B8EE4@viggo.jf.intel.com --- arch/x86/include/asm/msr-index.h | 2 -- arch/x86/kvm/x86.c | 8 ++++---- tools/arch/x86/include/asm/msr-index.h | 2 -- tools/perf/trace/beauty/tracepoints/x86_msr.sh | 2 +- 4 files changed, 5 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 546d6ecf0a35..45029354e0a8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -628,8 +628,6 @@ #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) -#define MSR_IA32_TSCDEADLINE 0x000006e0 - #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a20ce60152e..c020499efe47 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1288,7 +1288,7 @@ static const u32 emulated_msrs_all[] = { MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, MSR_IA32_TSC_ADJUST, - MSR_IA32_TSCDEADLINE, + MSR_IA32_TSC_DEADLINE, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, MSR_IA32_MISC_ENABLE, @@ -1841,7 +1841,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) ret = EXIT_FASTPATH_EXIT_HANDLED; } break; - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: data = kvm_read_edx_eax(vcpu); if (!handle_fastpath_set_tscdeadline(vcpu, data)) { kvm_skip_emulated_instruction(vcpu); @@ -3075,7 +3075,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: @@ -3437,7 +3437,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; case MSR_IA32_TSC_ADJUST: diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 546d6ecf0a35..45029354e0a8 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -628,8 +628,6 @@ #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) -#define MSR_IA32_TSCDEADLINE 0x000006e0 - #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b diff --git a/tools/perf/trace/beauty/tracepoints/x86_msr.sh b/tools/perf/trace/beauty/tracepoints/x86_msr.sh index 27ee1ea1fe94..9b0614a87831 100755 --- a/tools/perf/trace/beauty/tracepoints/x86_msr.sh +++ b/tools/perf/trace/beauty/tracepoints/x86_msr.sh @@ -15,7 +15,7 @@ x86_msr_index=${arch_x86_header_dir}/msr-index.h printf "static const char *x86_MSRs[] = {\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0x00000[[:xdigit:]]+)[[:space:]]*.*' -egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|IA32_(TSCDEADLINE|UCODE_REV)|IDT_FCR4)' | \ +egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|IA32_(TSC_DEADLINE|UCODE_REV)|IDT_FCR4)' | \ sed -r "s/$regex/\2 \1/g" | sort -n | \ xargs printf "\t[%s] = \"%s\",\n" printf "};\n\n" -- cgit v1.2.3 From c6b2f240bf8d5604e6507aff15d5c441944c2f89 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 5 Mar 2021 15:21:18 +0800 Subject: tools/x86: Add a kcpuid tool to show raw CPU features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit End users frequently want to know what features their processor supports, independent of what the kernel supports. /proc/cpuinfo is great. It is omnipresent and since it is provided by the kernel it is always as up to date as the kernel. But, it could be ambiguous about processor features which can be disabled by the kernel at boot-time or compile-time. There are some user space tools showing more raw features, but they are not bound with kernel, and go with distros. Many end users are still using old distros with new kernels (upgraded by themselves), and may not upgrade the distros only to get a newer tool. So here arise the need for a new tool, which * shows raw CPU features read from the CPUID instruction * will be easier to update compared to existing userspace tooling (perhaps distributed like perf) * inherits "modern" kernel development process, in contrast to some of the existing userspace CPUID tools which are still being developed without git and distributed in tarballs from non-https sites. * Can produce output consistent with /proc/cpuinfo to make comparison easier. The CPUID leaf definitions are kept in an .csv file which allows for updating only that file to add support for new feature leafs. This is based on prototype code from Borislav Petkov (http://sr71.net/~dave/intel/stupid-cpuid.c). [ bp: - Massage, add #define _GNU_SOURCE to fix implicit declaration of function ‘strcasestr' warning - remove superfluous newlines - fallback to cpuid.csv in the current dir if none found - fix typos - move comments over the lines instead of sideways. ] Originally-from: Borislav Petkov Suggested-by: Dave Hansen Suggested-by: Borislav Petkov Signed-off-by: Feng Tang Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/1614928878-86075-1-git-send-email-feng.tang@intel.com --- tools/arch/x86/kcpuid/Makefile | 24 ++ tools/arch/x86/kcpuid/cpuid.csv | 380 +++++++++++++++++++++++ tools/arch/x86/kcpuid/kcpuid.c | 655 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1059 insertions(+) create mode 100644 tools/arch/x86/kcpuid/Makefile create mode 100644 tools/arch/x86/kcpuid/cpuid.csv create mode 100644 tools/arch/x86/kcpuid/kcpuid.c (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/Makefile b/tools/arch/x86/kcpuid/Makefile new file mode 100644 index 000000000000..87b554fab14b --- /dev/null +++ b/tools/arch/x86/kcpuid/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for x86/kcpuid tool + +kcpuid : kcpuid.c + +CFLAGS = -Wextra + +BINDIR ?= /usr/sbin + +HWDATADIR ?= /usr/share/misc/ + +override CFLAGS += -O2 -Wall -I../../../include + +%: %.c + $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) + +.PHONY : clean +clean : + @rm -f kcpuid + +install : kcpuid + install -d $(DESTDIR)$(BINDIR) + install -m 755 -p kcpuid $(DESTDIR)$(BINDIR)/kcpuid + install -m 444 -p cpuid.csv $(HWDATADIR)/cpuid.csv diff --git a/tools/arch/x86/kcpuid/cpuid.csv b/tools/arch/x86/kcpuid/cpuid.csv new file mode 100644 index 000000000000..f4a5b85073f4 --- /dev/null +++ b/tools/arch/x86/kcpuid/cpuid.csv @@ -0,0 +1,380 @@ +# The basic row format is: +# LEAF, SUBLEAF, register_name, bits, short_name, long_description + +# Leaf 00H + 0, 0, EAX, 31:0, max_basic_leafs, Max input value for supported subleafs + +# Leaf 01H + 1, 0, EAX, 3:0, stepping, Stepping ID + 1, 0, EAX, 7:4, model, Model + 1, 0, EAX, 11:8, family, Family ID + 1, 0, EAX, 13:12, processor, Processor Type + 1, 0, EAX, 19:16, model_ext, Extended Model ID + 1, 0, EAX, 27:20, family_ext, Extended Family ID + + 1, 0, EBX, 7:0, brand, Brand Index + 1, 0, EBX, 15:8, clflush_size, CLFLUSH line size (value * 8) in bytes + 1, 0, EBX, 23:16, max_cpu_id, Maxim number of addressable logic cpu in this package + 1, 0, EBX, 31:24, apic_id, Initial APIC ID + + 1, 0, ECX, 0, sse3, Streaming SIMD Extensions 3(SSE3) + 1, 0, ECX, 1, pclmulqdq, PCLMULQDQ instruction supported + 1, 0, ECX, 2, dtes64, DS area uses 64-bit layout + 1, 0, ECX, 3, mwait, MONITOR/MWAIT supported + 1, 0, ECX, 4, ds_cpl, CPL Qualified Debug Store which allows for branch message storage qualified by CPL + 1, 0, ECX, 5, vmx, Virtual Machine Extensions supported + 1, 0, ECX, 6, smx, Safer Mode Extension supported + 1, 0, ECX, 7, eist, Enhanced Intel SpeedStep Technology + 1, 0, ECX, 8, tm2, Thermal Monitor 2 + 1, 0, ECX, 9, ssse3, Supplemental Streaming SIMD Extensions 3 (SSSE3) + 1, 0, ECX, 10, l1_ctx_id, L1 data cache could be set to either adaptive mode or shared mode (check IA32_MISC_ENABLE bit 24 definition) + 1, 0, ECX, 11, sdbg, IA32_DEBUG_INTERFACE MSR for silicon debug supported + 1, 0, ECX, 12, fma, FMA extensions using YMM state supported + 1, 0, ECX, 13, cmpxchg16b, 'CMPXCHG16B - Compare and Exchange Bytes' supported + 1, 0, ECX, 14, xtpr_update, xTPR Update Control supported + 1, 0, ECX, 15, pdcm, Perfmon and Debug Capability present + 1, 0, ECX, 17, pcid, Process-Context Identifiers feature present + 1, 0, ECX, 18, dca, Prefetching data from a memory mapped device supported + 1, 0, ECX, 19, sse4_1, SSE4.1 feature present + 1, 0, ECX, 20, sse4_2, SSE4.2 feature present + 1, 0, ECX, 21, x2apic, x2APIC supported + 1, 0, ECX, 22, movbe, MOVBE instruction supported + 1, 0, ECX, 23, popcnt, POPCNT instruction supported + 1, 0, ECX, 24, tsc_deadline_timer, LAPIC supports one-shot operation using a TSC deadline value + 1, 0, ECX, 25, aesni, AESNI instruction supported + 1, 0, ECX, 26, xsave, XSAVE/XRSTOR processor extended states (XSETBV/XGETBV/XCR0) + 1, 0, ECX, 27, osxsave, OS has set CR4.OSXSAVE bit to enable XSETBV/XGETBV/XCR0 + 1, 0, ECX, 28, avx, AVX instruction supported + 1, 0, ECX, 29, f16c, 16-bit floating-point conversion instruction supported + 1, 0, ECX, 30, rdrand, RDRAND instruction supported + + 1, 0, EDX, 0, fpu, x87 FPU on chip + 1, 0, EDX, 1, vme, Virtual-8086 Mode Enhancement + 1, 0, EDX, 2, de, Debugging Extensions + 1, 0, EDX, 3, pse, Page Size Extensions + 1, 0, EDX, 4, tsc, Time Stamp Counter + 1, 0, EDX, 5, msr, RDMSR and WRMSR Support + 1, 0, EDX, 6, pae, Physical Address Extensions + 1, 0, EDX, 7, mce, Machine Check Exception + 1, 0, EDX, 8, cx8, CMPXCHG8B instr + 1, 0, EDX, 9, apic, APIC on Chip + 1, 0, EDX, 11, sep, SYSENTER and SYSEXIT instrs + 1, 0, EDX, 12, mtrr, Memory Type Range Registers + 1, 0, EDX, 13, pge, Page Global Bit + 1, 0, EDX, 14, mca, Machine Check Architecture + 1, 0, EDX, 15, cmov, Conditional Move Instrs + 1, 0, EDX, 16, pat, Page Attribute Table + 1, 0, EDX, 17, pse36, 36-Bit Page Size Extension + 1, 0, EDX, 18, psn, Processor Serial Number + 1, 0, EDX, 19, clflush, CLFLUSH instr +# 1, 0, EDX, 20, + 1, 0, EDX, 21, ds, Debug Store + 1, 0, EDX, 22, acpi, Thermal Monitor and Software Controlled Clock Facilities + 1, 0, EDX, 23, mmx, Intel MMX Technology + 1, 0, EDX, 24, fxsr, XSAVE and FXRSTOR Instrs + 1, 0, EDX, 25, sse, SSE + 1, 0, EDX, 26, sse2, SSE2 + 1, 0, EDX, 27, ss, Self Snoop + 1, 0, EDX, 28, hit, Max APIC IDs + 1, 0, EDX, 29, tm, Thermal Monitor +# 1, 0, EDX, 30, + 1, 0, EDX, 31, pbe, Pending Break Enable + +# Leaf 02H +# cache and TLB descriptor info + +# Leaf 03H +# Precessor Serial Number, introduced on Pentium III, not valid for +# latest models + +# Leaf 04H +# thread/core and cache topology + 4, 0, EAX, 4:0, cache_type, Cache type like instr/data or unified + 4, 0, EAX, 7:5, cache_level, Cache Level (starts at 1) + 4, 0, EAX, 8, cache_self_init, Cache Self Initialization + 4, 0, EAX, 9, fully_associate, Fully Associative cache +# 4, 0, EAX, 13:10, resvd, resvd + 4, 0, EAX, 25:14, max_logical_id, Max number of addressable IDs for logical processors sharing the cache + 4, 0, EAX, 31:26, max_phy_id, Max number of addressable IDs for processors in phy package + + 4, 0, EBX, 11:0, cache_linesize, Size of a cache line in bytes + 4, 0, EBX, 21:12, cache_partition, Physical Line partitions + 4, 0, EBX, 31:22, cache_ways, Ways of associativity + 4, 0, ECX, 31:0, cache_sets, Number of Sets - 1 + 4, 0, EDX, 0, c_wbinvd, 1 means WBINVD/INVD is not ganranteed to act upon lower level caches of non-originating threads sharing this cache + 4, 0, EDX, 1, c_incl, Whether cache is inclusive of lower cache level + 4, 0, EDX, 2, c_comp_index, Complex Cache Indexing + +# Leaf 05H +# MONITOR/MWAIT + 5, 0, EAX, 15:0, min_mon_size, Smallest monitor line size in bytes + 5, 0, EBX, 15:0, max_mon_size, Largest monitor line size in bytes + 5, 0, ECX, 0, mwait_ext, Enum of Monitor-Mwait extensions supported + 5, 0, ECX, 1, mwait_irq_break, Largest monitor line size in bytes + 5, 0, EDX, 3:0, c0_sub_stats, Number of C0* sub C-states supported using MWAIT + 5, 0, EDX, 7:4, c1_sub_stats, Number of C1* sub C-states supported using MWAIT + 5, 0, EDX, 11:8, c2_sub_stats, Number of C2* sub C-states supported using MWAIT + 5, 0, EDX, 15:12, c3_sub_stats, Number of C3* sub C-states supported using MWAIT + 5, 0, EDX, 19:16, c4_sub_stats, Number of C4* sub C-states supported using MWAIT + 5, 0, EDX, 23:20, c5_sub_stats, Number of C5* sub C-states supported using MWAIT + 5, 0, EDX, 27:24, c6_sub_stats, Number of C6* sub C-states supported using MWAIT + 5, 0, EDX, 31:28, c7_sub_stats, Number of C7* sub C-states supported using MWAIT + +# Leaf 06H +# Thermal & Power Management + + 6, 0, EAX, 0, dig_temp, Digital temperature sensor supported + 6, 0, EAX, 1, turbo, Intel Turbo Boost + 6, 0, EAX, 2, arat, Always running APIC timer +# 6, 0, EAX, 3, resv, Reserved + 6, 0, EAX, 4, pln, Power limit notifications supported + 6, 0, EAX, 5, ecmd, Clock modulation duty cycle extension supported + 6, 0, EAX, 6, ptm, Package thermal management supported + 6, 0, EAX, 7, hwp, HWP base register + 6, 0, EAX, 8, hwp_notify, HWP notification + 6, 0, EAX, 9, hwp_act_window, HWP activity window + 6, 0, EAX, 10, hwp_energy, HWP energy performance preference + 6, 0, EAX, 11, hwp_pkg_req, HWP package level request +# 6, 0, EAX, 12, resv, Reserved + 6, 0, EAX, 13, hdc, HDC base registers supported + 6, 0, EAX, 14, turbo3, Turbo Boost Max 3.0 + 6, 0, EAX, 15, hwp_cap, Highest Performance change supported + 6, 0, EAX, 16, hwp_peci, HWP PECI override is supported + 6, 0, EAX, 17, hwp_flex, Flexible HWP is supported + 6, 0, EAX, 18, hwp_fast, Fast access mode for the IA32_HWP_REQUEST MSR is supported +# 6, 0, EAX, 19, resv, Reserved + 6, 0, EAX, 20, hwp_ignr, Ignoring Idle Logical Processor HWP request is supported + + 6, 0, EBX, 3:0, therm_irq_thresh, Number of Interrupt Thresholds in Digital Thermal Sensor + 6, 0, ECX, 0, aperfmperf, Presence of IA32_MPERF and IA32_APERF + 6, 0, ECX, 3, energ_bias, Performance-energy bias preference supported + +# Leaf 07H +# ECX == 0 +# AVX512 refers to https://en.wikipedia.org/wiki/AVX-512 +# XXX: Do we really need to enumerate each and every AVX512 sub features + + 7, 0, EBX, 0, fsgsbase, RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE supported + 7, 0, EBX, 1, tsc_adjust, TSC_ADJUST MSR supported + 7, 0, EBX, 2, sgx, Software Guard Extensions + 7, 0, EBX, 3, bmi1, BMI1 + 7, 0, EBX, 4, hle, Hardware Lock Elision + 7, 0, EBX, 5, avx2, AVX2 +# 7, 0, EBX, 6, fdp_excp_only, x87 FPU Data Pointer updated only on x87 exceptions + 7, 0, EBX, 7, smep, Supervisor-Mode Execution Prevention + 7, 0, EBX, 8, bmi2, BMI2 + 7, 0, EBX, 9, rep_movsb, Enhanced REP MOVSB/STOSB + 7, 0, EBX, 10, invpcid, INVPCID instruction + 7, 0, EBX, 11, rtm, Restricted Transactional Memory + 7, 0, EBX, 12, rdt_m, Intel RDT Monitoring capability + 7, 0, EBX, 13, depc_fpu_cs_ds, Deprecates FPU CS and FPU DS + 7, 0, EBX, 14, mpx, Memory Protection Extensions + 7, 0, EBX, 15, rdt_a, Intel RDT Allocation capability + 7, 0, EBX, 16, avx512f, AVX512 Foundation instr + 7, 0, EBX, 17, avx512dq, AVX512 Double and Quadword AVX512 instr + 7, 0, EBX, 18, rdseed, RDSEED instr + 7, 0, EBX, 19, adx, ADX instr + 7, 0, EBX, 20, smap, Supervisor Mode Access Prevention + 7, 0, EBX, 21, avx512ifma, AVX512 Integer Fused Multiply Add +# 7, 0, EBX, 22, resvd, resvd + 7, 0, EBX, 23, clflushopt, CLFLUSHOPT instr + 7, 0, EBX, 24, clwb, CLWB instr + 7, 0, EBX, 25, intel_pt, Intel Processor Trace instr + 7, 0, EBX, 26, avx512pf, Prefetch + 7, 0, EBX, 27, avx512er, AVX512 Exponent Reciproca instr + 7, 0, EBX, 28, avx512cd, AVX512 Conflict Detection instr + 7, 0, EBX, 29, sha, Intel Secure Hash Algorithm Extensions instr + 7, 0, EBX, 26, avx512bw, AVX512 Byte & Word instr + 7, 0, EBX, 28, avx512vl, AVX512 Vector Length Extentions (VL) + 7, 0, ECX, 0, prefetchwt1, X + 7, 0, ECX, 1, avx512vbmi, AVX512 Vector Byte Manipulation Instructions + 7, 0, ECX, 2, umip, User-mode Instruction Prevention + + 7, 0, ECX, 3, pku, Protection Keys for User-mode pages + 7, 0, ECX, 4, ospke, CR4 PKE set to enable protection keys +# 7, 0, ECX, 16:5, resvd, resvd + 7, 0, ECX, 21:17, mawau, The value of MAWAU used by the BNDLDX and BNDSTX instructions in 64-bit mode + 7, 0, ECX, 22, rdpid, RDPID and IA32_TSC_AUX +# 7, 0, ECX, 29:23, resvd, resvd + 7, 0, ECX, 30, sgx_lc, SGX Launch Configuration +# 7, 0, ECX, 31, resvd, resvd + +# Leaf 08H +# + + +# Leaf 09H +# Direct Cache Access (DCA) information + 9, 0, ECX, 31:0, dca_cap, The value of IA32_PLATFORM_DCA_CAP + +# Leaf 0AH +# Architectural Performance Monitoring +# +# Do we really need to print out the PMU related stuff? +# Does normal user really care about it? +# + 0xA, 0, EAX, 7:0, pmu_ver, Performance Monitoring Unit version + 0xA, 0, EAX, 15:8, pmu_gp_cnt_num, Numer of general-purose PMU counters per logical CPU + 0xA, 0, EAX, 23:16, pmu_cnt_bits, Bit wideth of PMU counter + 0xA, 0, EAX, 31:24, pmu_ebx_bits, Length of EBX bit vector to enumerate PMU events + + 0xA, 0, EBX, 0, pmu_no_core_cycle_evt, Core cycle event not available + 0xA, 0, EBX, 1, pmu_no_instr_ret_evt, Instruction retired event not available + 0xA, 0, EBX, 2, pmu_no_ref_cycle_evt, Reference cycles event not available + 0xA, 0, EBX, 3, pmu_no_llc_ref_evt, Last-level cache reference event not available + 0xA, 0, EBX, 4, pmu_no_llc_mis_evt, Last-level cache misses event not available + 0xA, 0, EBX, 5, pmu_no_br_instr_ret_evt, Branch instruction retired event not available + 0xA, 0, EBX, 6, pmu_no_br_mispredict_evt, Branch mispredict retired event not available + + 0xA, 0, ECX, 4:0, pmu_fixed_cnt_num, Performance Monitoring Unit version + 0xA, 0, ECX, 12:5, pmu_fixed_cnt_bits, Numer of PMU counters per logical CPU + +# Leaf 0BH +# Extended Topology Enumeration Leaf +# + + 0xB, 0, EAX, 4:0, id_shift, Number of bits to shift right on x2APIC ID to get a unique topology ID of the next level type + 0xB, 0, EBX, 15:0, cpu_nr, Number of logical processors at this level type + 0xB, 0, ECX, 15:8, lvl_type, 0-Invalid 1-SMT 2-Core + 0xB, 0, EDX, 31:0, x2apic_id, x2APIC ID the current logical processor + + +# Leaf 0DH +# Processor Extended State + + 0xD, 0, EAX, 0, x87, X87 state + 0xD, 0, EAX, 1, sse, SSE state + 0xD, 0, EAX, 2, avx, AVX state + 0xD, 0, EAX, 4:3, mpx, MPX state + 0xD, 0, EAX, 7:5, avx512, AVX-512 state + 0xD, 0, EAX, 9, pkru, PKRU state + + 0xD, 0, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 + 0xD, 0, ECX, 31:0, max_sz_xsave, Maximum size (bytes) of the XSAVE/XRSTOR save area + + 0xD, 1, EAX, 0, xsaveopt, XSAVEOPT available + 0xD, 1, EAX, 1, xsavec, XSAVEC and compacted form supported + 0xD, 1, EAX, 2, xgetbv, XGETBV supported + 0xD, 1, EAX, 3, xsaves, XSAVES/XRSTORS and IA32_XSS supported + + 0xD, 1, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 + 0xD, 1, ECX, 8, pt, PT state + 0xD, 1, ECX, 11, cet_usr, CET user state + 0xD, 1, ECX, 12, cet_supv, CET supervisor state + 0xD, 1, ECX, 13, hdc, HDC state + 0xD, 1, ECX, 16, hwp, HWP state + +# Leaf 0FH +# Intel RDT Monitoring + + 0xF, 0, EBX, 31:0, rmid_range, Maximum range (zero-based) of RMID within this physical processor of all types + 0xF, 0, EDX, 1, l3c_rdt_mon, L3 Cache RDT Monitoring supported + + 0xF, 1, ECX, 31:0, rmid_range, Maximum range (zero-based) of RMID of this types + 0xF, 1, EDX, 0, l3c_ocp_mon, L3 Cache occupancy Monitoring supported + 0xF, 1, EDX, 1, l3c_tbw_mon, L3 Cache Total Bandwidth Monitoring supported + 0xF, 1, EDX, 2, l3c_lbw_mon, L3 Cache Local Bandwidth Monitoring supported + +# Leaf 10H +# Intel RDT Allocation + + 0x10, 0, EBX, 1, l3c_rdt_alloc, L3 Cache Allocation supported + 0x10, 0, EBX, 2, l2c_rdt_alloc, L2 Cache Allocation supported + 0x10, 0, EBX, 3, mem_bw_alloc, Memory Bandwidth Allocation supported + + +# Leaf 12H +# SGX Capability +# +# Some detailed SGX features not added yet + + 0x12, 0, EAX, 0, sgx1, L3 Cache Allocation supported + 0x12, 1, EAX, 0, sgx2, L3 Cache Allocation supported + + +# Leaf 14H +# Intel Processor Tracer +# + +# Leaf 15H +# Time Stamp Counter and Nominal Core Crystal Clock Information + + 0x15, 0, EAX, 31:0, tsc_denominator, The denominator of the TSC/”core crystal clock” ratio + 0x15, 0, EBX, 31:0, tsc_numerator, The numerator of the TSC/”core crystal clock” ratio + 0x15, 0, ECX, 31:0, nom_freq, Nominal frequency of the core crystal clock in Hz + +# Leaf 16H +# Processor Frequency Information + + 0x16, 0, EAX, 15:0, cpu_base_freq, Processor Base Frequency in MHz + 0x16, 0, EBX, 15:0, cpu_max_freq, Maximum Frequency in MHz + 0x16, 0, ECX, 15:0, bus_freq, Bus (Reference) Frequency in MHz + +# Leaf 17H +# System-On-Chip Vendor Attribute + + 0x17, 0, EAX, 31:0, max_socid, Maximum input value of supported sub-leaf + 0x17, 0, EBX, 15:0, soc_vid, SOC Vendor ID + 0x17, 0, EBX, 16, std_vid, SOC Vendor ID is assigned via an industry standard scheme + 0x17, 0, ECX, 31:0, soc_pid, SOC Project ID assigned by vendor + 0x17, 0, EDX, 31:0, soc_sid, SOC Stepping ID + +# Leaf 18H +# Deterministic Address Translation Parameters + + +# Leaf 19H +# Key Locker Leaf + + +# Leaf 1AH +# Hybrid Information + + 0x1A, 0, EAX, 31:24, core_type, 20H-Intel_Atom 40H-Intel_Core + + +# Leaf 1FH +# V2 Extended Topology - A preferred superset to leaf 0BH + + +# According to SDM +# 40000000H - 4FFFFFFFH is invalid range + + +# Leaf 80000001H +# Extended Processor Signature and Feature Bits + +0x80000001, 0, ECX, 0, lahf_lm, LAHF/SAHF available in 64-bit mode +0x80000001, 0, ECX, 5, lzcnt, LZCNT +0x80000001, 0, ECX, 8, prefetchw, PREFETCHW + +0x80000001, 0, EDX, 11, sysret, SYSCALL/SYSRET supported +0x80000001, 0, EDX, 20, exec_dis, Execute Disable Bit available +0x80000001, 0, EDX, 26, 1gb_page, 1GB page supported +0x80000001, 0, EDX, 27, rdtscp, RDTSCP and IA32_TSC_AUX are available +#0x80000001, 0, EDX, 29, 64b, 64b Architecture supported + +# Leaf 80000002H/80000003H/80000004H +# Processor Brand String + +# Leaf 80000005H +# Reserved + +# Leaf 80000006H +# Extended L2 Cache Features + +0x80000006, 0, ECX, 7:0, clsize, Cache Line size in bytes +0x80000006, 0, ECX, 15:12, l2c_assoc, L2 Associativity +0x80000006, 0, ECX, 31:16, csize, Cache size in 1K units + + +# Leaf 80000007H + +0x80000007, 0, EDX, 8, nonstop_tsc, Invariant TSC available + + +# Leaf 80000008H + +0x80000008, 0, EAX, 7:0, phy_adr_bits, Physical Address Bits +0x80000008, 0, EAX, 15:8, lnr_adr_bits, Linear Address Bits +0x80000007, 0, EBX, 9, wbnoinvd, WBNOINVD diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c new file mode 100644 index 000000000000..6048da34fcc6 --- /dev/null +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include +#include +#include +#include +#include + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +typedef unsigned int u32; +typedef unsigned long long u64; + +char *def_csv = "/usr/share/misc/cpuid.csv"; +char *user_csv; + + +/* Cover both single-bit flag and multiple-bits fields */ +struct bits_desc { + /* start and end bits */ + int start, end; + /* 0 or 1 for 1-bit flag */ + int value; + char simp[32]; + char detail[256]; +}; + +/* descriptor info for eax/ebx/ecx/edx */ +struct reg_desc { + /* number of valid entries */ + int nr; + struct bits_desc descs[32]; +}; + +enum { + R_EAX = 0, + R_EBX, + R_ECX, + R_EDX, + NR_REGS +}; + +struct subleaf { + u32 index; + u32 sub; + u32 eax, ebx, ecx, edx; + struct reg_desc info[NR_REGS]; +}; + +/* Represent one leaf (basic or extended) */ +struct cpuid_func { + /* + * Array of subleafs for this func, if there is no subleafs + * then the leafs[0] is the main leaf + */ + struct subleaf *leafs; + int nr; +}; + +struct cpuid_range { + /* array of main leafs */ + struct cpuid_func *funcs; + /* number of valid leafs */ + int nr; + bool is_ext; +}; + +/* + * basic: basic functions range: [0... ] + * ext: extended functions range: [0x80000000... ] + */ +struct cpuid_range *leafs_basic, *leafs_ext; + +static int num_leafs; +static bool is_amd; +static bool show_details; +static bool show_raw; +static bool show_flags_only = true; +static u32 user_index = 0xFFFFFFFF; +static u32 user_sub = 0xFFFFFFFF; +static int flines; + +static inline void cpuid(u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +static inline bool has_subleafs(u32 f) +{ + if (f == 0x7 || f == 0xd) + return true; + + if (is_amd) { + if (f == 0x8000001d) + return true; + return false; + } + + switch (f) { + case 0x4: + case 0xb: + case 0xf: + case 0x10: + case 0x14: + case 0x18: + case 0x1f: + return true; + default: + return false; + } +} + +static void leaf_print_raw(struct subleaf *leaf) +{ + if (has_subleafs(leaf->index)) { + if (leaf->sub == 0) + printf("0x%08x: subleafs:\n", leaf->index); + + printf(" %2d: EAX=0x%08x, EBX=0x%08x, ECX=0x%08x, EDX=0x%08x\n", + leaf->sub, leaf->eax, leaf->ebx, leaf->ecx, leaf->edx); + } else { + printf("0x%08x: EAX=0x%08x, EBX=0x%08x, ECX=0x%08x, EDX=0x%08x\n", + leaf->index, leaf->eax, leaf->ebx, leaf->ecx, leaf->edx); + } +} + +/* Return true is the input eax/ebx/ecx/edx are all zero */ +static bool cpuid_store(struct cpuid_range *range, u32 f, int subleaf, + u32 a, u32 b, u32 c, u32 d) +{ + struct cpuid_func *func; + struct subleaf *leaf; + int s = 0; + + if (a == 0 && b == 0 && c == 0 && d == 0) + return true; + + /* + * Cut off vendor-prefix from CPUID function as we're using it as an + * index into ->funcs. + */ + func = &range->funcs[f & 0xffff]; + + if (!func->leafs) { + func->leafs = malloc(sizeof(struct subleaf)); + if (!func->leafs) + perror("malloc func leaf"); + + func->nr = 1; + } else { + s = func->nr; + func->leafs = realloc(func->leafs, (s + 1) * sizeof(*leaf)); + if (!func->leafs) + perror("realloc f->leafs"); + + func->nr++; + } + + leaf = &func->leafs[s]; + + leaf->index = f; + leaf->sub = subleaf; + leaf->eax = a; + leaf->ebx = b; + leaf->ecx = c; + leaf->edx = d; + + return false; +} + +static void raw_dump_range(struct cpuid_range *range) +{ + u32 f; + int i; + + printf("%s Leafs :\n", range->is_ext ? "Extended" : "Basic"); + printf("================\n"); + + for (f = 0; (int)f < range->nr; f++) { + struct cpuid_func *func = &range->funcs[f]; + u32 index = f; + + if (range->is_ext) + index += 0x80000000; + + /* Skip leaf without valid items */ + if (!func->nr) + continue; + + /* First item is the main leaf, followed by all subleafs */ + for (i = 0; i < func->nr; i++) + leaf_print_raw(&func->leafs[i]); + } +} + +#define MAX_SUBLEAF_NUM 32 +struct cpuid_range *setup_cpuid_range(u32 input_eax) +{ + u32 max_func, idx_func; + int subleaf; + struct cpuid_range *range; + u32 eax, ebx, ecx, edx; + u32 f = input_eax; + int max_subleaf; + bool allzero; + + eax = input_eax; + ebx = ecx = edx = 0; + + cpuid(&eax, &ebx, &ecx, &edx); + max_func = eax; + idx_func = (max_func & 0xffff) + 1; + + range = malloc(sizeof(struct cpuid_range)); + if (!range) + perror("malloc range"); + + if (input_eax & 0x80000000) + range->is_ext = true; + else + range->is_ext = false; + + range->funcs = malloc(sizeof(struct cpuid_func) * idx_func); + if (!range->funcs) + perror("malloc range->funcs"); + + range->nr = idx_func; + memset(range->funcs, 0, sizeof(struct cpuid_func) * idx_func); + + for (; f <= max_func; f++) { + eax = f; + subleaf = ecx = 0; + + cpuid(&eax, &ebx, &ecx, &edx); + allzero = cpuid_store(range, f, subleaf, eax, ebx, ecx, edx); + if (allzero) + continue; + num_leafs++; + + if (!has_subleafs(f)) + continue; + + max_subleaf = MAX_SUBLEAF_NUM; + + /* + * Some can provide the exact number of subleafs, + * others have to be tried (0xf) + */ + if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18) + max_subleaf = (eax & 0xff) + 1; + + if (f == 0xb) + max_subleaf = 2; + + for (subleaf = 1; subleaf < max_subleaf; subleaf++) { + eax = f; + ecx = subleaf; + + cpuid(&eax, &ebx, &ecx, &edx); + allzero = cpuid_store(range, f, subleaf, + eax, ebx, ecx, edx); + if (allzero) + continue; + num_leafs++; + } + + } + + return range; +} + +/* + * The basic row format for cpuid.csv is + * LEAF,SUBLEAF,register_name,bits,short name,long description + * + * like: + * 0, 0, EAX, 31:0, max_basic_leafs, Max input value for supported subleafs + * 1, 0, ECX, 0, sse3, Streaming SIMD Extensions 3(SSE3) + */ +static int parse_line(char *line) +{ + char *str; + int i; + struct cpuid_range *range; + struct cpuid_func *func; + struct subleaf *leaf; + u32 index; + u32 sub; + char buffer[512]; + char *buf; + /* + * Tokens: + * 1. leaf + * 2. subleaf + * 3. register + * 4. bits + * 5. short name + * 6. long detail + */ + char *tokens[6]; + struct reg_desc *reg; + struct bits_desc *bdesc; + int reg_index; + char *start, *end; + + /* Skip comments and NULL line */ + if (line[0] == '#' || line[0] == '\n') + return 0; + + strncpy(buffer, line, 511); + buffer[511] = 0; + str = buffer; + for (i = 0; i < 5; i++) { + tokens[i] = strtok(str, ","); + if (!tokens[i]) + goto err_exit; + str = NULL; + } + tokens[5] = strtok(str, "\n"); + + /* index/main-leaf */ + index = strtoull(tokens[0], NULL, 0); + + if (index & 0x80000000) + range = leafs_ext; + else + range = leafs_basic; + + index &= 0x7FFFFFFF; + /* Skip line parsing for non-existing indexes */ + if ((int)index >= range->nr) + return -1; + + func = &range->funcs[index]; + + /* Return if the index has no valid item on this platform */ + if (!func->nr) + return 0; + + /* subleaf */ + sub = strtoul(tokens[1], NULL, 0); + if ((int)sub > func->nr) + return -1; + + leaf = &func->leafs[sub]; + buf = tokens[2]; + + if (strcasestr(buf, "EAX")) + reg_index = R_EAX; + else if (strcasestr(buf, "EBX")) + reg_index = R_EBX; + else if (strcasestr(buf, "ECX")) + reg_index = R_ECX; + else if (strcasestr(buf, "EDX")) + reg_index = R_EDX; + else + goto err_exit; + + reg = &leaf->info[reg_index]; + bdesc = ®->descs[reg->nr++]; + + /* bit flag or bits field */ + buf = tokens[3]; + + end = strtok(buf, ":"); + bdesc->end = strtoul(end, NULL, 0); + bdesc->start = bdesc->end; + + /* start != NULL means it is bit fields */ + start = strtok(NULL, ":"); + if (start) + bdesc->start = strtoul(start, NULL, 0); + + strcpy(bdesc->simp, tokens[4]); + strcpy(bdesc->detail, tokens[5]); + return 0; + +err_exit: + printf("Warning: wrong line format:\n"); + printf("\tline[%d]: %s\n", flines, line); + return -1; +} + +/* Parse csv file, and construct the array of all leafs and subleafs */ +static void parse_text(void) +{ + FILE *file; + char *filename, *line = NULL; + size_t len = 0; + int ret; + + if (show_raw) + return; + + filename = user_csv ? user_csv : def_csv; + file = fopen(filename, "r"); + if (!file) { + /* Fallback to a csv in the same dir */ + file = fopen("./cpuid.csv", "r"); + } + + if (!file) { + printf("Fail to open '%s'\n", filename); + return; + } + + while (1) { + ret = getline(&line, &len, file); + flines++; + if (ret > 0) + parse_line(line); + + if (feof(file)) + break; + } + + fclose(file); +} + + +/* Decode every eax/ebx/ecx/edx */ +static void decode_bits(u32 value, struct reg_desc *rdesc) +{ + struct bits_desc *bdesc; + int start, end, i; + u32 mask; + + for (i = 0; i < rdesc->nr; i++) { + bdesc = &rdesc->descs[i]; + + start = bdesc->start; + end = bdesc->end; + if (start == end) { + /* single bit flag */ + if (value & (1 << start)) + printf("\t%-20s %s%s\n", + bdesc->simp, + show_details ? "-" : "", + show_details ? bdesc->detail : "" + ); + } else { + /* bit fields */ + if (show_flags_only) + continue; + + mask = ((u64)1 << (end - start + 1)) - 1; + printf("\t%-20s\t: 0x%-8x\t%s%s\n", + bdesc->simp, + (value >> start) & mask, + show_details ? "-" : "", + show_details ? bdesc->detail : "" + ); + } + } +} + +static void show_leaf(struct subleaf *leaf) +{ + if (!leaf) + return; + + if (show_raw) + leaf_print_raw(leaf); + + decode_bits(leaf->eax, &leaf->info[R_EAX]); + decode_bits(leaf->ebx, &leaf->info[R_EBX]); + decode_bits(leaf->ecx, &leaf->info[R_ECX]); + decode_bits(leaf->edx, &leaf->info[R_EDX]); +} + +static void show_func(struct cpuid_func *func) +{ + int i; + + if (!func) + return; + + for (i = 0; i < func->nr; i++) + show_leaf(&func->leafs[i]); +} + +static void show_range(struct cpuid_range *range) +{ + int i; + + for (i = 0; i < range->nr; i++) + show_func(&range->funcs[i]); +} + +static inline struct cpuid_func *index_to_func(u32 index) +{ + struct cpuid_range *range; + + range = (index & 0x80000000) ? leafs_ext : leafs_basic; + index &= 0x7FFFFFFF; + + if (((index & 0xFFFF) + 1) > (u32)range->nr) { + printf("ERR: invalid input index (0x%x)\n", index); + return NULL; + } + return &range->funcs[index]; +} + +static void show_info(void) +{ + struct cpuid_func *func; + + if (show_raw) { + /* Show all of the raw output of 'cpuid' instr */ + raw_dump_range(leafs_basic); + raw_dump_range(leafs_ext); + return; + } + + if (user_index != 0xFFFFFFFF) { + /* Only show specific leaf/subleaf info */ + func = index_to_func(user_index); + if (!func) + return; + + /* Dump the raw data also */ + show_raw = true; + + if (user_sub != 0xFFFFFFFF) { + if (user_sub + 1 <= (u32)func->nr) { + show_leaf(&func->leafs[user_sub]); + return; + } + + printf("ERR: invalid input subleaf (0x%x)\n", user_sub); + } + + show_func(func); + return; + } + + printf("CPU features:\n=============\n\n"); + show_range(leafs_basic); + show_range(leafs_ext); +} + +static void setup_platform_cpuid(void) +{ + u32 eax, ebx, ecx, edx; + + /* Check vendor */ + eax = ebx = ecx = edx = 0; + cpuid(&eax, &ebx, &ecx, &edx); + + /* "htuA" */ + if (ebx == 0x68747541) + is_amd = true; + + /* Setup leafs for the basic and extended range */ + leafs_basic = setup_cpuid_range(0x0); + leafs_ext = setup_cpuid_range(0x80000000); +} + +static void usage(void) +{ + printf("kcpuid [-abdfhr] [-l leaf] [-s subleaf]\n" + "\t-a|--all Show both bit flags and complex bit fields info\n" + "\t-b|--bitflags Show boolean flags only\n" + "\t-d|--detail Show details of the flag/fields (default)\n" + "\t-f|--flags Specify the cpuid csv file\n" + "\t-h|--help Show usage info\n" + "\t-l|--leaf=index Specify the leaf you want to check\n" + "\t-r|--raw Show raw cpuid data\n" + "\t-s|--subleaf=sub Specify the subleaf you want to check\n" + ); +} + +static struct option opts[] = { + { "all", no_argument, NULL, 'a' }, /* show both bit flags and fields */ + { "bitflags", no_argument, NULL, 'b' }, /* only show bit flags, default on */ + { "detail", no_argument, NULL, 'd' }, /* show detail descriptions */ + { "file", required_argument, NULL, 'f' }, /* use user's cpuid file */ + { "help", no_argument, NULL, 'h'}, /* show usage */ + { "leaf", required_argument, NULL, 'l'}, /* only check a specific leaf */ + { "raw", no_argument, NULL, 'r'}, /* show raw CPUID leaf data */ + { "subleaf", required_argument, NULL, 's'}, /* check a specific subleaf */ + { NULL, 0, NULL, 0 } +}; + +static int parse_options(int argc, char *argv[]) +{ + int c; + + while ((c = getopt_long(argc, argv, "abdf:hl:rs:", + opts, NULL)) != -1) + switch (c) { + case 'a': + show_flags_only = false; + break; + case 'b': + show_flags_only = true; + break; + case 'd': + show_details = true; + break; + case 'f': + user_csv = optarg; + break; + case 'h': + usage(); + exit(1); + break; + case 'l': + /* main leaf */ + user_index = strtoul(optarg, NULL, 0); + break; + case 'r': + show_raw = true; + break; + case 's': + /* subleaf */ + user_sub = strtoul(optarg, NULL, 0); + break; + default: + printf("%s: Invalid option '%c'\n", argv[0], optopt); + return -1; + } + + return 0; +} + +/* + * Do 4 things in turn: + * 1. Parse user options + * 2. Parse and store all the CPUID leaf data supported on this platform + * 2. Parse the csv file, while skipping leafs which are not available + * on this platform + * 3. Print leafs info based on user options + */ +int main(int argc, char *argv[]) +{ + if (parse_options(argc, argv)) + return -1; + + /* Setup the cpuid leafs of current platform */ + setup_platform_cpuid(); + + /* Read and parse the 'cpuid.csv' */ + parse_text(); + + show_info(); + return 0; +} -- cgit v1.2.3 From 2942a671a37b61186e1073c76a17b1a631111f83 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sun, 7 Mar 2021 14:30:24 -0800 Subject: tools include: Add __sum16 and __wsum definitions. This adds definitions available in the uapi version. Explanation: In the kernel include of types.h the uapi version is included. In tools the uapi/linux/types.h and linux/types.h are distinct. For BPF programs a definition of __wsum is needed by the generated bpf_helpers.h. The definition comes either from a generated vmlinux.h or from that may be transitively included from bpf.h. The perf build prefers linux/types.h over uapi/linux/types.h for *. To allow tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c to compile with the same include path used for perf then these definitions are necessary. There is likely a wider conversation about exactly how types.h should be specified and the include order used by the perf build - it is somewhat confusing that tools/include/uapi/linux/bpf.h is using the non-uapi types.h. *see tools/perf/Makefile.config: ... INC_FLAGS += -I$(srctree)/tools/include/ INC_FLAGS += -I$(srctree)/tools/arch/$(SRCARCH)/include/uapi INC_FLAGS += -I$(srctree)/tools/include/uapi ... The include directories are scanned from left-to-right: https://gcc.gnu.org/onlinedocs/gcc/Directory-Options.html As tools/include/linux/types.h appears before tools/include/uapi/linux/types.h then I say it is preferred. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Tiezhu Yang Cc: Yonghong Song Cc: bpf@vger.kernel.org Cc: netdev@vger.kernel.org Link: http://lore.kernel.org/lkml/20210307223024.4081067-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/include/linux/types.h b/tools/include/linux/types.h index e9c5a215837d..6e14a533ab4e 100644 --- a/tools/include/linux/types.h +++ b/tools/include/linux/types.h @@ -61,6 +61,9 @@ typedef __u32 __bitwise __be32; typedef __u64 __bitwise __le64; typedef __u64 __bitwise __be64; +typedef __u16 __bitwise __sum16; +typedef __u32 __bitwise __wsum; + typedef struct { int counter; } atomic_t; -- cgit v1.2.3 From 210e4c89ef61432040c6cd828fefa441f4887186 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 8 Mar 2021 11:17:51 -0300 Subject: perf symbols: Fix dso__fprintf_symbols_by_name() to return the number of printed chars The 'ret' variable was initialized to zero but then it was not updated from the fprintf() return, fix it. Reported-by: Yang Li cc: Alexander Shishkin cc: Ingo Molnar cc: Jiri Olsa cc: Mark Rutland cc: Namhyung Kim Cc: Peter Zijlstra Cc: Srikar Dronamraju Fixes: 90f18e63fbd00513 ("perf symbols: List symbols in a dso in ascending name order") Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol_fprintf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/symbol_fprintf.c b/tools/perf/util/symbol_fprintf.c index 35c936ce33ef..2664fb65e47a 100644 --- a/tools/perf/util/symbol_fprintf.c +++ b/tools/perf/util/symbol_fprintf.c @@ -68,7 +68,7 @@ size_t dso__fprintf_symbols_by_name(struct dso *dso, for (nd = rb_first_cached(&dso->symbol_names); nd; nd = rb_next(nd)) { pos = rb_entry(nd, struct symbol_name_rb_node, rb_node); - fprintf(fp, "%s\n", pos->sym.name); + ret += fprintf(fp, "%s\n", pos->sym.name); } return ret; -- cgit v1.2.3 From 299194a91451263020c73dd2a3b7e0218c88dbd0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 5 Mar 2021 14:40:50 +0100 Subject: selftests/bpf: Fix test_attach_probe for powerpc uprobes When testing uprobes we the test gets GEP (Global Entry Point) address from kallsyms, but then the function is called locally so the uprobe is not triggered. Fixing this by adjusting the address to LEP (Local Entry Point) for powerpc arch plus instruction check stolen from ppc_function_entry function pointed out and explained by Michael and Naveen. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Naveen N. Rao Link: https://lore.kernel.org/bpf/20210305134050.139840-1-jolsa@kernel.org --- .../selftests/bpf/prog_tests/attach_probe.c | 40 +++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index a0ee87c8e1ea..9dc4e3dfbcf3 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -2,6 +2,44 @@ #include #include "test_attach_probe.skel.h" +#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 + +#define OP_RT_RA_MASK 0xffff0000UL +#define LIS_R2 0x3c400000UL +#define ADDIS_R2_R12 0x3c4c0000UL +#define ADDI_R2_R2 0x38420000UL + +static ssize_t get_offset(ssize_t addr, ssize_t base) +{ + u32 *insn = (u32 *) addr; + + /* + * A PPC64 ABIv2 function may have a local and a global entry + * point. We need to use the local entry point when patching + * functions, so identify and step over the global entry point + * sequence. + * + * The global entry point sequence is always of the form: + * + * addis r2,r12,XXXX + * addi r2,r2,XXXX + * + * A linker optimisation may convert the addis to lis: + * + * lis r2,XXXX + * addi r2,r2,XXXX + */ + if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) || + ((*insn & OP_RT_RA_MASK) == LIS_R2)) && + ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2)) + return (ssize_t)(insn + 2) - base; + else + return addr - base; +} +#else +#define get_offset(addr, base) (addr - base) +#endif + ssize_t get_base_addr() { size_t start, offset; char buf[256]; @@ -36,7 +74,7 @@ void test_attach_probe(void) if (CHECK(base_addr < 0, "get_base_addr", "failed to find base addr: %zd", base_addr)) return; - uprobe_offset = (size_t)&get_base_addr - base_addr; + uprobe_offset = get_offset((size_t)&get_base_addr, base_addr); skel = test_attach_probe__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) -- cgit v1.2.3 From 291471dd1559528a4c2ad5026eff94ed1030562b Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Fri, 5 Mar 2021 10:41:13 +0100 Subject: libbpf, xsk: Add libbpf_smp_store_release libbpf_smp_load_acquire MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the AF_XDP rings have load-acquire/store-release semantics, move libbpf to that as well. The library-internal libbpf_smp_{load_acquire,store_release} are only valid for 32-bit words on ARM64. Also, remove the barriers that are no longer in use. Signed-off-by: Björn Töpel Signed-off-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210305094113.413544-3-bjorn.topel@gmail.com --- tools/lib/bpf/libbpf_util.h | 72 +++++++++++++++++++++++++++++++-------------- tools/lib/bpf/xsk.h | 17 ++++------- 2 files changed, 55 insertions(+), 34 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf_util.h b/tools/lib/bpf/libbpf_util.h index 59c779c5790c..94a0d7bb6f3c 100644 --- a/tools/lib/bpf/libbpf_util.h +++ b/tools/lib/bpf/libbpf_util.h @@ -5,6 +5,7 @@ #define __LIBBPF_LIBBPF_UTIL_H #include +#include #ifdef __cplusplus extern "C" { @@ -15,29 +16,56 @@ extern "C" { * application that uses libbpf. */ #if defined(__i386__) || defined(__x86_64__) -# define libbpf_smp_rmb() asm volatile("" : : : "memory") -# define libbpf_smp_wmb() asm volatile("" : : : "memory") -# define libbpf_smp_mb() \ - asm volatile("lock; addl $0,-4(%%rsp)" : : : "memory", "cc") -/* Hinders stores to be observed before older loads. */ -# define libbpf_smp_rwmb() asm volatile("" : : : "memory") +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile("" : : : "memory"); \ + WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + asm volatile("" : : : "memory"); \ + ___p1; \ + }) #elif defined(__aarch64__) -# define libbpf_smp_rmb() asm volatile("dmb ishld" : : : "memory") -# define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory") -# define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_rwmb() libbpf_smp_mb() -#elif defined(__arm__) -/* These are only valid for armv7 and above */ -# define libbpf_smp_rmb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory") -# define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_rwmb() libbpf_smp_mb() -#else -/* Architecture missing native barrier functions. */ -# define libbpf_smp_rmb() __sync_synchronize() -# define libbpf_smp_wmb() __sync_synchronize() -# define libbpf_smp_mb() __sync_synchronize() -# define libbpf_smp_rwmb() __sync_synchronize() +# define libbpf_smp_store_release(p, v) \ + asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory") +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1; \ + asm volatile ("ldar %w0, %1" \ + : "=r" (___p1) : "Q" (*p) : "memory"); \ + __p1; \ + }) +#elif defined(__riscv) +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile ("fence rw,w" : : : "memory"); \ + WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + asm volatile ("fence r,rw" : : : "memory"); \ + ___p1; \ + }) +#endif + +#ifndef libbpf_smp_store_release +#define libbpf_smp_store_release(p, v) \ + do { \ + __sync_synchronize(); \ + WRITE_ONCE(*p, v); \ + } while (0) +#endif + +#ifndef libbpf_smp_load_acquire +#define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + __sync_synchronize(); \ + ___p1; \ + }) #endif #ifdef __cplusplus diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h index e9f121f5d129..a9fdea87b5cd 100644 --- a/tools/lib/bpf/xsk.h +++ b/tools/lib/bpf/xsk.h @@ -96,7 +96,8 @@ static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) * this function. Without this optimization it whould have been * free_entries = r->cached_prod - r->cached_cons + r->size. */ - r->cached_cons = *r->consumer + r->size; + r->cached_cons = libbpf_smp_load_acquire(r->consumer); + r->cached_cons += r->size; return r->cached_cons - r->cached_prod; } @@ -106,7 +107,7 @@ static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb) __u32 entries = r->cached_prod - r->cached_cons; if (entries == 0) { - r->cached_prod = *r->producer; + r->cached_prod = libbpf_smp_load_acquire(r->producer); entries = r->cached_prod - r->cached_cons; } @@ -129,9 +130,7 @@ static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) /* Make sure everything has been written to the ring before indicating * this to the kernel by writing the producer pointer. */ - libbpf_smp_wmb(); - - *prod->producer += nb; + libbpf_smp_store_release(prod->producer, *prod->producer + nb); } static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx) @@ -139,11 +138,6 @@ static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __ __u32 entries = xsk_cons_nb_avail(cons, nb); if (entries > 0) { - /* Make sure we do not speculatively read the data before - * we have received the packet buffers from the ring. - */ - libbpf_smp_rmb(); - *idx = cons->cached_cons; cons->cached_cons += entries; } @@ -161,9 +155,8 @@ static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb) /* Make sure data has been read before indicating we are done * with the entries by updating the consumer pointer. */ - libbpf_smp_rwmb(); + libbpf_smp_store_release(cons->consumer, *cons->consumer + nb); - *cons->consumer += nb; } static inline void *xsk_umem__get_data(void *umem_area, __u64 addr) -- cgit v1.2.3 From a6aac408c56112f73d28ea8567c29b2a7fe8fccc Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 8 Mar 2021 19:25:22 +0100 Subject: libbpf: Fix arm64 build The macro for libbpf_smp_store_release() doesn't build on arm64, fix it. Fixes: 291471dd1559 ("libbpf, xsk: Add libbpf_smp_store_release libbpf_smp_load_acquire") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210308182521.155536-1-jean-philippe@linaro.org --- tools/lib/bpf/libbpf_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf_util.h b/tools/lib/bpf/libbpf_util.h index 94a0d7bb6f3c..cfbcfc063c81 100644 --- a/tools/lib/bpf/libbpf_util.h +++ b/tools/lib/bpf/libbpf_util.h @@ -35,7 +35,7 @@ extern "C" { typeof(*p) ___p1; \ asm volatile ("ldar %w0, %1" \ : "=r" (___p1) : "Q" (*p) : "memory"); \ - __p1; \ + ___p1; \ }) #elif defined(__riscv) # define libbpf_smp_store_release(p, v) \ -- cgit v1.2.3 From a0d73acc1e4bc1c542701e37b2e0e233fe6a271d Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 8 Mar 2021 19:28:31 +0100 Subject: selftests/bpf: Fix typo in Makefile The selftest build fails when trying to install the scripts: rsync: [sender] link_stat "tools/testing/selftests/bpf/test_docs_build.sh" failed: No such file or directory (2) Fix the filename. Fixes: a01d935b2e09 ("tools/bpf: Remove bpf-helpers from bpftool docs") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210308182830.155784-1-jean-philippe@linaro.org --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index b5827464c6b5..c3999587bc23 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -68,7 +68,7 @@ TEST_PROGS := test_kmod.sh \ test_bpftool_build.sh \ test_bpftool.sh \ test_bpftool_metadata.sh \ - test_docs_build.sh \ + test_doc_build.sh \ test_xsk.sh TEST_PROGS_EXTENDED := with_addr.sh \ -- cgit v1.2.3 From c71c39b344f7eec9d4492913f22126b03bb7b746 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jan 2021 15:56:53 -0800 Subject: rcutorture: Use "all" and "N" in "nohz_full" and "rcu_nocbs" This commit uses the shiny new "all" and "N" cpumask options to decouple the "nohz_full" and "rcu_nocbs" kernel boot parameters in the TREE04.boot and TREE08.boot files from the CONFIG_NR_CPUS options in the TREE04 and TREE08 files. Reported-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot | 2 +- tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot index 5adc6756792a..a8d94caf7d2f 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot @@ -1 +1 @@ -rcutree.rcu_fanout_leaf=4 nohz_full=1-7 +rcutree.rcu_fanout_leaf=4 nohz_full=1-N diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot index 22478fd3a865..94d38445d393 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot @@ -1,3 +1,3 @@ rcupdate.rcu_self_test=1 rcutree.rcu_fanout_exact=1 -rcu_nocbs=0-7 +rcu_nocbs=all -- cgit v1.2.3 From e2b949d54392ad890bb10fb8954d967e2fcd7503 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Jan 2021 16:11:04 -0800 Subject: rcutorture: Make TREE03 use real-time tree.use_softirq setting TREE03 tests RCU priority boosting, which is a real-time feature. It would also be good if it tested something closer to what is actually used by the real-time folks. This commit therefore adds tree.use_softirq=0 to the TREE03 kernel boot parameters in TREE03.boot. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index 1c218944b1e9..64f864f1f361 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -4,3 +4,4 @@ rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcutree.kthread_prio=2 threadirqs +tree.use_softirq=0 -- cgit v1.2.3 From 8126c57f00cea3502a017b7c76df1fac58f89e88 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Feb 2021 13:25:58 -0800 Subject: torture: Make jitter.sh handle large systems The current jitter.sh script expects cpumask bits to fit into whatever the awk interpreter uses for an integer, which clearly does not hold for even medium-sized systems these days. This means that on a large system, only the first 32 or 64 CPUs (depending) are subjected to jitter.sh CPU-time perturbations. This commit therefore computes a given CPU's cpumask using text manipulation rather than arithmetic shifts. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/jitter.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index 188b864bc4bf..3a856ec2e92a 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -67,10 +67,10 @@ do srand(n + me + systime()); ncpus = split(cpus, ca); curcpu = ca[int(rand() * ncpus + 1)]; - mask = lshift(1, curcpu); - if (mask + 0 <= 0) - mask = 1; - printf("%#x\n", mask); + z = ""; + for (i = 1; 4 * i <= curcpu; i++) + z = z "0"; + print "0x" 2 ^ (curcpu % 4) z; }' < /dev/null` n=$(($n+1)) if ! taskset -p $cpumask $$ > /dev/null 2>&1 -- cgit v1.2.3 From a519d21480d330918bd522499a323432c31b6ec2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Jan 2021 10:50:32 -0800 Subject: torturescript: Don't rerun failed rcutorture builds If the build fails when running multiple instances of a given rcutorture scenario, for example, using the kvm.sh --configs "8*RUDE01" argument, the build will be rerun an additional seven times. This is in some sense correct, but it can waste significant time. This commit therefore checks for a prior failed build and simply copies over that build's output. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 536d103ef166..9d8a82cac808 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -73,7 +73,7 @@ config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG" cp $T/KcList $resdir/ConfigFragment base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` -if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux +if test "$base_resdir" != "$resdir" && test -f $base_resdir/bzImage && test -f $base_resdir/vmlinux then # Rerunning previous test, so use that test's kernel. QEMU="`identify_qemu $base_resdir/vmlinux`" @@ -83,6 +83,17 @@ then ln -s $base_resdir/.config $resdir # for kvm-recheck.sh # Arch-independent indicator touch $resdir/builtkernel +elif test "$base_resdir" != "$resdir" +then + # Rerunning previous test for which build failed + ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh + ln -s $base_resdir/.config $resdir # for kvm-recheck.sh + echo Initial build failed, not running KVM, see $resdir. + if test -f $builddir.wait + then + mv $builddir.wait $builddir.ready + fi + exit 1 elif kvm-build.sh $T/KcList $resdir then # Had to build a kernel for this test. -- cgit v1.2.3 From 3d4977b68101b38c3f9d3be3d89e17ef1fdfc1d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2021 16:38:19 -0800 Subject: torture: Allow 1G of memory for torture.sh kvfree testing Yes, I do recall a time when 512MB of memory was a lot of mass storage, much less main memory, but the rcuscale kvfree_rcu() testing invoked by torture.sh can sometimes exceed it on large systems, resulting in OOM. This commit therefore causes torture.sh to pase the "--memory 1G" argument to kvm.sh to reserve a full gigabyte for this purpose. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index ad7525b7ac29..56e2e1a42569 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -374,7 +374,7 @@ done if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make fi echo " --- " $scriptname $args -- cgit v1.2.3 From a8dafbf3a5465bea6d9b45a4f011ba9b56d8b267 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Feb 2021 15:44:29 -0800 Subject: torture: Provide bare-metal modprobe-based advice In some environments, the torture-testing use of virtualization is inconvenient. In such cases, the modprobe and rmmod commands may be used to do torture testing, but significant setup is required to build, boot, and modprobe a kernel so as to match a given torture-test scenario. This commit therefore creates a "bare-metal" file in each results directory containing steps to run the corresponding scenario using the modprobe command on bare metal. For example, the contents of this file after using kvm.sh to build an rcutorture TREE01 kernel, perhaps with the --buildonly argument, is as follows: To run this scenario on bare metal: 1. Set your bare-metal build tree to the state shown in this file: /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2021.02.04-17.10.19/testid.txt 2. Update your bare-metal build tree's .config based on this file: /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2021.02.04-17.10.19/TREE01/ConfigFragment 3. Make the bare-metal kernel's build system aware of your .config updates: $ yes "" | make oldconfig 4. Build your bare-metal kernel. 5. Boot your bare-metal kernel with the following parameters: maxcpus=8 nr_cpus=43 rcutree.gp_preinit_delay=3 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcu_nocbs=0-1,3-7 6. Start the test with the following command: $ modprobe rcutorture nocbs_nthreads=8 nocbs_toggle=1000 fwd_progress=0 onoff_interval=1000 onoff_holdoff=30 n_barrier_cbs=4 stat_interval=15 shutdown_secs=120 test_no_idle_hz=1 verbose=1 7. After some time, end the test with the following command: $ rmmod rcutorture 8. Copy your bare-metal kernel's .config file, overwriting this file: /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2021.02.04-17.10.19/TREE01/.config 9. Copy the console output from just before the modprobe to just after the rmmod into this file: /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2021.02.04-17.10.19/TREE01/console.log 10. Check for runtime errors using the following command: $ tools/testing/selftests/rcutorture/bin/kvm-recheck.sh /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2021.02.04-17.10.19 Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/kvm-test-1-run.sh | 44 +++++++++++++++++++--- tools/testing/selftests/rcutorture/bin/kvm.sh | 4 ++ 2 files changed, 42 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 9d8a82cac808..03c041081754 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -7,15 +7,15 @@ # Execute this in the source tree. Do not run it as a background task # because qemu does not seem to like that much. # -# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args +# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args_in # # qemu-args defaults to "-enable-kvm -nographic", along with arguments # specifying the number of CPUs and other options # generated from the underlying CPU architecture. -# boot_args defaults to value returned by the per_version_boot_params +# boot_args_in defaults to value returned by the per_version_boot_params # shell function. # -# Anything you specify for either qemu-args or boot_args is appended to +# Anything you specify for either qemu-args or boot_args_in is appended to # the default values. The "-smp" value is deduced from the contents of # the config fragment. # @@ -134,7 +134,7 @@ do done seconds=$4 qemu_args=$5 -boot_args=$6 +boot_args_in=$6 if test -z "$TORTURE_BUILDONLY" then @@ -144,7 +144,7 @@ fi # Generate -smp qemu argument. qemu_args="-enable-kvm -nographic $qemu_args" cpu_count=`configNR_CPUS.sh $resdir/ConfigFragment` -cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"` +cpu_count=`configfrag_boot_cpus "$boot_args_in" "$config_template" "$cpu_count"` if test "$cpu_count" -gt "$TORTURE_ALLOTED_CPUS" then echo CPU count limited from $cpu_count to $TORTURE_ALLOTED_CPUS | tee -a $resdir/Warnings @@ -160,13 +160,45 @@ qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`" qemu_append="`identify_qemu_append "$QEMU"`" # Pull in Kconfig-fragment boot parameters -boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" +boot_args="`configfrag_boot_params "$boot_args_in" "$config_template"`" # Generate kernel-version-specific boot parameters boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`" if test -n "$TORTURE_BOOT_GDB_ARG" then boot_args="$boot_args $TORTURE_BOOT_GDB_ARG" fi + +# Give bare-metal advice +modprobe_args="`echo $boot_args | tr -s ' ' '\012' | grep "^$TORTURE_MOD\." | sed -e "s/$TORTURE_MOD\.//g"`" +kboot_args="`echo $boot_args | tr -s ' ' '\012' | grep -v "^$TORTURE_MOD\."`" +testid_txt="`dirname $resdir`/testid.txt" +touch $resdir/bare-metal +echo To run this scenario on bare metal: >> $resdir/bare-metal +echo >> $resdir/bare-metal +echo " 1." Set your bare-metal build tree to the state shown in this file: >> $resdir/bare-metal +echo " " $testid_txt >> $resdir/bare-metal +echo " 2." Update your bare-metal build tree"'"s .config based on this file: >> $resdir/bare-metal +echo " " $resdir/ConfigFragment >> $resdir/bare-metal +echo " 3." Make the bare-metal kernel"'"s build system aware of your .config updates: >> $resdir/bare-metal +echo " " $ 'yes "" | make oldconfig' >> $resdir/bare-metal +echo " 4." Build your bare-metal kernel. >> $resdir/bare-metal +echo " 5." Boot your bare-metal kernel with the following parameters: >> $resdir/bare-metal +echo " " $kboot_args >> $resdir/bare-metal +echo " 6." Start the test with the following command: >> $resdir/bare-metal +echo " " $ modprobe $TORTURE_MOD $modprobe_args >> $resdir/bare-metal +echo " 7." After some time, end the test with the following command: >> $resdir/bare-metal +echo " " $ rmmod $TORTURE_MOD >> $resdir/bare-metal +echo " 8." Copy your bare-metal kernel"'"s .config file, overwriting this file: >> $resdir/bare-metal +echo " " $resdir/.config >> $resdir/bare-metal +echo " 9." Copy the console output from just before the modprobe to just after >> $resdir/bare-metal +echo " " the rmmod into this file: >> $resdir/bare-metal +echo " " $resdir/console.log >> $resdir/bare-metal +echo "10." Check for runtime errors using the following command: >> $resdir/bare-metal +echo " " $ tools/testing/selftests/rcutorture/bin/kvm-recheck.sh `dirname $resdir` >> $resdir/bare-metal +echo >> $resdir/bare-metal +echo Some of the above steps may be skipped if you build your bare-metal >> $resdir/bare-metal +echo kernel here: `head -n 1 $testid_txt | sed -e 's/^Build directory: //'` >> $resdir/bare-metal + echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" $TORTURE_QEMU_GDB_ARG > $resdir/qemu-cmd echo "# TORTURE_SHUTDOWN_GRACE=$TORTURE_SHUTDOWN_GRACE" >> $resdir/qemu-cmd echo "# seconds=$seconds" >> $resdir/qemu-cmd diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 8d3c99b35e06..35a2132a8461 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -29,6 +29,7 @@ PATH=${KVM}/bin:$PATH; export PATH TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`" TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" +TORTURE_BUILDONLY= TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD TORTURE_KCONFIG_ARG="" TORTURE_KCONFIG_GDB_ARG="" @@ -40,6 +41,7 @@ TORTURE_KMAKE_ARG="" TORTURE_QEMU_MEM=512 TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu +TORTURE_MOD=rcutorture TORTURE_TRUST_MAKE="" resdir="" configs="" @@ -215,6 +217,7 @@ do --torture) checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuscale\|refscale\|scf\)$' '^--' TORTURE_SUITE=$2 + TORTURE_MOD="`echo $TORTURE_SUITE | sed -e 's/^\(lock\|rcu\|scf\)$/\1torture/'`" shift if test "$TORTURE_SUITE" = rcuscale || test "$TORTURE_SUITE" = refscale then @@ -381,6 +384,7 @@ TORTURE_QEMU_GDB_ARG="$TORTURE_QEMU_GDB_ARG"; export TORTURE_QEMU_GDB_ARG TORTURE_KCONFIG_KASAN_ARG="$TORTURE_KCONFIG_KASAN_ARG"; export TORTURE_KCONFIG_KASAN_ARG TORTURE_KCONFIG_KCSAN_ARG="$TORTURE_KCONFIG_KCSAN_ARG"; export TORTURE_KCONFIG_KCSAN_ARG TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG +TORTURE_MOD="$TORTURE_MOD"; export TORTURE_MOD TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC -- cgit v1.2.3 From f9d2f1e2c426ad6c4d7661cc7d90be4de2c4f7a4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Feb 2021 17:20:45 -0800 Subject: torture: Improve readability of the testid.txt file The testid.txt file was intended for occasional in extremis use, but now that the new "bare-metal" file references it, it might see more use. This commit therefore labels sections of output and adds spacing to make it easier to see what needs to be done to make a bare-metal build tree match an rcutorture build tree. Of course, you can avoid this whole issue by building your bare-metal kernel in the same directory in which you ran rcutorture, but that might not always be an option. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 35a2132a8461..1de198d6f999 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -404,11 +404,16 @@ echo $scriptname $args touch $resdir/$ds/log echo $scriptname $args >> $resdir/$ds/log echo ${TORTURE_SUITE} > $resdir/$ds/TORTURE_SUITE -pwd > $resdir/$ds/testid.txt +echo Build directory: `pwd` > $resdir/$ds/testid.txt if test -d .git then + echo Current commit: `git rev-parse HEAD` >> $resdir/$ds/testid.txt + echo >> $resdir/$ds/testid.txt + echo ' ---' Output of "'"git status"'": >> $resdir/$ds/testid.txt git status >> $resdir/$ds/testid.txt - git rev-parse HEAD >> $resdir/$ds/testid.txt + echo >> $resdir/$ds/testid.txt + echo >> $resdir/$ds/testid.txt + echo ' ---' Output of "'"git diff HEAD"'": >> $resdir/$ds/testid.txt git diff HEAD >> $resdir/$ds/testid.txt fi ___EOF___ -- cgit v1.2.3 From 0e7457b550233314394574c6bdc890de9131daf5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2021 10:15:02 -0800 Subject: rcuscale: Disable verbose torture-test output Given large numbers of threads, the quantity of torture-test output is sufficient to sometimes result in RCU CPU stall warnings. The probability of these stall warnings was greatly reduced by batching the output, but the warnings were not eliminated. However, the actual test only depends on console output that is printed even when rcuscale.verbose=0. This commit therefore causes this test to run with rcuscale.verbose=0. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh index 0333e9b18522..ffbe15109f0d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh @@ -12,5 +12,5 @@ # Adds per-version torture-module parameters to kernels supporting them. per_version_boot_params () { echo $1 rcuscale.shutdown=1 \ - rcuscale.verbose=1 + rcuscale.verbose=0 } -- cgit v1.2.3 From aebf8c7bf6d508dfb4255db8f7355ca819d9e6c9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2021 10:17:26 -0800 Subject: refscale: Disable verbose torture-test output Given large numbers of threads, the quantity of torture-test output is sufficient to sometimes result in RCU CPU stall warnings. The probability of these stall warnings was greatly reduced by batching the output, but the warnings were not eliminated. However, the actual test only depends on console output that is printed even when refscale.verbose=0. This commit therefore causes this test to run with refscale.verbose=0. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh index 321e82641287..f81fa2c541a6 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh @@ -12,5 +12,5 @@ # Adds per-version torture-module parameters to kernels supporting them. per_version_boot_params () { echo $1 refscale.shutdown=1 \ - refscale.verbose=1 + refscale.verbose=0 } -- cgit v1.2.3 From 3c43ce53fdb39921f4ee71c65dc100296e15640f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Feb 2021 15:15:13 -0800 Subject: torture: Move build/run synchronization files into scenario directories Currently the bN.ready and bN.wait files are placed in the rcutorture directory, which really is not at all a good place for run-specific files. This commit therefore renames these files to build.ready and build.wait and then moves them into the scenario directories within the "res" directory, for example, into tools/testing/selftests/rcutorture/res/2021.02.10-15.08.23/TINY01. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/kvm-test-1-run.sh | 25 +++++++++++----------- tools/testing/selftests/rcutorture/bin/kvm.sh | 10 ++++----- 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 03c041081754..91578d3af219 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -7,7 +7,7 @@ # Execute this in the source tree. Do not run it as a background task # because qemu does not seem to like that much. # -# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args_in +# Usage: kvm-test-1-run.sh config resdir seconds qemu-args boot_args_in # # qemu-args defaults to "-enable-kvm -nographic", along with arguments # specifying the number of CPUs and other options @@ -35,8 +35,7 @@ mkdir $T config_template=${1} config_dir=`echo $config_template | sed -e 's,/[^/]*$,,'` title=`echo $config_template | sed -e 's/^.*\///'` -builddir=${2} -resdir=${3} +resdir=${2} if test -z "$resdir" -o ! -d "$resdir" -o ! -w "$resdir" then echo "kvm-test-1-run.sh :$resdir: Not a writable directory, cannot store results into it" @@ -89,9 +88,9 @@ then ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh ln -s $base_resdir/.config $resdir # for kvm-recheck.sh echo Initial build failed, not running KVM, see $resdir. - if test -f $builddir.wait + if test -f $resdir/build.wait then - mv $builddir.wait $builddir.ready + mv $resdir/build.wait $resdir/build.ready fi exit 1 elif kvm-build.sh $T/KcList $resdir @@ -118,23 +117,23 @@ else # Build failed. cp .config $resdir || : echo Build failed, not running KVM, see $resdir. - if test -f $builddir.wait + if test -f $resdir/build.wait then - mv $builddir.wait $builddir.ready + mv $resdir/build.wait $resdir/build.ready fi exit 1 fi -if test -f $builddir.wait +if test -f $resdir/build.wait then - mv $builddir.wait $builddir.ready + mv $resdir/build.wait $resdir/build.ready fi -while test -f $builddir.ready +while test -f $resdir/build.ready do sleep 1 done -seconds=$4 -qemu_args=$5 -boot_args_in=$6 +seconds=$3 +qemu_args=$4 +boot_args_in=$5 if test -z "$TORTURE_BUILDONLY" then diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 1de198d6f999..7944510f8c24 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -444,7 +444,6 @@ function dump(first, pastlast, batchnum) print "needqemurun=" jn=1 for (j = first; j < pastlast; j++) { - builddir=KVM "/b" j - first + 1 cpusr[jn] = cpus[j]; if (cfrep[cf[j]] == "") { cfr[jn] = cf[j]; @@ -453,15 +452,15 @@ function dump(first, pastlast, batchnum) cfrep[cf[j]]++; cfr[jn] = cf[j] "." cfrep[cf[j]]; } + builddir=rd cfr[jn] "/build"; if (cpusr[jn] > ncpus && ncpus != 0) ovf = "-ovf"; else ovf = ""; print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date` | tee -a " rd "log"; - print "rm -f " builddir ".*"; - print "touch " builddir ".wait"; print "mkdir " rd cfr[jn] " || :"; - print "kvm-test-1-run.sh " CONFIGDIR cf[j], builddir, rd cfr[jn], dur " \"" TORTURE_QEMU_ARG "\" \"" TORTURE_BOOTARGS "\" > " rd cfr[jn] "/kvm-test-1-run.sh.out 2>&1 &" + print "touch " builddir ".wait"; + print "kvm-test-1-run.sh " CONFIGDIR cf[j], rd cfr[jn], dur " \"" TORTURE_QEMU_ARG "\" \"" TORTURE_BOOTARGS "\" > " rd cfr[jn] "/kvm-test-1-run.sh.out 2>&1 &" print "echo ", cfr[jn], cpusr[jn] ovf ": Waiting for build to complete. `date` | tee -a " rd "log"; print "while test -f " builddir ".wait" print "do" @@ -471,7 +470,7 @@ function dump(first, pastlast, batchnum) jn++; } for (j = 1; j < jn; j++) { - builddir=KVM "/b" j + builddir=rd cfr[j] "/build"; print "rm -f " builddir ".ready" print "if test -f \"" rd cfr[j] "/builtkernel\"" print "then" @@ -509,7 +508,6 @@ function dump(first, pastlast, batchnum) print "\techo ---- No kernel runs. `date` | tee -a " rd "log"; print "fi" for (j = 1; j < jn; j++) { - builddir=KVM "/b" j print "echo ----", cfr[j], cpusr[j] ovf ": Build/run results: | tee -a " rd "log"; print "cat " rd cfr[j] "/kvm-test-1-run.sh.out | tee -a " rd "log"; } -- cgit v1.2.3 From b674100e630bf9211d7edce06b5d734b125a74ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Feb 2021 16:28:44 -0800 Subject: torture: Use file-based protocol to mark batch's runs complete Currently, the script generated by kvm.sh does a "wait" to wait on both the current batch's guest OSes and any jitter.sh scripts. This works, but makes it hard to abstract the jittering so that common code can be used for both local and distributed runs. This commit therefore uses "build.run" files in scenario directories, and these files are removed after the corresponding scenario's guest OS has completed. Note that --build-only runs do not create build.run files because they also do not create guest OSes and do not run any jitter.sh scripts. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 3 +++ tools/testing/selftests/rcutorture/bin/kvm.sh | 13 +++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 91578d3af219..fed6f10a7b60 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -345,4 +345,7 @@ then echo Unknown PID, cannot kill qemu command fi +# Tell the script that this run is done. +rm -f $resdir/build.run + parse-console.sh $resdir/console.log $title diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 7944510f8c24..1f5f8720cacc 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -469,9 +469,15 @@ function dump(first, pastlast, batchnum) print "echo ", cfr[jn], cpusr[jn] ovf ": Build complete. `date` | tee -a " rd "log"; jn++; } + print "runfiles=" for (j = 1; j < jn; j++) { builddir=rd cfr[j] "/build"; - print "rm -f " builddir ".ready" + if (TORTURE_BUILDONLY) + print "rm -f " builddir ".ready" + else + print "mv " builddir ".ready " builddir ".run" + print "runfiles=\"$runfiles " builddir ".run\"" + fi print "if test -f \"" rd cfr[j] "/builtkernel\"" print "then" print "\techo ----", cfr[j], cpusr[j] ovf ": Kernel present. `date` | tee -a " rd "log"; @@ -501,7 +507,10 @@ function dump(first, pastlast, batchnum) print "\tjitter.sh " j " " dur " " ja[2] " " ja[3] "&" print "\techo $! >> " rd "jitter_pids" } - print "\twait" + print "\twhile ls $runfiles > /dev/null 2>&1" + print "\tdo" + print "\t\t:" + print "\tdone" print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" print "\twait" -- cgit v1.2.3 From 37812c9429722824859788cf754dd3e33f546908 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Feb 2021 10:39:28 -0800 Subject: torture: Use "jittering" file to control jitter.sh execution Currently, jitter.sh execution is controlled by a time limit and by the "kill" command. The former allowed jitter.sh to run uselessly past the end of a set of runs that panicked during boot, and the latter is vulnerable to PID reuse. This commit therefore introduces a "jittering" file in the date-stamp directory within "res" that must be present for the jitter.sh scripts to continue executing. The time limit is still in place in order to avoid disturbing runs featuring large trace dumps, but the removal of the "jittering" file handles the panic-during-boot scenario without relying on PIDs. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/jitter.sh | 10 ++++++---- tools/testing/selftests/rcutorture/bin/kvm.sh | 5 ++++- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index 188b864bc4bf..ed0ea86ddf5d 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -5,10 +5,11 @@ # of this script is to inflict random OS jitter on a concurrently running # test. # -# Usage: jitter.sh me duration [ sleepmax [ spinmax ] ] +# Usage: jitter.sh me duration jittering-path [ sleepmax [ spinmax ] ] # # me: Random-number-generator seed salt. # duration: Time to run in seconds. +# jittering-path: Path to file whose removal will stop this script. # sleepmax: Maximum microseconds to sleep, defaults to one second. # spinmax: Maximum microseconds to spin, defaults to one millisecond. # @@ -18,8 +19,9 @@ me=$(($1 * 1000)) duration=$2 -sleepmax=${3-1000000} -spinmax=${4-1000} +jittering=$3 +sleepmax=${4-1000000} +spinmax=${5-1000} n=1 @@ -47,7 +49,7 @@ do fi # Check for stop request. - if test -f "$TORTURE_STOPFILE" + if ! test -f "$jittering" then exit 1; fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 1f5f8720cacc..48da4cdb29d8 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -503,14 +503,17 @@ function dump(first, pastlast, batchnum) print "then" print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; print "\techo > " rd "jitter_pids" + print "\ttouch " rd "jittering" for (j = 0; j < njitter; j++) { - print "\tjitter.sh " j " " dur " " ja[2] " " ja[3] "&" + print "\tjitter.sh " j " " dur " " rd "jittering " ja[2] " " ja[3] "&" print "\techo $! >> " rd "jitter_pids" } print "\twhile ls $runfiles > /dev/null 2>&1" print "\tdo" print "\t\t:" print "\tdone" + print "\trm -f " rd "jittering" + print "\twait" print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" print "\twait" -- cgit v1.2.3 From 1f922db8eef015f261480347aaf79fa9a25728f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Feb 2021 10:56:42 -0800 Subject: torture: Eliminate jitter_pids file Now that there is a reliable way to convince the jitter.sh scripts to stop, the jitter_pids file is not needed, nor is the code that kills all the PIDs contained in this file. This commit therefore eliminates this file and the code using it. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 14 -------------- tools/testing/selftests/rcutorture/bin/kvm.sh | 5 +---- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index fed6f10a7b60..eb5346b457d4 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -270,20 +270,6 @@ do echo "ps -fp $killpid" >> $resdir/Warnings 2>&1 ps -fp $killpid >> $resdir/Warnings 2>&1 fi - # Reduce probability of PID reuse by allowing a one-minute buffer - if test $((kruntime + 60)) -lt $seconds && test -s "$resdir/../jitter_pids" - then - awk < "$resdir/../jitter_pids" ' - NF > 0 { - pidlist = pidlist " " $1; - n++; - } - END { - if (n > 0) { - print "kill " pidlist; - } - }' | sh - fi else echo ' ---' `date`: "Kernel done" fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 48da4cdb29d8..de93802c3d00 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -502,12 +502,9 @@ function dump(first, pastlast, batchnum) print "if test -n \"$needqemurun\"" print "then" print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; - print "\techo > " rd "jitter_pids" print "\ttouch " rd "jittering" - for (j = 0; j < njitter; j++) { + for (j = 0; j < njitter; j++) print "\tjitter.sh " j " " dur " " rd "jittering " ja[2] " " ja[3] "&" - print "\techo $! >> " rd "jitter_pids" - } print "\twhile ls $runfiles > /dev/null 2>&1" print "\tdo" print "\t\t:" -- cgit v1.2.3 From 4cd54518c3d8afadd11ebd6ad4f03b00859f5e85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Feb 2021 11:54:43 -0800 Subject: torture: Reverse jittering and duration parameters for jitter.sh Remote rcutorture testing requires that jitter.sh continue to be invoked from the generated script for local runs, but that it instead be invoked on the remote system for distributed runs. This argues for common jitterstart and jitterstop scripts. But it would be good for jitterstart and jitterstop to control the name and location of the "jittering" file, while continuing to have the duration controlled by the caller of these new scripts. This commit therefore reverses the order of the jittering and duration parameters for jitter.sh, so that the jittering parameter precedes the duration parameter. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/jitter.sh | 6 +++--- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index ed0ea86ddf5d..ff1d3e468e53 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -5,7 +5,7 @@ # of this script is to inflict random OS jitter on a concurrently running # test. # -# Usage: jitter.sh me duration jittering-path [ sleepmax [ spinmax ] ] +# Usage: jitter.sh me jittering-path duration [ sleepmax [ spinmax ] ] # # me: Random-number-generator seed salt. # duration: Time to run in seconds. @@ -18,8 +18,8 @@ # Authors: Paul E. McKenney me=$(($1 * 1000)) -duration=$2 -jittering=$3 +jittering=$2 +duration=$3 sleepmax=${4-1000000} spinmax=${5-1000} diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index de93802c3d00..a2ee3f2fff3c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -504,7 +504,7 @@ function dump(first, pastlast, batchnum) print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; print "\ttouch " rd "jittering" for (j = 0; j < njitter; j++) - print "\tjitter.sh " j " " dur " " rd "jittering " ja[2] " " ja[3] "&" + print "\tjitter.sh " j " " rd "jittering " dur " " ja[2] " " ja[3] "&" print "\twhile ls $runfiles > /dev/null 2>&1" print "\tdo" print "\t\t:" -- cgit v1.2.3 From ba46b21bbdf8c1e8f054f44e7db7d6684720ef78 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 13 Jan 2021 11:59:20 +0100 Subject: doc: Update rcu_dereference.rst reference Changeset b00aedf978aa ("doc: Convert to rcu_dereference.txt to rcu_dereference.rst") renamed: Documentation/RCU/rcu_dereference.txt to: Documentation/RCU/rcu_dereference.rst. Update its cross-reference accordingly. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- tools/memory-model/Documentation/glossary.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/memory-model/Documentation/glossary.txt b/tools/memory-model/Documentation/glossary.txt index b2da6365be63..6f3d16dbf467 100644 --- a/tools/memory-model/Documentation/glossary.txt +++ b/tools/memory-model/Documentation/glossary.txt @@ -19,7 +19,7 @@ Address Dependency: When the address of a later memory access is computed from the value returned by the rcu_dereference() on line 2, the address dependency extends from that rcu_dereference() to that "p->a". In rare cases, optimizing compilers can destroy address - dependencies. Please see Documentation/RCU/rcu_dereference.txt + dependencies. Please see Documentation/RCU/rcu_dereference.rst for more information. See also "Control Dependency" and "Data Dependency". -- cgit v1.2.3 From 9146658cc49a1dbed5ece140f658be884e189ade Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Thu, 14 Jan 2021 23:09:07 +0900 Subject: tools/memory-model: Remove reference to atomic_ops.rst atomic_ops.rst was removed by commit f0400a77ebdc ("atomic: Delete obsolete documentation"). Remove the broken link in tools/memory-model/Documentation/simple.txt. Cc: Peter Zijlstra Signed-off-by: Akira Yokosawa Signed-off-by: Paul E. McKenney --- tools/memory-model/Documentation/simple.txt | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/memory-model/Documentation/simple.txt b/tools/memory-model/Documentation/simple.txt index 81e1a0ec5342..4c789ec8334f 100644 --- a/tools/memory-model/Documentation/simple.txt +++ b/tools/memory-model/Documentation/simple.txt @@ -189,7 +189,6 @@ Additional information may be found in these files: Documentation/atomic_t.txt Documentation/atomic_bitops.txt -Documentation/core-api/atomic_ops.rst Documentation/core-api/refcount-vs-atomic.rst Reading code using these primitives is often also quite helpful. -- cgit v1.2.3 From 297e69bfa4c7aa27259dd456af1377e868337043 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Mar 2021 08:59:21 -0300 Subject: perf script: Fixup 'struct evsel_script' method prefix They all operate on 'struct evsel_script' instances, so should be prefixed with evsel_script__, not with perf_evsel_script__. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 5915f19cee55..119a5f7e2772 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -314,8 +314,7 @@ static inline struct evsel_script *evsel_script(struct evsel *evsel) return (struct evsel_script *)evsel->priv; } -static struct evsel_script *perf_evsel_script__new(struct evsel *evsel, - struct perf_data *data) +static struct evsel_script *evsel_script__new(struct evsel *evsel, struct perf_data *data) { struct evsel_script *es = zalloc(sizeof(*es)); @@ -335,7 +334,7 @@ out_free: return NULL; } -static void perf_evsel_script__delete(struct evsel_script *es) +static void evsel_script__delete(struct evsel_script *es) { zfree(&es->filename); fclose(es->fp); @@ -343,7 +342,7 @@ static void perf_evsel_script__delete(struct evsel_script *es) free(es); } -static int perf_evsel_script__fprintf(struct evsel_script *es, FILE *fp) +static int evsel_script__fprintf(struct evsel_script *es, FILE *fp) { struct stat st; @@ -2219,8 +2218,7 @@ static int process_attr(struct perf_tool *tool, union perf_event *event, if (!evsel->priv) { if (scr->per_event_dump) { - evsel->priv = perf_evsel_script__new(evsel, - scr->session->data); + evsel->priv = evsel_script__new(evsel, scr->session->data); } else { es = zalloc(sizeof(*es)); if (!es) @@ -2475,7 +2473,7 @@ static void perf_script__fclose_per_event_dump(struct perf_script *script) evlist__for_each_entry(evlist, evsel) { if (!evsel->priv) break; - perf_evsel_script__delete(evsel->priv); + evsel_script__delete(evsel->priv); evsel->priv = NULL; } } @@ -2495,7 +2493,7 @@ static int perf_script__fopen_per_event_dump(struct perf_script *script) if (evsel->priv != NULL) continue; - evsel->priv = perf_evsel_script__new(evsel, script->session->data); + evsel->priv = evsel_script__new(evsel, script->session->data); if (evsel->priv == NULL) goto out_err_fclose; } @@ -2530,8 +2528,8 @@ static void perf_script__exit_per_event_dump_stats(struct perf_script *script) evlist__for_each_entry(script->session->evlist, evsel) { struct evsel_script *es = evsel->priv; - perf_evsel_script__fprintf(es, stdout); - perf_evsel_script__delete(es); + evsel_script__fprintf(es, stdout); + evsel_script__delete(es); evsel->priv = NULL; } } -- cgit v1.2.3 From 905203411d8b96cf931dacc359982108a5893c9b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Mar 2021 09:03:40 -0300 Subject: perf stat: Fixup __perf_stat_evsel__is() prefix This is a perf_stat_evsel method, so should have that as its prefix, previously it was swapped as __perf_evsel_stat__is(). Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat.c | 3 +-- tools/perf/util/stat.h | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index c400f8dde017..2db46b9bebd0 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -76,8 +76,7 @@ double rel_stddev_stats(double stddev, double avg) return pct; } -bool __perf_evsel_stat__is(struct evsel *evsel, - enum perf_stat_evsel_id id) +bool __perf_stat_evsel__is(struct evsel *evsel, enum perf_stat_evsel_id id) { struct perf_stat_evsel *ps = evsel->stats; diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index d85c292148bb..41107b8deac5 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -187,11 +187,10 @@ struct perf_aggr_thread_value { u64 ena; }; -bool __perf_evsel_stat__is(struct evsel *evsel, - enum perf_stat_evsel_id id); +bool __perf_stat_evsel__is(struct evsel *evsel, enum perf_stat_evsel_id id); #define perf_stat_evsel__is(evsel, id) \ - __perf_evsel_stat__is(evsel, PERF_STAT_EVSEL_ID__ ## id) + __perf_stat_evsel__is(evsel, PERF_STAT_EVSEL_ID__ ## id) extern struct runtime_stat rt_stat; extern struct stats walltime_nsecs_stats; -- cgit v1.2.3 From 1f042de2d5c7a69d5ac403a022185164bbe0f51c Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 9 Mar 2021 17:12:25 +0800 Subject: perf tools: use ARRAY_SIZE Fix the following cppcheck warnings: ./tools/perf/tests/demangle-ocaml-test.c:29:34-35: WARNING: Use ARRAY_SIZE. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/1615281145-2122-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/demangle-ocaml-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/demangle-ocaml-test.c b/tools/perf/tests/demangle-ocaml-test.c index a273ed5163d7..2fac7d762c0d 100644 --- a/tools/perf/tests/demangle-ocaml-test.c +++ b/tools/perf/tests/demangle-ocaml-test.c @@ -26,7 +26,7 @@ int test__demangle_ocaml(struct test *test __maybe_unused, int subtest __maybe_u "Stdlib.bytes.++" }, }; - for (i = 0; i < sizeof(test_cases) / sizeof(test_cases[0]); i++) { + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { buf = ocaml_demangle_sym(test_cases[i].mangled); if ((buf == NULL && test_cases[i].demangled != NULL) || (buf != NULL && test_cases[i].demangled == NULL) -- cgit v1.2.3 From 83ff0f93b0803700b018f6cce06f50439dc1348c Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 9 Mar 2021 18:11:09 +0800 Subject: perf machine: Assign boolean values to a bool variable Fix the following coccicheck warnings: ./tools/perf/util/machine.c:2041:9-10: WARNING: return of 0/1 in function 'symbol__match_regex' with return type bool. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Yonghong Song Cc: bpf@vger.kernel.org Cc: netdev@vger.kernel.org Link: http://lore.kernel.org/lkml/1615284669-82139-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index b5c2d8be4144..435771eed62b 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2038,8 +2038,8 @@ int machine__process_event(struct machine *machine, union perf_event *event, static bool symbol__match_regex(struct symbol *sym, regex_t *regex) { if (!regexec(regex, sym->name, 0, NULL, 0)) - return 1; - return 0; + return true; + return false; } static void ip__resolve_ams(struct thread *thread, -- cgit v1.2.3 From 3fcd50d6f9a963a21e142d71be135eff6a374d2d Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 9 Mar 2021 01:56:48 +0100 Subject: selftests/bpf: Add BTF_KIND_FLOAT to test_core_reloc_size Verify that bpf_core_field_size() is working correctly with floats. Also document the required clang version. Suggested-by: John Fastabend Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210309005649.162480-2-iii@linux.ibm.com --- tools/testing/selftests/bpf/README.rst | 9 +++++++++ tools/testing/selftests/bpf/prog_tests/core_reloc.c | 1 + tools/testing/selftests/bpf/progs/core_reloc_types.h | 5 +++++ tools/testing/selftests/bpf/progs/test_core_reloc_size.c | 3 +++ 4 files changed, 18 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index dbc8f6cc5c67..3464161c8eea 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -170,3 +170,12 @@ failures: .. _2: https://reviews.llvm.org/D85174 .. _3: https://reviews.llvm.org/D83878 .. _4: https://reviews.llvm.org/D83242 + +Floating-point tests and Clang version +====================================== + +Certain selftests, e.g. core_reloc, require support for the floating-point +types, which was introduced in `Clang 13`__. The older Clang versions will +either crash when compiling these tests, or generate an incorrect BTF. + +__ https://reviews.llvm.org/D83289 diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 06eb956ff7bb..d94dcead72e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -266,6 +266,7 @@ static int duration = 0; .arr_elem_sz = sizeof(((type *)0)->arr_field[0]), \ .ptr_sz = 8, /* always 8-byte pointer for BPF */ \ .enum_sz = sizeof(((type *)0)->enum_field), \ + .float_sz = sizeof(((type *)0)->float_field), \ } #define SIZE_CASE(name) { \ diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 9a2850850121..9982eb969048 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -807,6 +807,7 @@ struct core_reloc_size_output { int arr_elem_sz; int ptr_sz; int enum_sz; + int float_sz; }; struct core_reloc_size { @@ -816,6 +817,7 @@ struct core_reloc_size { int arr_field[4]; void *ptr_field; enum { VALUE = 123 } enum_field; + float float_field; }; struct core_reloc_size___diff_sz { @@ -825,6 +827,7 @@ struct core_reloc_size___diff_sz { char arr_field[10]; void *ptr_field; enum { OTHER_VALUE = 0xFFFFFFFFFFFFFFFF } enum_field; + double float_field; }; /* Error case of two candidates with the fields (int_field) at the same @@ -839,6 +842,7 @@ struct core_reloc_size___err_ambiguous1 { int arr_field[4]; void *ptr_field; enum { VALUE___1 = 123 } enum_field; + float float_field; }; struct core_reloc_size___err_ambiguous2 { @@ -850,6 +854,7 @@ struct core_reloc_size___err_ambiguous2 { int arr_field[4]; void *ptr_field; enum { VALUE___2 = 123 } enum_field; + float float_field; }; /* diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c index d7fb6cfc7891..7b2d576aeea1 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c @@ -21,6 +21,7 @@ struct core_reloc_size_output { int arr_elem_sz; int ptr_sz; int enum_sz; + int float_sz; }; struct core_reloc_size { @@ -30,6 +31,7 @@ struct core_reloc_size { int arr_field[4]; void *ptr_field; enum { VALUE = 123 } enum_field; + float float_field; }; SEC("raw_tracepoint/sys_enter") @@ -45,6 +47,7 @@ int test_core_size(void *ctx) out->arr_elem_sz = bpf_core_field_size(in->arr_field[0]); out->ptr_sz = bpf_core_field_size(in->ptr_field); out->enum_sz = bpf_core_field_size(in->enum_field); + out->float_sz = bpf_core_field_size(in->float_field); return 0; } -- cgit v1.2.3 From ccb0e23ca27445e2408f52944660f9e5e5d1c4b1 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 9 Mar 2021 01:56:49 +0100 Subject: selftests/bpf: Add BTF_KIND_FLOAT to btf_dump_test_case_syntax Check that dumping various floating-point types produces a valid C code. Suggested-by: Andrii Nakryiko Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210309005649.162480-3-iii@linux.ibm.com --- tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c index 31975c96e2c9..12b40dc81e14 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c @@ -205,6 +205,12 @@ struct struct_with_embedded_stuff { int t[11]; }; +struct float_struct { + float f; + const double *d; + volatile long double *ld; +}; + struct root_struct { enum e1 _1; enum e2 _2; @@ -219,6 +225,7 @@ struct root_struct { union_fwd_t *_12; union_fwd_ptr_t _13; struct struct_with_embedded_stuff _14; + struct float_struct _15; }; /* ------ END-EXPECTED-OUTPUT ------ */ -- cgit v1.2.3 From 11d39cfeecfc9d92a5faa2a55c228e796478e0cb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 8 Mar 2021 20:43:22 -0800 Subject: selftests/bpf: Fix compiler warning in BPF_KPROBE definition in loop6.c Add missing return type to BPF_KPROBE definition. Without it, compiler generates the following warning: progs/loop6.c:68:12: warning: type specifier missing, defaults to 'int' [-Wimplicit-int] BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, ^ 1 warning generated. Fixes: 86a35af628e5 ("selftests/bpf: Add a verifier scale test with unknown bounded loop") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210309044322.3487636-1-andrii@kernel.org --- tools/testing/selftests/bpf/progs/loop6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/loop6.c b/tools/testing/selftests/bpf/progs/loop6.c index 2a7141ac1656..38de0331e6b4 100644 --- a/tools/testing/selftests/bpf/progs/loop6.c +++ b/tools/testing/selftests/bpf/progs/loop6.c @@ -65,8 +65,8 @@ int config = 0; int result = 0; SEC("kprobe/virtqueue_add_sgs") -BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, - unsigned int out_sgs, unsigned int in_sgs) +int BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, + unsigned int out_sgs, unsigned int in_sgs) { struct scatterlist *sgp = NULL; __u64 length1 = 0, length2 = 0; -- cgit v1.2.3 From 04ea63e34a2ee85cfd38578b3fc97b2d4c9dd573 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 10 Mar 2021 14:22:46 +0800 Subject: selftests/bpf: Fix warning comparing pointer to 0 Fix the following coccicheck warning: ./tools/testing/selftests/bpf/progs/test_global_func10.c:17:12-13: WARNING comparing pointer to 0. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1615357366-97612-1-git-send-email-jiapeng.chong@linux.alibaba.com --- tools/testing/selftests/bpf/progs/test_global_func10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_global_func10.c b/tools/testing/selftests/bpf/progs/test_global_func10.c index 61c2ae92ce41..97b7031d0e22 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func10.c +++ b/tools/testing/selftests/bpf/progs/test_global_func10.c @@ -14,7 +14,7 @@ struct Big { __noinline int foo(const struct Big *big) { - if (big == 0) + if (!big) return 0; return bpf_get_prandom_u32() < big->y; -- cgit v1.2.3 From a9c80b03e586fd3819089fbd33c38fb65ad5e00c Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 10 Mar 2021 15:18:34 +0800 Subject: bpf: Fix warning comparing pointer to 0 Fix the following coccicheck warning: ./tools/testing/selftests/bpf/progs/fentry_test.c:67:12-13: WARNING comparing pointer to 0. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1615360714-30381-1-git-send-email-jiapeng.chong@linux.alibaba.com --- tools/testing/selftests/bpf/progs/fentry_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/fentry_test.c b/tools/testing/selftests/bpf/progs/fentry_test.c index 5f645fdaba6f..52a550d281d9 100644 --- a/tools/testing/selftests/bpf/progs/fentry_test.c +++ b/tools/testing/selftests/bpf/progs/fentry_test.c @@ -64,7 +64,7 @@ __u64 test7_result = 0; SEC("fentry/bpf_fentry_test7") int BPF_PROG(test7, struct bpf_fentry_test_t *arg) { - if (arg == 0) + if (!arg) test7_result = 1; return 0; } -- cgit v1.2.3 From 2882c48bf8f2fb9ec31cbb68be6b979da48f880d Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 10 Mar 2021 09:09:28 +0100 Subject: libbpf: xsk: Remove linux/compiler.h header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 291471dd1559 ("libbpf, xsk: Add libbpf_smp_store_release libbpf_smp_load_acquire") linux/compiler.h was added as a dependency to xsk.h, which is the user-facing API. This makes it harder for userspace application to consume the library. Here the header inclusion is removed, and instead {READ,WRITE}_ONCE() is added explicitly. Signed-off-by: Björn Töpel Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210310080929.641212-2-bjorn.topel@gmail.com --- tools/lib/bpf/libbpf_util.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf_util.h b/tools/lib/bpf/libbpf_util.h index cfbcfc063c81..954da9b34a34 100644 --- a/tools/lib/bpf/libbpf_util.h +++ b/tools/lib/bpf/libbpf_util.h @@ -5,25 +5,30 @@ #define __LIBBPF_LIBBPF_UTIL_H #include -#include #ifdef __cplusplus extern "C" { #endif -/* Use these barrier functions instead of smp_[rw]mb() when they are - * used in a libbpf header file. That way they can be built into the - * application that uses libbpf. +/* Load-Acquire Store-Release barriers used by the XDP socket + * library. The following macros should *NOT* be considered part of + * the xsk.h API, and is subject to change anytime. + * + * LIBRARY INTERNAL */ + +#define __XSK_READ_ONCE(x) (*(volatile typeof(x) *)&x) +#define __XSK_WRITE_ONCE(x, v) (*(volatile typeof(x) *)&x) = (v) + #if defined(__i386__) || defined(__x86_64__) # define libbpf_smp_store_release(p, v) \ do { \ asm volatile("" : : : "memory"); \ - WRITE_ONCE(*p, v); \ + __XSK_WRITE_ONCE(*p, v); \ } while (0) # define libbpf_smp_load_acquire(p) \ ({ \ - typeof(*p) ___p1 = READ_ONCE(*p); \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ asm volatile("" : : : "memory"); \ ___p1; \ }) @@ -41,11 +46,11 @@ extern "C" { # define libbpf_smp_store_release(p, v) \ do { \ asm volatile ("fence rw,w" : : : "memory"); \ - WRITE_ONCE(*p, v); \ + __XSK_WRITE_ONCE(*p, v); \ } while (0) # define libbpf_smp_load_acquire(p) \ ({ \ - typeof(*p) ___p1 = READ_ONCE(*p); \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ asm volatile ("fence r,rw" : : : "memory"); \ ___p1; \ }) @@ -55,19 +60,21 @@ extern "C" { #define libbpf_smp_store_release(p, v) \ do { \ __sync_synchronize(); \ - WRITE_ONCE(*p, v); \ + __XSK_WRITE_ONCE(*p, v); \ } while (0) #endif #ifndef libbpf_smp_load_acquire #define libbpf_smp_load_acquire(p) \ ({ \ - typeof(*p) ___p1 = READ_ONCE(*p); \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ __sync_synchronize(); \ ___p1; \ }) #endif +/* LIBRARY INTERNAL -- END */ + #ifdef __cplusplus } /* extern "C" */ #endif -- cgit v1.2.3 From 7e8bbe24cb8b20e2719ec58bf6937ab2e484add2 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 10 Mar 2021 09:09:29 +0100 Subject: libbpf: xsk: Move barriers from libbpf_util.h to xsk.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only user of libbpf_util.h is xsk.h. Move the barriers to xsk.h, and remove libbpf_util.h. The barriers are used as an implementation detail, and should not be considered part of the stable API. Signed-off-by: Björn Töpel Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210310080929.641212-3-bjorn.topel@gmail.com --- tools/lib/bpf/Makefile | 1 - tools/lib/bpf/libbpf_util.h | 82 --------------------------------------------- tools/lib/bpf/xsk.h | 70 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 85 deletions(-) delete mode 100644 tools/lib/bpf/libbpf_util.h (limited to 'tools') diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 8170f88e8ea6..f45bacbaa3d5 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -228,7 +228,6 @@ install_headers: $(BPF_HELPER_DEFS) $(call do_install,bpf.h,$(prefix)/include/bpf,644); \ $(call do_install,libbpf.h,$(prefix)/include/bpf,644); \ $(call do_install,btf.h,$(prefix)/include/bpf,644); \ - $(call do_install,libbpf_util.h,$(prefix)/include/bpf,644); \ $(call do_install,libbpf_common.h,$(prefix)/include/bpf,644); \ $(call do_install,xsk.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_helpers.h,$(prefix)/include/bpf,644); \ diff --git a/tools/lib/bpf/libbpf_util.h b/tools/lib/bpf/libbpf_util.h deleted file mode 100644 index 954da9b34a34..000000000000 --- a/tools/lib/bpf/libbpf_util.h +++ /dev/null @@ -1,82 +0,0 @@ -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -/* Copyright (c) 2019 Facebook */ - -#ifndef __LIBBPF_LIBBPF_UTIL_H -#define __LIBBPF_LIBBPF_UTIL_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* Load-Acquire Store-Release barriers used by the XDP socket - * library. The following macros should *NOT* be considered part of - * the xsk.h API, and is subject to change anytime. - * - * LIBRARY INTERNAL - */ - -#define __XSK_READ_ONCE(x) (*(volatile typeof(x) *)&x) -#define __XSK_WRITE_ONCE(x, v) (*(volatile typeof(x) *)&x) = (v) - -#if defined(__i386__) || defined(__x86_64__) -# define libbpf_smp_store_release(p, v) \ - do { \ - asm volatile("" : : : "memory"); \ - __XSK_WRITE_ONCE(*p, v); \ - } while (0) -# define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ - asm volatile("" : : : "memory"); \ - ___p1; \ - }) -#elif defined(__aarch64__) -# define libbpf_smp_store_release(p, v) \ - asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory") -# define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1; \ - asm volatile ("ldar %w0, %1" \ - : "=r" (___p1) : "Q" (*p) : "memory"); \ - ___p1; \ - }) -#elif defined(__riscv) -# define libbpf_smp_store_release(p, v) \ - do { \ - asm volatile ("fence rw,w" : : : "memory"); \ - __XSK_WRITE_ONCE(*p, v); \ - } while (0) -# define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ - asm volatile ("fence r,rw" : : : "memory"); \ - ___p1; \ - }) -#endif - -#ifndef libbpf_smp_store_release -#define libbpf_smp_store_release(p, v) \ - do { \ - __sync_synchronize(); \ - __XSK_WRITE_ONCE(*p, v); \ - } while (0) -#endif - -#ifndef libbpf_smp_load_acquire -#define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ - __sync_synchronize(); \ - ___p1; \ - }) -#endif - -/* LIBRARY INTERNAL -- END */ - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h index a9fdea87b5cd..01c12dca9c10 100644 --- a/tools/lib/bpf/xsk.h +++ b/tools/lib/bpf/xsk.h @@ -3,7 +3,8 @@ /* * AF_XDP user-space access library. * - * Copyright(c) 2018 - 2019 Intel Corporation. + * Copyright (c) 2018 - 2019 Intel Corporation. + * Copyright (c) 2019 Facebook * * Author(s): Magnus Karlsson */ @@ -13,15 +14,80 @@ #include #include +#include #include #include "libbpf.h" -#include "libbpf_util.h" #ifdef __cplusplus extern "C" { #endif +/* Load-Acquire Store-Release barriers used by the XDP socket + * library. The following macros should *NOT* be considered part of + * the xsk.h API, and is subject to change anytime. + * + * LIBRARY INTERNAL + */ + +#define __XSK_READ_ONCE(x) (*(volatile typeof(x) *)&x) +#define __XSK_WRITE_ONCE(x, v) (*(volatile typeof(x) *)&x) = (v) + +#if defined(__i386__) || defined(__x86_64__) +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile("" : : : "memory"); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + asm volatile("" : : : "memory"); \ + ___p1; \ + }) +#elif defined(__aarch64__) +# define libbpf_smp_store_release(p, v) \ + asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory") +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1; \ + asm volatile ("ldar %w0, %1" \ + : "=r" (___p1) : "Q" (*p) : "memory"); \ + ___p1; \ + }) +#elif defined(__riscv) +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile ("fence rw,w" : : : "memory"); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + asm volatile ("fence r,rw" : : : "memory"); \ + ___p1; \ + }) +#endif + +#ifndef libbpf_smp_store_release +#define libbpf_smp_store_release(p, v) \ + do { \ + __sync_synchronize(); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +#endif + +#ifndef libbpf_smp_load_acquire +#define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + __sync_synchronize(); \ + ___p1; \ + }) +#endif + +/* LIBRARY INTERNAL -- END */ + /* Do not access these members directly. Use the functions below. */ #define DEFINE_XSK_RING(name) \ struct name { \ -- cgit v1.2.3 From b046664872dd78a8bebe3d5f3bb9da9baa93f5ca Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 11 Mar 2021 15:23:07 +0100 Subject: static_call: Move struct static_call_key definition to static_call_types.h Having the definition of static_call() in static_call_types.h makes no sense as long struct static_call_key isn't defined there, as the generic implementation of static_call() is referencing this structure. So move the definition of struct static_call_key to static_call_types.h. Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210311142319.4723-3-jgross@suse.com --- include/linux/static_call.h | 18 ------------------ include/linux/static_call_types.h | 18 ++++++++++++++++++ tools/include/linux/static_call_types.h | 18 ++++++++++++++++++ 3 files changed, 36 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 85ecc789f4ff..76b881259144 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -128,16 +128,6 @@ struct static_call_mod { struct static_call_site *sites; }; -struct static_call_key { - void *func; - union { - /* bit 0: 0 = mods, 1 = sites */ - unsigned long type; - struct static_call_mod *mods; - struct static_call_site *sites; - }; -}; - /* For finding the key associated with a trampoline */ struct static_call_tramp_key { s32 tramp; @@ -187,10 +177,6 @@ extern long __static_call_return0(void); static inline int static_call_init(void) { return 0; } -struct static_call_key { - void *func; -}; - #define __DEFINE_STATIC_CALL(name, _func, _func_init) \ DECLARE_STATIC_CALL(name, _func); \ struct static_call_key STATIC_CALL_KEY(name) = { \ @@ -243,10 +229,6 @@ static inline long __static_call_return0(void) static inline int static_call_init(void) { return 0; } -struct static_call_key { - void *func; -}; - static inline long __static_call_return0(void) { return 0; diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h index ae5662d368b9..5a00b8b2cf9f 100644 --- a/include/linux/static_call_types.h +++ b/include/linux/static_call_types.h @@ -58,11 +58,25 @@ struct static_call_site { __raw_static_call(name); \ }) +struct static_call_key { + void *func; + union { + /* bit 0: 0 = mods, 1 = sites */ + unsigned long type; + struct static_call_mod *mods; + struct static_call_site *sites; + }; +}; + #else /* !CONFIG_HAVE_STATIC_CALL_INLINE */ #define __STATIC_CALL_ADDRESSABLE(name) #define __static_call(name) __raw_static_call(name) +struct static_call_key { + void *func; +}; + #endif /* CONFIG_HAVE_STATIC_CALL_INLINE */ #ifdef MODULE @@ -77,6 +91,10 @@ struct static_call_site { #else +struct static_call_key { + void *func; +}; + #define static_call(name) \ ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func)) diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h index ae5662d368b9..5a00b8b2cf9f 100644 --- a/tools/include/linux/static_call_types.h +++ b/tools/include/linux/static_call_types.h @@ -58,11 +58,25 @@ struct static_call_site { __raw_static_call(name); \ }) +struct static_call_key { + void *func; + union { + /* bit 0: 0 = mods, 1 = sites */ + unsigned long type; + struct static_call_mod *mods; + struct static_call_site *sites; + }; +}; + #else /* !CONFIG_HAVE_STATIC_CALL_INLINE */ #define __STATIC_CALL_ADDRESSABLE(name) #define __static_call(name) __raw_static_call(name) +struct static_call_key { + void *func; +}; + #endif /* CONFIG_HAVE_STATIC_CALL_INLINE */ #ifdef MODULE @@ -77,6 +91,10 @@ struct static_call_site { #else +struct static_call_key { + void *func; +}; + #define static_call(name) \ ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func)) -- cgit v1.2.3 From a605c8f4e71c35fddb9b13785f4ea5c24e273aa2 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 15 Feb 2021 12:40:41 +0200 Subject: tools: iio: make iioutils_get_type() private in iio_utils This is a bit of a tidy-up, but also helps with extending the iioutils_get_type() function a bit, as we don't need to use it outside of the iio_utils.c file. So, we'll need to update it only in one place. With this change, the 'unsigned' types are updated to 'unsigned int' in the iioutils_get_type() function definition. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20210215104043.91251-23-alexandru.ardelean@analog.com Signed-off-by: Jonathan Cameron --- tools/iio/iio_utils.c | 9 +++++---- tools/iio/iio_utils.h | 4 ---- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/iio/iio_utils.c b/tools/iio/iio_utils.c index 7399eb7f1378..a96002f2c2d5 100644 --- a/tools/iio/iio_utils.c +++ b/tools/iio/iio_utils.c @@ -82,10 +82,11 @@ int iioutils_break_up_name(const char *full_name, char **generic_name) * * Returns a value >= 0 on success, otherwise a negative error code. **/ -int iioutils_get_type(unsigned *is_signed, unsigned *bytes, unsigned *bits_used, - unsigned *shift, uint64_t *mask, unsigned *be, - const char *device_dir, const char *name, - const char *generic_name) +static int iioutils_get_type(unsigned int *is_signed, unsigned int *bytes, + unsigned int *bits_used, unsigned int *shift, + uint64_t *mask, unsigned int *be, + const char *device_dir, const char *name, + const char *generic_name) { FILE *sysfsfp; int ret; diff --git a/tools/iio/iio_utils.h b/tools/iio/iio_utils.h index 74bde4fde2c8..a5d0aa8a57d3 100644 --- a/tools/iio/iio_utils.h +++ b/tools/iio/iio_utils.h @@ -57,10 +57,6 @@ static inline int iioutils_check_suffix(const char *str, const char *suffix) } int iioutils_break_up_name(const char *full_name, char **generic_name); -int iioutils_get_type(unsigned *is_signed, unsigned *bytes, unsigned *bits_used, - unsigned *shift, uint64_t *mask, unsigned *be, - const char *device_dir, const char *name, - const char *generic_name); int iioutils_get_param_float(float *output, const char *param_name, const char *device_dir, const char *name, const char *generic_name); -- cgit v1.2.3 From ebe5112535b5cf389ca7d337cf6a0c1d885f9880 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 15 Feb 2021 12:40:42 +0200 Subject: tools: iio: privatize globals and functions in iio_generic_buffer.c file Mostly a tidy-up. But also helps to understand the limits of scope of these functions and globals. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20210215104043.91251-24-alexandru.ardelean@analog.com Signed-off-by: Jonathan Cameron --- tools/iio/iio_generic_buffer.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/iio/iio_generic_buffer.c b/tools/iio/iio_generic_buffer.c index 34d63bcebcd2..7c7240553777 100644 --- a/tools/iio/iio_generic_buffer.c +++ b/tools/iio/iio_generic_buffer.c @@ -49,7 +49,7 @@ enum autochan { * Has the side effect of filling the channels[i].location values used * in processing the buffer output. **/ -int size_from_channelarray(struct iio_channel_info *channels, int num_channels) +static int size_from_channelarray(struct iio_channel_info *channels, int num_channels) { int bytes = 0; int i = 0; @@ -68,7 +68,7 @@ int size_from_channelarray(struct iio_channel_info *channels, int num_channels) return bytes; } -void print1byte(uint8_t input, struct iio_channel_info *info) +static void print1byte(uint8_t input, struct iio_channel_info *info) { /* * Shift before conversion to avoid sign extension @@ -85,7 +85,7 @@ void print1byte(uint8_t input, struct iio_channel_info *info) } } -void print2byte(uint16_t input, struct iio_channel_info *info) +static void print2byte(uint16_t input, struct iio_channel_info *info) { /* First swap if incorrect endian */ if (info->be) @@ -108,7 +108,7 @@ void print2byte(uint16_t input, struct iio_channel_info *info) } } -void print4byte(uint32_t input, struct iio_channel_info *info) +static void print4byte(uint32_t input, struct iio_channel_info *info) { /* First swap if incorrect endian */ if (info->be) @@ -131,7 +131,7 @@ void print4byte(uint32_t input, struct iio_channel_info *info) } } -void print8byte(uint64_t input, struct iio_channel_info *info) +static void print8byte(uint64_t input, struct iio_channel_info *info) { /* First swap if incorrect endian */ if (info->be) @@ -167,9 +167,8 @@ void print8byte(uint64_t input, struct iio_channel_info *info) * to fill the location offsets. * @num_channels: number of channels **/ -void process_scan(char *data, - struct iio_channel_info *channels, - int num_channels) +static void process_scan(char *data, struct iio_channel_info *channels, + int num_channels) { int k; @@ -238,7 +237,7 @@ static int enable_disable_all_channels(char *dev_dir_name, int enable) return 0; } -void print_usage(void) +static void print_usage(void) { fprintf(stderr, "Usage: generic_buffer [options]...\n" "Capture, convert and output data from IIO device buffer\n" @@ -257,12 +256,12 @@ void print_usage(void) " -w Set delay between reads in us (event-less mode)\n"); } -enum autochan autochannels = AUTOCHANNELS_DISABLED; -char *dev_dir_name = NULL; -char *buf_dir_name = NULL; -bool current_trigger_set = false; +static enum autochan autochannels = AUTOCHANNELS_DISABLED; +static char *dev_dir_name = NULL; +static char *buf_dir_name = NULL; +static bool current_trigger_set = false; -void cleanup(void) +static void cleanup(void) { int ret; @@ -294,14 +293,14 @@ void cleanup(void) } } -void sig_handler(int signum) +static void sig_handler(int signum) { fprintf(stderr, "Caught signal %d\n", signum); cleanup(); exit(-signum); } -void register_cleanup(void) +static void register_cleanup(void) { struct sigaction sa = { .sa_handler = sig_handler }; const int signums[] = { SIGINT, SIGTERM, SIGABRT }; -- cgit v1.2.3 From 8827faab2c8b52e848071a039a945db6f3ae8365 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 15 Feb 2021 12:40:43 +0200 Subject: tools: iio: convert iio_generic_buffer to use new IIO buffer API This change makes use of the new IIO buffer API to read data from an IIO buffer. It doesn't read the /sys/bus/iio/devices/iio:deviceX/scan_elements dir anymore, it reads /sys/bus/iio/devices/iio:deviceX/bufferY, where all the scan_elements have been merged together with the old/classical buffer attributes. And it makes use of the new IIO_BUFFER_GET_FD_IOCTL ioctl to get an FD for the IIO buffer for which to read data from. It also does a quick sanity check to see that -EBUSY is returned if reading the chardev after the ioctl() has succeeded. This was tested with the following cases: 1. Tested buffer0 works with ioctl() 2. Tested that buffer0 can't be opened via /dev/iio:deviceX after ioctl() This check should be omitted under normal operation; it's being done here to check that the driver change is sane 3. Moved valid buffer0 to be buffer1, and tested that data comes from it Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20210215104043.91251-25-alexandru.ardelean@analog.com Signed-off-by: Jonathan Cameron --- tools/iio/Makefile | 1 + tools/iio/iio_generic_buffer.c | 122 ++++++++++++++++++++++++++++++++--------- tools/iio/iio_utils.c | 13 +++-- tools/iio/iio_utils.h | 4 +- 4 files changed, 107 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/iio/Makefile b/tools/iio/Makefile index 3de763d9ab70..5d12ac4e7f8f 100644 --- a/tools/iio/Makefile +++ b/tools/iio/Makefile @@ -27,6 +27,7 @@ include $(srctree)/tools/build/Makefile.include # $(OUTPUT)include/linux/iio: ../../include/uapi/linux/iio mkdir -p $(OUTPUT)include/linux/iio 2>&1 || true + ln -sf $(CURDIR)/../../include/uapi/linux/iio/buffer.h $@ ln -sf $(CURDIR)/../../include/uapi/linux/iio/events.h $@ ln -sf $(CURDIR)/../../include/uapi/linux/iio/types.h $@ diff --git a/tools/iio/iio_generic_buffer.c b/tools/iio/iio_generic_buffer.c index 7c7240553777..2491c54a5e4f 100644 --- a/tools/iio/iio_generic_buffer.c +++ b/tools/iio/iio_generic_buffer.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include "iio_utils.h" /** @@ -197,7 +199,7 @@ static void process_scan(char *data, struct iio_channel_info *channels, printf("\n"); } -static int enable_disable_all_channels(char *dev_dir_name, int enable) +static int enable_disable_all_channels(char *dev_dir_name, int buffer_idx, int enable) { const struct dirent *ent; char scanelemdir[256]; @@ -205,7 +207,7 @@ static int enable_disable_all_channels(char *dev_dir_name, int enable) int ret; snprintf(scanelemdir, sizeof(scanelemdir), - FORMAT_SCAN_ELEMENTS_DIR, dev_dir_name); + FORMAT_SCAN_ELEMENTS_DIR, dev_dir_name, buffer_idx); scanelemdir[sizeof(scanelemdir)-1] = '\0'; dp = opendir(scanelemdir); @@ -243,6 +245,7 @@ static void print_usage(void) "Capture, convert and output data from IIO device buffer\n" " -a Auto-activate all available channels\n" " -A Force-activate ALL channels\n" + " -b The buffer which to open (by index), default 0\n" " -c Do n conversions, or loop forever if n < 0\n" " -e Disable wait for event (new data)\n" " -g Use trigger-less mode\n" @@ -259,6 +262,7 @@ static void print_usage(void) static enum autochan autochannels = AUTOCHANNELS_DISABLED; static char *dev_dir_name = NULL; static char *buf_dir_name = NULL; +static int buffer_idx = 0; static bool current_trigger_set = false; static void cleanup(void) @@ -286,7 +290,7 @@ static void cleanup(void) /* Disable channels if auto-enabled */ if (dev_dir_name && autochannels == AUTOCHANNELS_ACTIVE) { - ret = enable_disable_all_channels(dev_dir_name, 0); + ret = enable_disable_all_channels(dev_dir_name, buffer_idx, 0); if (ret) fprintf(stderr, "Failed to disable all channels\n"); autochannels = AUTOCHANNELS_DISABLED; @@ -333,7 +337,9 @@ int main(int argc, char **argv) unsigned long long j; unsigned long toread; int ret, c; - int fp = -1; + struct stat st; + int fd = -1; + int buf_fd = -1; int num_channels = 0; char *trigger_name = NULL, *device_name = NULL; @@ -352,7 +358,7 @@ int main(int argc, char **argv) register_cleanup(); - while ((c = getopt_long(argc, argv, "aAc:egl:n:N:t:T:w:?", longopts, + while ((c = getopt_long(argc, argv, "aAb:c:egl:n:N:t:T:w:?", longopts, NULL)) != -1) { switch (c) { case 'a': @@ -361,7 +367,20 @@ int main(int argc, char **argv) case 'A': autochannels = AUTOCHANNELS_ENABLED; force_autochannels = true; - break; + break; + case 'b': + errno = 0; + buffer_idx = strtoll(optarg, &dummy, 10); + if (errno) { + ret = -errno; + goto error; + } + if (buffer_idx < 0) { + ret = -ERANGE; + goto error; + } + + break; case 'c': errno = 0; num_loops = strtoll(optarg, &dummy, 10); @@ -518,7 +537,7 @@ int main(int argc, char **argv) * Parse the files in scan_elements to identify what channels are * present */ - ret = build_channel_array(dev_dir_name, &channels, &num_channels); + ret = build_channel_array(dev_dir_name, buffer_idx, &channels, &num_channels); if (ret) { fprintf(stderr, "Problem reading scan element information\n" "diag %s\n", dev_dir_name); @@ -535,7 +554,7 @@ int main(int argc, char **argv) (autochannels == AUTOCHANNELS_ENABLED && force_autochannels)) { fprintf(stderr, "Enabling all channels\n"); - ret = enable_disable_all_channels(dev_dir_name, 1); + ret = enable_disable_all_channels(dev_dir_name, buffer_idx, 1); if (ret) { fprintf(stderr, "Failed to enable all channels\n"); goto error; @@ -544,7 +563,7 @@ int main(int argc, char **argv) /* This flags that we need to disable the channels again */ autochannels = AUTOCHANNELS_ACTIVE; - ret = build_channel_array(dev_dir_name, &channels, + ret = build_channel_array(dev_dir_name, buffer_idx, &channels, &num_channels); if (ret) { fprintf(stderr, "Problem reading scan element " @@ -565,7 +584,7 @@ int main(int argc, char **argv) fprintf(stderr, "Enable channels manually in " FORMAT_SCAN_ELEMENTS_DIR "/*_en or pass -a to autoenable channels and " - "try again.\n", dev_dir_name); + "try again.\n", dev_dir_name, buffer_idx); ret = -ENOENT; goto error; } @@ -576,12 +595,25 @@ int main(int argc, char **argv) * be built rather than found. */ ret = asprintf(&buf_dir_name, - "%siio:device%d/buffer", iio_dir, dev_num); + "%siio:device%d/buffer%d", iio_dir, dev_num, buffer_idx); if (ret < 0) { ret = -ENOMEM; goto error; } + if (stat(buf_dir_name, &st)) { + fprintf(stderr, "Could not stat() '%s', got error %d: %s\n", + buf_dir_name, errno, strerror(errno)); + ret = -errno; + goto error; + } + + if (!S_ISDIR(st.st_mode)) { + fprintf(stderr, "File '%s' is not a directory\n", buf_dir_name); + ret = -EFAULT; + goto error; + } + if (!notrigger) { printf("%s %s\n", dev_dir_name, trigger_name); /* @@ -598,6 +630,35 @@ int main(int argc, char **argv) } } + ret = asprintf(&buffer_access, "/dev/iio:device%d", dev_num); + if (ret < 0) { + ret = -ENOMEM; + goto error; + } + + /* Attempt to open non blocking the access dev */ + fd = open(buffer_access, O_RDONLY | O_NONBLOCK); + if (fd == -1) { /* TODO: If it isn't there make the node */ + ret = -errno; + fprintf(stderr, "Failed to open %s\n", buffer_access); + goto error; + } + + /* specify for which buffer index we want an FD */ + buf_fd = buffer_idx; + + ret = ioctl(fd, IIO_BUFFER_GET_FD_IOCTL, &buf_fd); + if (ret == -1 || buf_fd == -1) { + ret = -errno; + if (ret == -ENODEV || ret == -EINVAL) + fprintf(stderr, + "Device does not have this many buffers\n"); + else + fprintf(stderr, "Failed to retrieve buffer fd\n"); + + goto error; + } + /* Setup ring buffer parameters */ ret = write_sysfs_int("length", buf_dir_name, buf_len); if (ret < 0) @@ -607,7 +668,8 @@ int main(int argc, char **argv) ret = write_sysfs_int("enable", buf_dir_name, 1); if (ret < 0) { fprintf(stderr, - "Failed to enable buffer: %s\n", strerror(-ret)); + "Failed to enable buffer '%s': %s\n", + buf_dir_name, strerror(-ret)); goto error; } @@ -618,24 +680,30 @@ int main(int argc, char **argv) goto error; } - ret = asprintf(&buffer_access, "/dev/iio:device%d", dev_num); - if (ret < 0) { - ret = -ENOMEM; - goto error; + /** + * This check is being done here for sanity reasons, however it + * should be omitted under normal operation. + * If this is buffer0, we check that we get EBUSY after this point. + */ + if (buffer_idx == 0) { + errno = 0; + read_size = read(fd, data, 1); + if (read_size > -1 || errno != EBUSY) { + ret = -EFAULT; + perror("Reading from '%s' should not be possible after ioctl()"); + goto error; + } } - /* Attempt to open non blocking the access dev */ - fp = open(buffer_access, O_RDONLY | O_NONBLOCK); - if (fp == -1) { /* TODO: If it isn't there make the node */ - ret = -errno; - fprintf(stderr, "Failed to open %s\n", buffer_access); - goto error; - } + /* close now the main chardev FD and let the buffer FD work */ + if (close(fd) == -1) + perror("Failed to close character device file"); + fd = -1; for (j = 0; j < num_loops || num_loops < 0; j++) { if (!noevents) { struct pollfd pfd = { - .fd = fp, + .fd = buf_fd, .events = POLLIN, }; @@ -653,7 +721,7 @@ int main(int argc, char **argv) toread = 64; } - read_size = read(fp, data, toread * scan_size); + read_size = read(buf_fd, data, toread * scan_size); if (read_size < 0) { if (errno == EAGAIN) { fprintf(stderr, "nothing available\n"); @@ -670,7 +738,9 @@ int main(int argc, char **argv) error: cleanup(); - if (fp >= 0 && close(fp) == -1) + if (fd >= 0 && close(fd) == -1) + perror("Failed to close character device"); + if (buf_fd >= 0 && close(buf_fd) == -1) perror("Failed to close buffer"); free(buffer_access); free(data); diff --git a/tools/iio/iio_utils.c b/tools/iio/iio_utils.c index a96002f2c2d5..aadee6d34c74 100644 --- a/tools/iio/iio_utils.c +++ b/tools/iio/iio_utils.c @@ -77,6 +77,7 @@ int iioutils_break_up_name(const char *full_name, char **generic_name) * @mask: output a bit mask for the raw data * @be: output if data in big endian * @device_dir: the IIO device directory + * @buffer_idx: the IIO buffer index * @name: the channel name * @generic_name: the channel type name * @@ -85,8 +86,8 @@ int iioutils_break_up_name(const char *full_name, char **generic_name) static int iioutils_get_type(unsigned int *is_signed, unsigned int *bytes, unsigned int *bits_used, unsigned int *shift, uint64_t *mask, unsigned int *be, - const char *device_dir, const char *name, - const char *generic_name) + const char *device_dir, int buffer_idx, + const char *name, const char *generic_name) { FILE *sysfsfp; int ret; @@ -96,7 +97,7 @@ static int iioutils_get_type(unsigned int *is_signed, unsigned int *bytes, unsigned padint; const struct dirent *ent; - ret = asprintf(&scan_el_dir, FORMAT_SCAN_ELEMENTS_DIR, device_dir); + ret = asprintf(&scan_el_dir, FORMAT_SCAN_ELEMENTS_DIR, device_dir, buffer_idx); if (ret < 0) return -ENOMEM; @@ -304,12 +305,13 @@ void bsort_channel_array_by_index(struct iio_channel_info *ci_array, int cnt) /** * build_channel_array() - function to figure out what channels are present * @device_dir: the IIO device directory in sysfs + * @buffer_idx: the IIO buffer for this channel array * @ci_array: output the resulting array of iio_channel_info * @counter: output the amount of array elements * * Returns 0 on success, otherwise a negative error code. **/ -int build_channel_array(const char *device_dir, +int build_channel_array(const char *device_dir, int buffer_idx, struct iio_channel_info **ci_array, int *counter) { DIR *dp; @@ -322,7 +324,7 @@ int build_channel_array(const char *device_dir, char *filename; *counter = 0; - ret = asprintf(&scan_el_dir, FORMAT_SCAN_ELEMENTS_DIR, device_dir); + ret = asprintf(&scan_el_dir, FORMAT_SCAN_ELEMENTS_DIR, device_dir, buffer_idx); if (ret < 0) return -ENOMEM; @@ -503,6 +505,7 @@ int build_channel_array(const char *device_dir, ¤t->mask, ¤t->be, device_dir, + buffer_idx, current->name, current->generic_name); if (ret < 0) diff --git a/tools/iio/iio_utils.h b/tools/iio/iio_utils.h index a5d0aa8a57d3..336752cade4f 100644 --- a/tools/iio/iio_utils.h +++ b/tools/iio/iio_utils.h @@ -12,7 +12,7 @@ /* Made up value to limit allocation sizes */ #define IIO_MAX_NAME_LENGTH 64 -#define FORMAT_SCAN_ELEMENTS_DIR "%s/scan_elements" +#define FORMAT_SCAN_ELEMENTS_DIR "%s/buffer%d" #define FORMAT_TYPE_FILE "%s_type" #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) @@ -61,7 +61,7 @@ int iioutils_get_param_float(float *output, const char *param_name, const char *device_dir, const char *name, const char *generic_name); void bsort_channel_array_by_index(struct iio_channel_info *ci_array, int cnt); -int build_channel_array(const char *device_dir, +int build_channel_array(const char *device_dir, int buffer_idx, struct iio_channel_info **ci_array, int *counter); int find_type_by_name(const char *name, const char *type); int write_sysfs_int(const char *filename, const char *basedir, int val); -- cgit v1.2.3 From 8e815284a5f9351973a5860447941823acf1182d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 12 Mar 2021 17:50:21 +0100 Subject: selftests: fib_nexthops: Declutter test output Before: # ./fib_nexthops.sh -t ipv4_torture IPv4 runtime torture -------------------- TEST: IPv4 torture test [ OK ] ./fib_nexthops.sh: line 213: 19376 Killed ipv4_del_add_loop1 ./fib_nexthops.sh: line 213: 19377 Killed ipv4_grp_replace_loop ./fib_nexthops.sh: line 213: 19378 Killed ip netns exec me ping -f 172.16.101.1 > /dev/null 2>&1 ./fib_nexthops.sh: line 213: 19380 Killed ip netns exec me ping -f 172.16.101.2 > /dev/null 2>&1 ./fib_nexthops.sh: line 213: 19381 Killed ip netns exec me mausezahn veth1 -B 172.16.101.2 -A 172.16.1.1 -c 0 -t tcp "dp=1-1023, flags=syn" > /dev/null 2>&1 Tests passed: 1 Tests failed: 0 # ./fib_nexthops.sh -t ipv6_torture IPv6 runtime torture -------------------- TEST: IPv6 torture test [ OK ] ./fib_nexthops.sh: line 213: 24453 Killed ipv6_del_add_loop1 ./fib_nexthops.sh: line 213: 24454 Killed ipv6_grp_replace_loop ./fib_nexthops.sh: line 213: 24456 Killed ip netns exec me ping -f 2001:db8:101::1 > /dev/null 2>&1 ./fib_nexthops.sh: line 213: 24457 Killed ip netns exec me ping -f 2001:db8:101::2 > /dev/null 2>&1 ./fib_nexthops.sh: line 213: 24458 Killed ip netns exec me mausezahn -6 veth1 -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 -t tcp "dp=1-1023, flags=syn" > /dev/null 2>&1 Tests passed: 1 Tests failed: 0 After: # ./fib_nexthops.sh -t ipv4_torture IPv4 runtime torture -------------------- TEST: IPv4 torture test [ OK ] Tests passed: 1 Tests failed: 0 # ./fib_nexthops.sh -t ipv6_torture IPv6 runtime torture -------------------- TEST: IPv6 torture test [ OK ] Tests passed: 1 Tests failed: 0 Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index d98fb85e201c..91226ac50112 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -874,6 +874,7 @@ ipv6_torture() sleep 300 kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + wait $pid1 $pid2 $pid3 $pid4 $pid5 2>/dev/null # if we did not crash, success log_test 0 0 "IPv6 torture test" @@ -1476,6 +1477,7 @@ ipv4_torture() sleep 300 kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + wait $pid1 $pid2 $pid3 $pid4 $pid5 2>/dev/null # if we did not crash, success log_test 0 0 "IPv4 torture test" -- cgit v1.2.3 From a8f9952d218d816ff1a13c9385edd821a8da527d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 12 Mar 2021 17:50:22 +0100 Subject: selftests: fib_nexthops: List each test case in a different line The lines with the IPv4 and IPv6 test cases are already very long and more test cases will be added in subsequent patches. List each test case in a different line to make it easier to extend the test with more test cases. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 30 +++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 91226ac50112..c840aa88ff18 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,10 +19,32 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal ipv4_torture" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp ipv6_compat_mode ipv6_fdb_grp_fcnal ipv6_torture" - -ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" +IPV4_TESTS=" + ipv4_fcnal + ipv4_grp_fcnal + ipv4_withv6_fcnal + ipv4_fcnal_runtime + ipv4_large_grp + ipv4_compat_mode + ipv4_fdb_grp_fcnal + ipv4_torture +" + +IPV6_TESTS=" + ipv6_fcnal + ipv6_grp_fcnal + ipv6_fcnal_runtime + ipv6_large_grp + ipv6_compat_mode + ipv6_fdb_grp_fcnal + ipv6_torture +" + +ALL_TESTS=" + basic + ${IPV4_TESTS} + ${IPV6_TESTS} +" TESTS="${ALL_TESTS}" VERBOSE=0 PAUSE_ON_FAIL=no -- cgit v1.2.3 From 557205f47dc47afeaf80707845cda8ec79ea4cb0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 12 Mar 2021 17:50:23 +0100 Subject: selftests: fib_nexthops: Test resilient nexthop groups Add test cases for resilient nexthop groups. Exhaustive forwarding tests are added separately under net/forwarding/. Examples: # ./fib_nexthops.sh -t basic_res Basic resilient nexthop group functional tests ---------------------------------------------- TEST: Add a nexthop group with default parameters [ OK ] TEST: Get a nexthop group with default parameters [ OK ] TEST: Get a nexthop group with non-default parameters [ OK ] TEST: Add a nexthop group with 0 buckets [ OK ] TEST: Replace nexthop group parameters [ OK ] TEST: Get a nexthop group after replacing parameters [ OK ] TEST: Replace idle timer [ OK ] TEST: Get a nexthop group after replacing idle timer [ OK ] TEST: Replace unbalanced timer [ OK ] TEST: Get a nexthop group after replacing unbalanced timer [ OK ] TEST: Replace with no parameters [ OK ] TEST: Get a nexthop group after replacing no parameters [ OK ] TEST: Replace nexthop group type - implicit [ OK ] TEST: Replace nexthop group type - explicit [ OK ] TEST: Replace number of nexthop buckets [ OK ] TEST: Get a nexthop group after replacing with invalid parameters [ OK ] TEST: Dump all nexthop buckets [ OK ] TEST: Dump all nexthop buckets in a group [ OK ] TEST: Dump all nexthop buckets with a specific nexthop device [ OK ] TEST: Dump all nexthop buckets with a specific nexthop identifier [ OK ] TEST: Dump all nexthop buckets in a non-existent group [ OK ] TEST: Dump all nexthop buckets in a non-resilient group [ OK ] TEST: Dump all nexthop buckets using a non-existent device [ OK ] TEST: Dump all nexthop buckets with invalid 'groups' keyword [ OK ] TEST: Dump all nexthop buckets with invalid 'fdb' keyword [ OK ] TEST: Get a valid nexthop bucket [ OK ] TEST: Get a nexthop bucket with valid group, but invalid index [ OK ] TEST: Get a nexthop bucket from a non-resilient group [ OK ] TEST: Get a nexthop bucket from a non-existent group [ OK ] Tests passed: 29 Tests failed: 0 # ./fib_nexthops.sh -t ipv4_large_res_grp IPv4 large resilient group (128k buckets) ----------------------------------------- TEST: Dump large (x131072) nexthop buckets [ OK ] Tests passed: 1 Tests failed: 0 # ./fib_nexthops.sh -t ipv6_large_res_grp IPv6 large resilient group (128k buckets) ----------------------------------------- TEST: Dump large (x131072) nexthop buckets [ OK ] Tests passed: 1 Tests failed: 0 # ./fib_nexthops.sh -t ipv4_res_torture IPv4 runtime resilient nexthop group torture -------------------------------------------- TEST: IPv4 resilient nexthop group torture test [ OK ] Tests passed: 1 Tests failed: 0 # ./fib_nexthops.sh -t ipv6_res_torture IPv6 runtime resilient nexthop group torture -------------------------------------------- TEST: IPv6 resilient nexthop group torture test [ OK ] Tests passed: 1 Tests failed: 0 # ./fib_nexthops.sh -t ipv4_res_grp_fcnal IPv4 resilient groups functional -------------------------------- TEST: Nexthop group updated when entry is deleted [ OK ] TEST: Nexthop buckets updated when entry is deleted [ OK ] TEST: Nexthop group updated after replace [ OK ] TEST: Nexthop buckets updated after replace [ OK ] TEST: Nexthop group updated when entry is deleted - nECMP [ OK ] TEST: Nexthop buckets updated when entry is deleted - nECMP [ OK ] TEST: Nexthop group updated after replace - nECMP [ OK ] TEST: Nexthop buckets updated after replace - nECMP [ OK ] Tests passed: 8 Tests failed: 0 # ./fib_nexthops.sh -t ipv6_res_grp_fcnal IPv6 resilient groups functional -------------------------------- TEST: Nexthop group updated when entry is deleted [ OK ] TEST: Nexthop buckets updated when entry is deleted [ OK ] TEST: Nexthop group updated after replace [ OK ] TEST: Nexthop buckets updated after replace [ OK ] TEST: Nexthop group updated when entry is deleted - nECMP [ OK ] TEST: Nexthop buckets updated when entry is deleted - nECMP [ OK ] TEST: Nexthop group updated after replace - nECMP [ OK ] TEST: Nexthop buckets updated after replace - nECMP [ OK ] Tests passed: 8 Tests failed: 0 Signed-off-by: Ido Schimmel Co-developed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 517 ++++++++++++++++++++++++++++ 1 file changed, 517 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index c840aa88ff18..56dd0c6f2e96 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -22,26 +22,33 @@ ksft_skip=4 IPV4_TESTS=" ipv4_fcnal ipv4_grp_fcnal + ipv4_res_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_large_grp + ipv4_large_res_grp ipv4_compat_mode ipv4_fdb_grp_fcnal ipv4_torture + ipv4_res_torture " IPV6_TESTS=" ipv6_fcnal ipv6_grp_fcnal + ipv6_res_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp + ipv6_large_res_grp ipv6_compat_mode ipv6_fdb_grp_fcnal ipv6_torture + ipv6_res_torture " ALL_TESTS=" basic + basic_res ${IPV4_TESTS} ${IPV6_TESTS} " @@ -254,6 +261,19 @@ check_nexthop() check_output "${out}" "${expected}" } +check_nexthop_bucket() +{ + local nharg="$1" + local expected="$2" + local out + + # remove the idle time since we cannot match it + out=$($IP nexthop bucket ${nharg} \ + | sed s/idle_time\ [0-9.]*\ // 2>/dev/null) + + check_output "${out}" "${expected}" +} + check_route() { local pfx="$1" @@ -330,6 +350,25 @@ check_large_grp() log_test $? 0 "Dump large (x$ecmp) ecmp groups" } +check_large_res_grp() +{ + local ipv=$1 + local buckets=$2 + local ipstr="" + + if [ $ipv -eq 4 ]; then + ipstr="172.16.1.2" + else + ipstr="2001:db8:91::2" + fi + + # create a resilient group with $buckets buckets and dump them + run_cmd "$IP nexthop add id 100 via $ipstr dev veth1" + run_cmd "$IP nexthop add id 1000 group 100 type resilient buckets $buckets" + run_cmd "$IP nexthop bucket list" + log_test $? 0 "Dump large (x$buckets) nexthop buckets" +} + start_ip_monitor() { local mtype=$1 @@ -366,6 +405,15 @@ check_nexthop_fdb_support() fi } +check_nexthop_res_support() +{ + $IP nexthop help 2>&1 | grep -q resilient + if [ $? -ne 0 ]; then + echo "SKIP: iproute2 too old, missing resilient nexthop group support" + return $ksft_skip + fi +} + ipv6_fdb_grp_fcnal() { local rc @@ -688,6 +736,70 @@ ipv6_grp_fcnal() log_test $? 2 "Nexthop group can not have a blackhole and another nexthop" } +ipv6_res_grp_fcnal() +{ + local rc + + echo + echo "IPv6 resilient groups functional" + echo "--------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # + # migration of nexthop buckets - equal weights + # + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 102 group 62/63 type resilient buckets 2 idle_timer 0" + + run_cmd "$IP nexthop del id 63" + check_nexthop "id 102" \ + "id 102 group 62 type resilient buckets 2 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated when entry is deleted" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 62 id 102 index 1 nhid 62" + log_test $? 0 "Nexthop buckets updated when entry is deleted" + + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop replace id 102 group 62/63 type resilient buckets 2 idle_timer 0" + check_nexthop "id 102" \ + "id 102 group 62/63 type resilient buckets 2 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated after replace" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 63 id 102 index 1 nhid 62" + log_test $? 0 "Nexthop buckets updated after replace" + + $IP nexthop flush >/dev/null 2>&1 + + # + # migration of nexthop buckets - unequal weights + # + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 102 group 62,3/63,1 type resilient buckets 4 idle_timer 0" + + run_cmd "$IP nexthop del id 63" + check_nexthop "id 102" \ + "id 102 group 62,3 type resilient buckets 4 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated when entry is deleted - nECMP" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 62 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62" + log_test $? 0 "Nexthop buckets updated when entry is deleted - nECMP" + + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop replace id 102 group 62,3/63,1 type resilient buckets 4 idle_timer 0" + check_nexthop "id 102" \ + "id 102 group 62,3/63 type resilient buckets 4 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated after replace - nECMP" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 63 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62" + log_test $? 0 "Nexthop buckets updated after replace - nECMP" +} + ipv6_fcnal_runtime() { local rc @@ -846,6 +958,22 @@ ipv6_large_grp() $IP nexthop flush >/dev/null 2>&1 } +ipv6_large_res_grp() +{ + echo + echo "IPv6 large resilient group (128k buckets)" + echo "-----------------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + check_large_res_grp 6 $((128 * 1024)) + + $IP nexthop flush >/dev/null 2>&1 +} + ipv6_del_add_loop1() { while :; do @@ -902,6 +1030,61 @@ ipv6_torture() log_test 0 0 "IPv6 torture test" } +ipv6_res_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 type resilient + done >/dev/null 2>&1 +} + +ipv6_res_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv6 runtime resilient nexthop group torture" + echo "--------------------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101 type resilient buckets 512 idle_timer 0" + run_cmd "$IP route add 2001:db8:101::1 nhid 102" + run_cmd "$IP route add 2001:db8:101::2 nhid 102" + + ipv6_del_add_loop1 & + pid1=$! + ipv6_res_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 2001:db8:101::1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 2001:db8:101::2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn -6 veth1 \ + -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 \ + -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + wait $pid1 $pid2 $pid3 $pid4 $pid5 2>/dev/null + + # if we did not crash, success + log_test 0 0 "IPv6 resilient nexthop group torture test" +} ipv4_fcnal() { @@ -1061,6 +1244,70 @@ ipv4_grp_fcnal() log_test $? 2 "Nexthop group can not have a blackhole and another nexthop" } +ipv4_res_grp_fcnal() +{ + local rc + + echo + echo "IPv4 resilient groups functional" + echo "--------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + # + # migration of nexthop buckets - equal weights + # + run_cmd "$IP nexthop add id 12 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 13 via 172.16.1.3 dev veth1" + run_cmd "$IP nexthop add id 102 group 12/13 type resilient buckets 2 idle_timer 0" + + run_cmd "$IP nexthop del id 13" + check_nexthop "id 102" \ + "id 102 group 12 type resilient buckets 2 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated when entry is deleted" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 12 id 102 index 1 nhid 12" + log_test $? 0 "Nexthop buckets updated when entry is deleted" + + run_cmd "$IP nexthop add id 13 via 172.16.1.3 dev veth1" + run_cmd "$IP nexthop replace id 102 group 12/13 type resilient buckets 2 idle_timer 0" + check_nexthop "id 102" \ + "id 102 group 12/13 type resilient buckets 2 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated after replace" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 13 id 102 index 1 nhid 12" + log_test $? 0 "Nexthop buckets updated after replace" + + $IP nexthop flush >/dev/null 2>&1 + + # + # migration of nexthop buckets - unequal weights + # + run_cmd "$IP nexthop add id 12 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 13 via 172.16.1.3 dev veth1" + run_cmd "$IP nexthop add id 102 group 12,3/13,1 type resilient buckets 4 idle_timer 0" + + run_cmd "$IP nexthop del id 13" + check_nexthop "id 102" \ + "id 102 group 12,3 type resilient buckets 4 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated when entry is deleted - nECMP" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 12 id 102 index 1 nhid 12 id 102 index 2 nhid 12 id 102 index 3 nhid 12" + log_test $? 0 "Nexthop buckets updated when entry is deleted - nECMP" + + run_cmd "$IP nexthop add id 13 via 172.16.1.3 dev veth1" + run_cmd "$IP nexthop replace id 102 group 12,3/13,1 type resilient buckets 4 idle_timer 0" + check_nexthop "id 102" \ + "id 102 group 12,3/13 type resilient buckets 4 idle_timer 0 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Nexthop group updated after replace - nECMP" + check_nexthop_bucket "list id 102" \ + "id 102 index 0 nhid 13 id 102 index 1 nhid 12 id 102 index 2 nhid 12 id 102 index 3 nhid 12" + log_test $? 0 "Nexthop buckets updated after replace - nECMP" +} + ipv4_withv6_fcnal() { local lladdr @@ -1282,6 +1529,22 @@ ipv4_large_grp() $IP nexthop flush >/dev/null 2>&1 } +ipv4_large_res_grp() +{ + echo + echo "IPv4 large resilient group (128k buckets)" + echo "-----------------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + check_large_res_grp 4 $((128 * 1024)) + + $IP nexthop flush >/dev/null 2>&1 +} + sysctl_nexthop_compat_mode_check() { local sysctlname="net.ipv4.nexthop_compat_mode" @@ -1505,6 +1768,62 @@ ipv4_torture() log_test 0 0 "IPv4 torture test" } +ipv4_res_grp_replace_loop() +{ + while :; do + $IP nexthop replace id 102 group 100/101 type resilient + done >/dev/null 2>&1 +} + +ipv4_res_torture() +{ + local pid1 + local pid2 + local pid3 + local pid4 + local pid5 + + echo + echo "IPv4 runtime resilient nexthop group torture" + echo "--------------------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test; need mausezahn tool" + return + fi + + run_cmd "$IP nexthop add id 100 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 101 via 172.16.2.2 dev veth3" + run_cmd "$IP nexthop add id 102 group 100/101 type resilient buckets 512 idle_timer 0" + run_cmd "$IP route add 172.16.101.1 nhid 102" + run_cmd "$IP route add 172.16.101.2 nhid 102" + + ipv4_del_add_loop1 & + pid1=$! + ipv4_res_grp_replace_loop & + pid2=$! + ip netns exec me ping -f 172.16.101.1 >/dev/null 2>&1 & + pid3=$! + ip netns exec me ping -f 172.16.101.2 >/dev/null 2>&1 & + pid4=$! + ip netns exec me mausezahn veth1 \ + -B 172.16.101.2 -A 172.16.1.1 -c 0 \ + -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + pid5=$! + + sleep 300 + kill -9 $pid1 $pid2 $pid3 $pid4 $pid5 + wait $pid1 $pid2 $pid3 $pid4 $pid5 2>/dev/null + + # if we did not crash, success + log_test 0 0 "IPv4 resilient nexthop group torture test" +} + basic() { echo @@ -1616,6 +1935,204 @@ basic() $IP nexthop flush >/dev/null 2>&1 } +check_nexthop_buckets_balance() +{ + local nharg=$1; shift + local ret + + while (($# > 0)); do + local selector=$1; shift + local condition=$1; shift + local count + + count=$($IP -j nexthop bucket ${nharg} ${selector} | jq length) + (( $count $condition )) + ret=$? + if ((ret != 0)); then + return $ret + fi + done + + return 0 +} + +basic_res() +{ + echo + echo "Basic resilient nexthop group functional tests" + echo "----------------------------------------------" + + check_nexthop_res_support + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 1 dev veth1" + + # + # resilient nexthop group addition + # + + run_cmd "$IP nexthop add id 101 group 1 type resilient buckets 8" + log_test $? 0 "Add a nexthop group with default parameters" + + run_cmd "$IP nexthop get id 101" + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 120 unbalanced_timer 0 unbalanced_time 0" + log_test $? 0 "Get a nexthop group with default parameters" + + run_cmd "$IP nexthop add id 102 group 1 type resilient + buckets 4 idle_timer 100 unbalanced_timer 5" + run_cmd "$IP nexthop get id 102" + check_nexthop "id 102" \ + "id 102 group 1 type resilient buckets 4 idle_timer 100 unbalanced_timer 5 unbalanced_time 0" + log_test $? 0 "Get a nexthop group with non-default parameters" + + run_cmd "$IP nexthop add id 103 group 1 type resilient buckets 0" + log_test $? 2 "Add a nexthop group with 0 buckets" + + # + # resilient nexthop group replacement + # + + run_cmd "$IP nexthop replace id 101 group 1 type resilient + buckets 8 idle_timer 240 unbalanced_timer 80" + log_test $? 0 "Replace nexthop group parameters" + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 240 unbalanced_timer 80 unbalanced_time 0" + log_test $? 0 "Get a nexthop group after replacing parameters" + + run_cmd "$IP nexthop replace id 101 group 1 type resilient idle_timer 512" + log_test $? 0 "Replace idle timer" + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 512 unbalanced_timer 80 unbalanced_time 0" + log_test $? 0 "Get a nexthop group after replacing idle timer" + + run_cmd "$IP nexthop replace id 101 group 1 type resilient unbalanced_timer 256" + log_test $? 0 "Replace unbalanced timer" + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 512 unbalanced_timer 256 unbalanced_time 0" + log_test $? 0 "Get a nexthop group after replacing unbalanced timer" + + run_cmd "$IP nexthop replace id 101 group 1 type resilient" + log_test $? 0 "Replace with no parameters" + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 512 unbalanced_timer 256 unbalanced_time 0" + log_test $? 0 "Get a nexthop group after replacing no parameters" + + run_cmd "$IP nexthop replace id 101 group 1" + log_test $? 2 "Replace nexthop group type - implicit" + + run_cmd "$IP nexthop replace id 101 group 1 type mpath" + log_test $? 2 "Replace nexthop group type - explicit" + + run_cmd "$IP nexthop replace id 101 group 1 type resilient buckets 1024" + log_test $? 2 "Replace number of nexthop buckets" + + check_nexthop "id 101" \ + "id 101 group 1 type resilient buckets 8 idle_timer 512 unbalanced_timer 256 unbalanced_time 0" + log_test $? 0 "Get a nexthop group after replacing with invalid parameters" + + # + # resilient nexthop buckets dump + # + + $IP nexthop flush >/dev/null 2>&1 + run_cmd "$IP nexthop add id 1 dev veth1" + run_cmd "$IP nexthop add id 2 dev veth3" + run_cmd "$IP nexthop add id 101 group 1/2 type resilient buckets 4" + run_cmd "$IP nexthop add id 201 group 1/2" + + check_nexthop_bucket "" \ + "id 101 index 0 nhid 2 id 101 index 1 nhid 2 id 101 index 2 nhid 1 id 101 index 3 nhid 1" + log_test $? 0 "Dump all nexthop buckets" + + check_nexthop_bucket "list id 101" \ + "id 101 index 0 nhid 2 id 101 index 1 nhid 2 id 101 index 2 nhid 1 id 101 index 3 nhid 1" + log_test $? 0 "Dump all nexthop buckets in a group" + + (( $($IP -j nexthop bucket list id 101 | + jq '[.[] | select(.bucket.idle_time > 0 and + .bucket.idle_time < 2)] | length') == 4 )) + log_test $? 0 "All nexthop buckets report a positive near-zero idle time" + + check_nexthop_bucket "list dev veth1" \ + "id 101 index 2 nhid 1 id 101 index 3 nhid 1" + log_test $? 0 "Dump all nexthop buckets with a specific nexthop device" + + check_nexthop_bucket "list nhid 2" \ + "id 101 index 0 nhid 2 id 101 index 1 nhid 2" + log_test $? 0 "Dump all nexthop buckets with a specific nexthop identifier" + + run_cmd "$IP nexthop bucket list id 111" + log_test $? 2 "Dump all nexthop buckets in a non-existent group" + + run_cmd "$IP nexthop bucket list id 201" + log_test $? 2 "Dump all nexthop buckets in a non-resilient group" + + run_cmd "$IP nexthop bucket list dev bla" + log_test $? 255 "Dump all nexthop buckets using a non-existent device" + + run_cmd "$IP nexthop bucket list groups" + log_test $? 255 "Dump all nexthop buckets with invalid 'groups' keyword" + + run_cmd "$IP nexthop bucket list fdb" + log_test $? 255 "Dump all nexthop buckets with invalid 'fdb' keyword" + + # + # resilient nexthop buckets get requests + # + + check_nexthop_bucket "get id 101 index 0" "id 101 index 0 nhid 2" + log_test $? 0 "Get a valid nexthop bucket" + + run_cmd "$IP nexthop bucket get id 101 index 999" + log_test $? 2 "Get a nexthop bucket with valid group, but invalid index" + + run_cmd "$IP nexthop bucket get id 201 index 0" + log_test $? 2 "Get a nexthop bucket from a non-resilient group" + + run_cmd "$IP nexthop bucket get id 999 index 0" + log_test $? 2 "Get a nexthop bucket from a non-existent group" + + # + # tests for bucket migration + # + + $IP nexthop flush >/dev/null 2>&1 + + run_cmd "$IP nexthop add id 1 dev veth1" + run_cmd "$IP nexthop add id 2 dev veth3" + run_cmd "$IP nexthop add id 101 + group 1/2 type resilient buckets 10 + idle_timer 1 unbalanced_timer 20" + + check_nexthop_buckets_balance "list id 101" \ + "nhid 1" "== 5" \ + "nhid 2" "== 5" + log_test $? 0 "Initial bucket allocation" + + run_cmd "$IP nexthop replace id 101 + group 1,2/2,3 type resilient" + check_nexthop_buckets_balance "list id 101" \ + "nhid 1" "== 4" \ + "nhid 2" "== 6" + log_test $? 0 "Bucket allocation after replace" + + # Check that increase in idle timer does not make buckets appear busy. + run_cmd "$IP nexthop replace id 101 + group 1,2/2,3 type resilient + idle_timer 10" + run_cmd "$IP nexthop replace id 101 + group 1/2 type resilient" + check_nexthop_buckets_balance "list id 101" \ + "nhid 1" "== 5" \ + "nhid 2" "== 5" + log_test $? 0 "Buckets migrated after idle timer change" + + $IP nexthop flush >/dev/null 2>&1 +} + ################################################################################ # usage -- cgit v1.2.3 From 386e3792b52a4d815aefb30d8c049484dce7bdd2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 12 Mar 2021 17:50:24 +0100 Subject: selftests: forwarding: Add resilient hashing test Verify that IPv4 and IPv6 multipath forwarding works correctly with resilient nexthop groups and with different weights. Test that when the idle timer is not zero, the resilient groups are not rebalanced - because the nexthop buckets are considered active - and the initial weights (1:1) are used. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- .../net/forwarding/router_mpath_nh_res.sh | 400 +++++++++++++++++++++ 1 file changed, 400 insertions(+) create mode 100755 tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh new file mode 100755 index 000000000000..4898dd4118f1 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/router_mpath_nh_res.sh @@ -0,0 +1,400 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + multipath_test +" +NUM_NETIFS=8 +source lib.sh + +h1_create() +{ + vrf_create "vrf-h1" + ip link set dev $h1 master vrf-h1 + + ip link set dev vrf-h1 up + ip link set dev $h1 up + + ip address add 192.0.2.2/24 dev $h1 + ip address add 2001:db8:1::2/64 dev $h1 + + ip route add 198.51.100.0/24 vrf vrf-h1 nexthop via 192.0.2.1 + ip route add 2001:db8:2::/64 vrf vrf-h1 nexthop via 2001:db8:1::1 +} + +h1_destroy() +{ + ip route del 2001:db8:2::/64 vrf vrf-h1 + ip route del 198.51.100.0/24 vrf vrf-h1 + + ip address del 2001:db8:1::2/64 dev $h1 + ip address del 192.0.2.2/24 dev $h1 + + ip link set dev $h1 down + vrf_destroy "vrf-h1" +} + +h2_create() +{ + vrf_create "vrf-h2" + ip link set dev $h2 master vrf-h2 + + ip link set dev vrf-h2 up + ip link set dev $h2 up + + ip address add 198.51.100.2/24 dev $h2 + ip address add 2001:db8:2::2/64 dev $h2 + + ip route add 192.0.2.0/24 vrf vrf-h2 nexthop via 198.51.100.1 + ip route add 2001:db8:1::/64 vrf vrf-h2 nexthop via 2001:db8:2::1 +} + +h2_destroy() +{ + ip route del 2001:db8:1::/64 vrf vrf-h2 + ip route del 192.0.2.0/24 vrf vrf-h2 + + ip address del 2001:db8:2::2/64 dev $h2 + ip address del 198.51.100.2/24 dev $h2 + + ip link set dev $h2 down + vrf_destroy "vrf-h2" +} + +router1_create() +{ + vrf_create "vrf-r1" + ip link set dev $rp11 master vrf-r1 + ip link set dev $rp12 master vrf-r1 + ip link set dev $rp13 master vrf-r1 + + ip link set dev vrf-r1 up + ip link set dev $rp11 up + ip link set dev $rp12 up + ip link set dev $rp13 up + + ip address add 192.0.2.1/24 dev $rp11 + ip address add 2001:db8:1::1/64 dev $rp11 + + ip address add 169.254.2.12/24 dev $rp12 + ip address add fe80:2::12/64 dev $rp12 + + ip address add 169.254.3.13/24 dev $rp13 + ip address add fe80:3::13/64 dev $rp13 +} + +router1_destroy() +{ + ip route del 2001:db8:2::/64 vrf vrf-r1 + ip route del 198.51.100.0/24 vrf vrf-r1 + + ip address del fe80:3::13/64 dev $rp13 + ip address del 169.254.3.13/24 dev $rp13 + + ip address del fe80:2::12/64 dev $rp12 + ip address del 169.254.2.12/24 dev $rp12 + + ip address del 2001:db8:1::1/64 dev $rp11 + ip address del 192.0.2.1/24 dev $rp11 + + ip nexthop del id 103 + ip nexthop del id 101 + ip nexthop del id 102 + ip nexthop del id 106 + ip nexthop del id 104 + ip nexthop del id 105 + + ip link set dev $rp13 down + ip link set dev $rp12 down + ip link set dev $rp11 down + + vrf_destroy "vrf-r1" +} + +router2_create() +{ + vrf_create "vrf-r2" + ip link set dev $rp21 master vrf-r2 + ip link set dev $rp22 master vrf-r2 + ip link set dev $rp23 master vrf-r2 + + ip link set dev vrf-r2 up + ip link set dev $rp21 up + ip link set dev $rp22 up + ip link set dev $rp23 up + + ip address add 198.51.100.1/24 dev $rp21 + ip address add 2001:db8:2::1/64 dev $rp21 + + ip address add 169.254.2.22/24 dev $rp22 + ip address add fe80:2::22/64 dev $rp22 + + ip address add 169.254.3.23/24 dev $rp23 + ip address add fe80:3::23/64 dev $rp23 +} + +router2_destroy() +{ + ip route del 2001:db8:1::/64 vrf vrf-r2 + ip route del 192.0.2.0/24 vrf vrf-r2 + + ip address del fe80:3::23/64 dev $rp23 + ip address del 169.254.3.23/24 dev $rp23 + + ip address del fe80:2::22/64 dev $rp22 + ip address del 169.254.2.22/24 dev $rp22 + + ip address del 2001:db8:2::1/64 dev $rp21 + ip address del 198.51.100.1/24 dev $rp21 + + ip nexthop del id 201 + ip nexthop del id 202 + ip nexthop del id 204 + ip nexthop del id 205 + + ip link set dev $rp23 down + ip link set dev $rp22 down + ip link set dev $rp21 down + + vrf_destroy "vrf-r2" +} + +routing_nh_obj() +{ + ip nexthop add id 101 via 169.254.2.22 dev $rp12 + ip nexthop add id 102 via 169.254.3.23 dev $rp13 + ip nexthop add id 103 group 101/102 type resilient buckets 512 \ + idle_timer 0 + ip route add 198.51.100.0/24 vrf vrf-r1 nhid 103 + + ip nexthop add id 104 via fe80:2::22 dev $rp12 + ip nexthop add id 105 via fe80:3::23 dev $rp13 + ip nexthop add id 106 group 104/105 type resilient buckets 512 \ + idle_timer 0 + ip route add 2001:db8:2::/64 vrf vrf-r1 nhid 106 + + ip nexthop add id 201 via 169.254.2.12 dev $rp22 + ip nexthop add id 202 via 169.254.3.13 dev $rp23 + ip nexthop add id 203 group 201/202 type resilient buckets 512 \ + idle_timer 0 + ip route add 192.0.2.0/24 vrf vrf-r2 nhid 203 + + ip nexthop add id 204 via fe80:2::12 dev $rp22 + ip nexthop add id 205 via fe80:3::13 dev $rp23 + ip nexthop add id 206 group 204/205 type resilient buckets 512 \ + idle_timer 0 + ip route add 2001:db8:1::/64 vrf vrf-r2 nhid 206 +} + +multipath4_test() +{ + local desc="$1" + local weight_rp12=$2 + local weight_rp13=$3 + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 + local packets_rp12 packets_rp13 + + # Transmit multiple flows from h1 to h2 and make sure they are + # distributed between both multipath links (rp12 and rp13) + # according to the provided weights. + sysctl_set net.ipv4.fib_multipath_hash_policy 1 + + t0_rp12=$(link_stats_tx_packets_get $rp12) + t0_rp13=$(link_stats_tx_packets_get $rp13) + + ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \ + -d 1msec -t udp "sp=1024,dp=0-32768" + + t1_rp12=$(link_stats_tx_packets_get $rp12) + t1_rp13=$(link_stats_tx_packets_get $rp13) + + let "packets_rp12 = $t1_rp12 - $t0_rp12" + let "packets_rp13 = $t1_rp13 - $t0_rp13" + multipath_eval "$desc" $weight_rp12 $weight_rp13 $packets_rp12 $packets_rp13 + + # Restore settings. + sysctl_restore net.ipv4.fib_multipath_hash_policy +} + +multipath6_l4_test() +{ + local desc="$1" + local weight_rp12=$2 + local weight_rp13=$3 + local t0_rp12 t0_rp13 t1_rp12 t1_rp13 + local packets_rp12 packets_rp13 + + # Transmit multiple flows from h1 to h2 and make sure they are + # distributed between both multipath links (rp12 and rp13) + # according to the provided weights. + sysctl_set net.ipv6.fib_multipath_hash_policy 1 + + t0_rp12=$(link_stats_tx_packets_get $rp12) + t0_rp13=$(link_stats_tx_packets_get $rp13) + + $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \ + -d 1msec -t udp "sp=1024,dp=0-32768" + + t1_rp12=$(link_stats_tx_packets_get $rp12) + t1_rp13=$(link_stats_tx_packets_get $rp13) + + let "packets_rp12 = $t1_rp12 - $t0_rp12" + let "packets_rp13 = $t1_rp13 - $t0_rp13" + multipath_eval "$desc" $weight_rp12 $weight_rp13 $packets_rp12 $packets_rp13 + + sysctl_restore net.ipv6.fib_multipath_hash_policy +} + +multipath_test() +{ + # Without an idle timer, weight replacement should happen immediately. + log_info "Running multipath tests without an idle timer" + ip nexthop replace id 103 group 101/102 type resilient idle_timer 0 + ip nexthop replace id 106 group 104/105 type resilient idle_timer 0 + + log_info "Running IPv4 multipath tests" + ip nexthop replace id 103 group 101,1/102,1 type resilient + multipath4_test "ECMP" 1 1 + ip nexthop replace id 103 group 101,2/102,1 type resilient + multipath4_test "Weighted MP 2:1" 2 1 + ip nexthop replace id 103 group 101,11/102,45 type resilient + multipath4_test "Weighted MP 11:45" 11 45 + + ip nexthop replace id 103 group 101,1/102,1 type resilient + + log_info "Running IPv6 L4 hash multipath tests" + ip nexthop replace id 106 group 104,1/105,1 type resilient + multipath6_l4_test "ECMP" 1 1 + ip nexthop replace id 106 group 104,2/105,1 type resilient + multipath6_l4_test "Weighted MP 2:1" 2 1 + ip nexthop replace id 106 group 104,11/105,45 type resilient + multipath6_l4_test "Weighted MP 11:45" 11 45 + + ip nexthop replace id 106 group 104,1/105,1 type resilient + + # With an idle timer, weight replacement should not happen, so the + # expected ratio should always be the initial one (1:1). + log_info "Running multipath tests with an idle timer of 120 seconds" + ip nexthop replace id 103 group 101/102 type resilient idle_timer 120 + ip nexthop replace id 106 group 104/105 type resilient idle_timer 120 + + log_info "Running IPv4 multipath tests" + ip nexthop replace id 103 group 101,1/102,1 type resilient + multipath4_test "ECMP" 1 1 + ip nexthop replace id 103 group 101,2/102,1 type resilient + multipath4_test "Weighted MP 2:1" 1 1 + ip nexthop replace id 103 group 101,11/102,45 type resilient + multipath4_test "Weighted MP 11:45" 1 1 + + ip nexthop replace id 103 group 101,1/102,1 type resilient + + log_info "Running IPv6 L4 hash multipath tests" + ip nexthop replace id 106 group 104,1/105,1 type resilient + multipath6_l4_test "ECMP" 1 1 + ip nexthop replace id 106 group 104,2/105,1 type resilient + multipath6_l4_test "Weighted MP 2:1" 1 1 + ip nexthop replace id 106 group 104,11/105,45 type resilient + multipath6_l4_test "Weighted MP 11:45" 1 1 + + ip nexthop replace id 106 group 104,1/105,1 type resilient + + # With a short idle timer and enough idle time, weight replacement + # should happen. + log_info "Running multipath tests with an idle timer of 5 seconds" + ip nexthop replace id 103 group 101/102 type resilient idle_timer 5 + ip nexthop replace id 106 group 104/105 type resilient idle_timer 5 + + log_info "Running IPv4 multipath tests" + sleep 10 + ip nexthop replace id 103 group 101,1/102,1 type resilient + multipath4_test "ECMP" 1 1 + sleep 10 + ip nexthop replace id 103 group 101,2/102,1 type resilient + multipath4_test "Weighted MP 2:1" 2 1 + sleep 10 + ip nexthop replace id 103 group 101,11/102,45 type resilient + multipath4_test "Weighted MP 11:45" 11 45 + + ip nexthop replace id 103 group 101,1/102,1 type resilient + + log_info "Running IPv6 L4 hash multipath tests" + sleep 10 + ip nexthop replace id 106 group 104,1/105,1 type resilient + multipath6_l4_test "ECMP" 1 1 + sleep 10 + ip nexthop replace id 106 group 104,2/105,1 type resilient + multipath6_l4_test "Weighted MP 2:1" 2 1 + sleep 10 + ip nexthop replace id 106 group 104,11/105,45 type resilient + multipath6_l4_test "Weighted MP 11:45" 11 45 + + ip nexthop replace id 106 group 104,1/105,1 type resilient +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp11=${NETIFS[p2]} + + rp12=${NETIFS[p3]} + rp22=${NETIFS[p4]} + + rp13=${NETIFS[p5]} + rp23=${NETIFS[p6]} + + rp21=${NETIFS[p7]} + h2=${NETIFS[p8]} + + vrf_prepare + + h1_create + h2_create + + router1_create + router2_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + router2_destroy + router1_destroy + + h2_destroy + h1_destroy + + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 198.51.100.2 +} + +ping_ipv6() +{ + ping6_test $h1 2001:db8:2::2 +} + +ip nexthop ls >/dev/null 2>&1 +if [ $? -ne 0 ]; then + echo "Nexthop objects not supported; skipping tests" + exit 0 +fi + +trap cleanup EXIT + +setup_prepare +setup_wait +routing_nh_obj + +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From 902280cacc0367dcaa8be8261e02ead932a1488d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 12 Mar 2021 17:50:25 +0100 Subject: selftests: forwarding: Add resilient multipath tunneling nexthop test Add a resilient nexthop objects version of gre_multipath_nh.sh. Test that both IPv4 and IPv6 overlays work with resilient nexthop groups where the nexthops are two GRE tunnels. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- .../net/forwarding/gre_multipath_nh_res.sh | 361 +++++++++++++++++++++ 1 file changed, 361 insertions(+) create mode 100755 tools/testing/selftests/net/forwarding/gre_multipath_nh_res.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/gre_multipath_nh_res.sh b/tools/testing/selftests/net/forwarding/gre_multipath_nh_res.sh new file mode 100755 index 000000000000..088b65e64d66 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/gre_multipath_nh_res.sh @@ -0,0 +1,361 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test traffic distribution when a wECMP route forwards traffic to two GRE +# tunnels. +# +# +-------------------------+ +# | H1 | +# | $h1 + | +# | 192.0.2.1/28 | | +# | 2001:db8:1::1/64 | | +# +-------------------|-----+ +# | +# +-------------------|------------------------+ +# | SW1 | | +# | $ol1 + | +# | 192.0.2.2/28 | +# | 2001:db8:1::2/64 | +# | | +# | + g1a (gre) + g1b (gre) | +# | loc=192.0.2.65 loc=192.0.2.81 | +# | rem=192.0.2.66 --. rem=192.0.2.82 --. | +# | tos=inherit | tos=inherit | | +# | .------------------' | | +# | | .------------------' | +# | v v | +# | + $ul1.111 (vlan) + $ul1.222 (vlan) | +# | | 192.0.2.129/28 | 192.0.2.145/28 | +# | \ / | +# | \________________/ | +# | | | +# | + $ul1 | +# +------------|-------------------------------+ +# | +# +------------|-------------------------------+ +# | SW2 + $ul2 | +# | _______|________ | +# | / \ | +# | / \ | +# | + $ul2.111 (vlan) + $ul2.222 (vlan) | +# | ^ 192.0.2.130/28 ^ 192.0.2.146/28 | +# | | | | +# | | '------------------. | +# | '------------------. | | +# | + g2a (gre) | + g2b (gre) | | +# | loc=192.0.2.66 | loc=192.0.2.82 | | +# | rem=192.0.2.65 --' rem=192.0.2.81 --' | +# | tos=inherit tos=inherit | +# | | +# | $ol2 + | +# | 192.0.2.17/28 | | +# | 2001:db8:2::1/64 | | +# +-------------------|------------------------+ +# | +# +-------------------|-----+ +# | H2 | | +# | $h2 + | +# | 192.0.2.18/28 | +# | 2001:db8:2::2/64 | +# +-------------------------+ + +ALL_TESTS=" + ping_ipv4 + ping_ipv6 + multipath_ipv4 + multipath_ipv6 + multipath_ipv6_l4 +" + +NUM_NETIFS=6 +source lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64 + ip route add vrf v$h1 192.0.2.16/28 via 192.0.2.2 + ip route add vrf v$h1 2001:db8:2::/64 via 2001:db8:1::2 +} + +h1_destroy() +{ + ip route del vrf v$h1 2001:db8:2::/64 via 2001:db8:1::2 + ip route del vrf v$h1 192.0.2.16/28 via 192.0.2.2 + simple_if_fini $h1 192.0.2.1/28 +} + +sw1_create() +{ + simple_if_init $ol1 192.0.2.2/28 2001:db8:1::2/64 + __simple_if_init $ul1 v$ol1 + vlan_create $ul1 111 v$ol1 192.0.2.129/28 + vlan_create $ul1 222 v$ol1 192.0.2.145/28 + + tunnel_create g1a gre 192.0.2.65 192.0.2.66 tos inherit dev v$ol1 + __simple_if_init g1a v$ol1 192.0.2.65/32 + ip route add vrf v$ol1 192.0.2.66/32 via 192.0.2.130 + + tunnel_create g1b gre 192.0.2.81 192.0.2.82 tos inherit dev v$ol1 + __simple_if_init g1b v$ol1 192.0.2.81/32 + ip route add vrf v$ol1 192.0.2.82/32 via 192.0.2.146 + + ip -6 nexthop add id 101 dev g1a + ip -6 nexthop add id 102 dev g1b + ip nexthop add id 103 group 101/102 type resilient buckets 512 \ + idle_timer 0 + + ip route add vrf v$ol1 192.0.2.16/28 nhid 103 + ip route add vrf v$ol1 2001:db8:2::/64 nhid 103 +} + +sw1_destroy() +{ + ip route del vrf v$ol1 2001:db8:2::/64 + ip route del vrf v$ol1 192.0.2.16/28 + + ip nexthop del id 103 + ip -6 nexthop del id 102 + ip -6 nexthop del id 101 + + ip route del vrf v$ol1 192.0.2.82/32 via 192.0.2.146 + __simple_if_fini g1b 192.0.2.81/32 + tunnel_destroy g1b + + ip route del vrf v$ol1 192.0.2.66/32 via 192.0.2.130 + __simple_if_fini g1a 192.0.2.65/32 + tunnel_destroy g1a + + vlan_destroy $ul1 222 + vlan_destroy $ul1 111 + __simple_if_fini $ul1 + simple_if_fini $ol1 192.0.2.2/28 2001:db8:1::2/64 +} + +sw2_create() +{ + simple_if_init $ol2 192.0.2.17/28 2001:db8:2::1/64 + __simple_if_init $ul2 v$ol2 + vlan_create $ul2 111 v$ol2 192.0.2.130/28 + vlan_create $ul2 222 v$ol2 192.0.2.146/28 + + tunnel_create g2a gre 192.0.2.66 192.0.2.65 tos inherit dev v$ol2 + __simple_if_init g2a v$ol2 192.0.2.66/32 + ip route add vrf v$ol2 192.0.2.65/32 via 192.0.2.129 + + tunnel_create g2b gre 192.0.2.82 192.0.2.81 tos inherit dev v$ol2 + __simple_if_init g2b v$ol2 192.0.2.82/32 + ip route add vrf v$ol2 192.0.2.81/32 via 192.0.2.145 + + ip -6 nexthop add id 201 dev g2a + ip -6 nexthop add id 202 dev g2b + ip nexthop add id 203 group 201/202 type resilient buckets 512 \ + idle_timer 0 + + ip route add vrf v$ol2 192.0.2.0/28 nhid 203 + ip route add vrf v$ol2 2001:db8:1::/64 nhid 203 + + tc qdisc add dev $ul2 clsact + tc filter add dev $ul2 ingress pref 111 prot 802.1Q \ + flower vlan_id 111 action pass + tc filter add dev $ul2 ingress pref 222 prot 802.1Q \ + flower vlan_id 222 action pass +} + +sw2_destroy() +{ + tc qdisc del dev $ul2 clsact + + ip route del vrf v$ol2 2001:db8:1::/64 + ip route del vrf v$ol2 192.0.2.0/28 + + ip nexthop del id 203 + ip -6 nexthop del id 202 + ip -6 nexthop del id 201 + + ip route del vrf v$ol2 192.0.2.81/32 via 192.0.2.145 + __simple_if_fini g2b 192.0.2.82/32 + tunnel_destroy g2b + + ip route del vrf v$ol2 192.0.2.65/32 via 192.0.2.129 + __simple_if_fini g2a 192.0.2.66/32 + tunnel_destroy g2a + + vlan_destroy $ul2 222 + vlan_destroy $ul2 111 + __simple_if_fini $ul2 + simple_if_fini $ol2 192.0.2.17/28 2001:db8:2::1/64 +} + +h2_create() +{ + simple_if_init $h2 192.0.2.18/28 2001:db8:2::2/64 + ip route add vrf v$h2 192.0.2.0/28 via 192.0.2.17 + ip route add vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1 +} + +h2_destroy() +{ + ip route del vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1 + ip route del vrf v$h2 192.0.2.0/28 via 192.0.2.17 + simple_if_fini $h2 192.0.2.18/28 2001:db8:2::2/64 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + ol1=${NETIFS[p2]} + + ul1=${NETIFS[p3]} + ul2=${NETIFS[p4]} + + ol2=${NETIFS[p5]} + h2=${NETIFS[p6]} + + vrf_prepare + h1_create + sw1_create + sw2_create + h2_create + + forwarding_enable +} + +cleanup() +{ + pre_cleanup + + forwarding_restore + + h2_destroy + sw2_destroy + sw1_destroy + h1_destroy + vrf_cleanup +} + +multipath4_test() +{ + local what=$1; shift + local weight1=$1; shift + local weight2=$1; shift + + sysctl_set net.ipv4.fib_multipath_hash_policy 1 + ip nexthop replace id 103 group 101,$weight1/102,$weight2 \ + type resilient + + local t0_111=$(tc_rule_stats_get $ul2 111 ingress) + local t0_222=$(tc_rule_stats_get $ul2 222 ingress) + + ip vrf exec v$h1 \ + $MZ $h1 -q -p 64 -A 192.0.2.1 -B 192.0.2.18 \ + -d 1msec -t udp "sp=1024,dp=0-32768" + + local t1_111=$(tc_rule_stats_get $ul2 111 ingress) + local t1_222=$(tc_rule_stats_get $ul2 222 ingress) + + local d111=$((t1_111 - t0_111)) + local d222=$((t1_222 - t0_222)) + multipath_eval "$what" $weight1 $weight2 $d111 $d222 + + ip nexthop replace id 103 group 101/102 type resilient + sysctl_restore net.ipv4.fib_multipath_hash_policy +} + +multipath6_test() +{ + local what=$1; shift + local weight1=$1; shift + local weight2=$1; shift + + sysctl_set net.ipv6.fib_multipath_hash_policy 0 + ip nexthop replace id 103 group 101,$weight1/102,$weight2 \ + type resilient + + local t0_111=$(tc_rule_stats_get $ul2 111 ingress) + local t0_222=$(tc_rule_stats_get $ul2 222 ingress) + + # Generate 16384 echo requests, each with a random flow label. + for ((i=0; i < 16384; ++i)); do + ip vrf exec v$h1 $PING6 2001:db8:2::2 -F 0 -c 1 -q &> /dev/null + done + + local t1_111=$(tc_rule_stats_get $ul2 111 ingress) + local t1_222=$(tc_rule_stats_get $ul2 222 ingress) + + local d111=$((t1_111 - t0_111)) + local d222=$((t1_222 - t0_222)) + multipath_eval "$what" $weight1 $weight2 $d111 $d222 + + ip nexthop replace id 103 group 101/102 type resilient + sysctl_restore net.ipv6.fib_multipath_hash_policy +} + +multipath6_l4_test() +{ + local what=$1; shift + local weight1=$1; shift + local weight2=$1; shift + + sysctl_set net.ipv6.fib_multipath_hash_policy 1 + ip nexthop replace id 103 group 101,$weight1/102,$weight2 \ + type resilient + + local t0_111=$(tc_rule_stats_get $ul2 111 ingress) + local t0_222=$(tc_rule_stats_get $ul2 222 ingress) + + ip vrf exec v$h1 \ + $MZ $h1 -6 -q -p 64 -A 2001:db8:1::1 -B 2001:db8:2::2 \ + -d 1msec -t udp "sp=1024,dp=0-32768" + + local t1_111=$(tc_rule_stats_get $ul2 111 ingress) + local t1_222=$(tc_rule_stats_get $ul2 222 ingress) + + local d111=$((t1_111 - t0_111)) + local d222=$((t1_222 - t0_222)) + multipath_eval "$what" $weight1 $weight2 $d111 $d222 + + ip nexthop replace id 103 group 101/102 type resilient + sysctl_restore net.ipv6.fib_multipath_hash_policy +} + +ping_ipv4() +{ + ping_test $h1 192.0.2.18 +} + +ping_ipv6() +{ + ping6_test $h1 2001:db8:2::2 +} + +multipath_ipv4() +{ + log_info "Running IPv4 multipath tests" + multipath4_test "ECMP" 1 1 + multipath4_test "Weighted MP 2:1" 2 1 + multipath4_test "Weighted MP 11:45" 11 45 +} + +multipath_ipv6() +{ + log_info "Running IPv6 multipath tests" + multipath6_test "ECMP" 1 1 + multipath6_test "Weighted MP 2:1" 2 1 + multipath6_test "Weighted MP 11:45" 11 45 +} + +multipath_ipv6_l4() +{ + log_info "Running IPv6 L4 hash multipath tests" + multipath6_l4_test "ECMP" 1 1 + multipath6_l4_test "Weighted MP 2:1" 2 1 + multipath6_l4_test "Weighted MP 11:45" 11 45 +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From b8a07c4cea04c14008f34880424852d38ae03758 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 12 Mar 2021 17:50:26 +0100 Subject: selftests: netdevsim: Add test for resilient nexthop groups offload API Test various aspects of the resilient nexthop group offload API on top of the netdevsim implementation. Both good and bad flows are tested. Signed-off-by: Ido Schimmel Co-developed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- .../selftests/drivers/net/netdevsim/nexthop.sh | 620 +++++++++++++++++++++ 1 file changed, 620 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh index be0c1b5ee6b8..ba75c81cda91 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/nexthop.sh @@ -11,14 +11,33 @@ ALL_TESTS=" nexthop_single_add_err_test nexthop_group_add_test nexthop_group_add_err_test + nexthop_res_group_add_test + nexthop_res_group_add_err_test nexthop_group_replace_test nexthop_group_replace_err_test + nexthop_res_group_replace_test + nexthop_res_group_replace_err_test + nexthop_res_group_idle_timer_test + nexthop_res_group_idle_timer_del_test + nexthop_res_group_increase_idle_timer_test + nexthop_res_group_decrease_idle_timer_test + nexthop_res_group_unbalanced_timer_test + nexthop_res_group_unbalanced_timer_del_test + nexthop_res_group_no_unbalanced_timer_test + nexthop_res_group_short_unbalanced_timer_test + nexthop_res_group_increase_unbalanced_timer_test + nexthop_res_group_decrease_unbalanced_timer_test + nexthop_res_group_force_migrate_busy_test nexthop_single_replace_test nexthop_single_replace_err_test nexthop_single_in_group_replace_test nexthop_single_in_group_replace_err_test + nexthop_single_in_res_group_replace_test + nexthop_single_in_res_group_replace_err_test nexthop_single_in_group_delete_test nexthop_single_in_group_delete_err_test + nexthop_single_in_res_group_delete_test + nexthop_single_in_res_group_delete_err_test nexthop_replay_test nexthop_replay_err_test " @@ -27,6 +46,7 @@ DEV_ADDR=1337 DEV=netdevsim${DEV_ADDR} DEVLINK_DEV=netdevsim/${DEV} SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/ +DEBUGFS_NET_DIR=/sys/kernel/debug/netdevsim/$DEV/ NUM_NETIFS=0 source $lib_dir/lib.sh source $lib_dir/devlink_lib.sh @@ -44,6 +64,28 @@ nexthop_check() return 0 } +nexthop_bucket_nhid_count_check() +{ + local group_id=$1; shift + local expected + local count + local nhid + local ret + + while (($# > 0)); do + nhid=$1; shift + expected=$1; shift + + count=$($IP nexthop bucket show id $group_id nhid $nhid | + grep "trap" | wc -l) + if ((expected != count)); then + return 1 + fi + done + + return 0 +} + nexthop_resource_check() { local expected_occ=$1; shift @@ -159,6 +201,71 @@ nexthop_group_add_err_test() nexthop_resource_set 9999 } +nexthop_res_group_add_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + $IP nexthop add id 10 group 1/2 type resilient buckets 4 + nexthop_check "id 10" "id 10 group 1/2 type resilient buckets 4 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_bucket_nhid_count_check 10 1 2 + check_err $? "Wrong nexthop buckets count" + nexthop_bucket_nhid_count_check 10 2 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 6 + check_err $? "Wrong nexthop occupancy" + + $IP nexthop del id 10 + nexthop_resource_check 2 + check_err $? "Wrong nexthop occupancy after delete" + + $IP nexthop add id 10 group 1,3/2,2 type resilient buckets 5 + nexthop_check "id 10" "id 10 group 1,3/2,2 type resilient buckets 5 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected weighted nexthop group entry" + + nexthop_bucket_nhid_count_check 10 1 3 + check_err $? "Wrong nexthop buckets count" + nexthop_bucket_nhid_count_check 10 2 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 7 + check_err $? "Wrong weighted nexthop occupancy" + + $IP nexthop del id 10 + nexthop_resource_check 2 + check_err $? "Wrong nexthop occupancy after delete" + + log_test "Resilient nexthop group add and delete" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_add_err_test() +{ + RET=0 + + nexthop_resource_set 2 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + $IP nexthop add id 10 group 1/2 type resilient buckets 4 &> /dev/null + check_fail $? "Nexthop group addition succeeded when should fail" + + nexthop_resource_check 2 + check_err $? "Wrong nexthop occupancy" + + log_test "Resilient nexthop group add failure" + + $IP nexthop flush &> /dev/null + nexthop_resource_set 9999 +} + nexthop_group_replace_test() { RET=0 @@ -206,6 +313,411 @@ nexthop_group_replace_err_test() nexthop_resource_set 9999 } +nexthop_res_group_replace_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.4 dev dummy1 + $IP nexthop add id 10 group 1/2 type resilient buckets 6 + + $IP nexthop replace id 10 group 1/2/3 type resilient + nexthop_check "id 10" "id 10 group 1/2/3 type resilient buckets 6 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_bucket_nhid_count_check 10 1 2 + check_err $? "Wrong nexthop buckets count" + nexthop_bucket_nhid_count_check 10 2 2 + check_err $? "Wrong nexthop buckets count" + nexthop_bucket_nhid_count_check 10 3 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 9 + check_err $? "Wrong nexthop occupancy" + + log_test "Resilient nexthop group replace" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_replace_err_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.4 dev dummy1 + $IP nexthop add id 10 group 1/2 type resilient buckets 6 + + ip netns exec testns1 \ + echo 1 > $DEBUGFS_NET_DIR/fib/fail_res_nexthop_group_replace + $IP nexthop replace id 10 group 1/2/3 type resilient &> /dev/null + check_fail $? "Nexthop group replacement succeeded when should fail" + + nexthop_check "id 10" "id 10 group 1/2 type resilient buckets 6 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry after failure" + + nexthop_bucket_nhid_count_check 10 1 3 + check_err $? "Wrong nexthop buckets count" + nexthop_bucket_nhid_count_check 10 2 3 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 9 + check_err $? "Wrong nexthop occupancy after failure" + + log_test "Resilient nexthop group replace failure" + + $IP nexthop flush &> /dev/null + ip netns exec testns1 \ + echo 0 > $DEBUGFS_NET_DIR/fib/fail_res_nexthop_group_replace +} + +nexthop_res_mark_buckets_busy() +{ + local group_id=$1; shift + local nhid=$1; shift + local count=$1; shift + local index + + for index in $($IP -j nexthop bucket show id $group_id nhid $nhid | + jq '.[].bucket.index' | head -n ${count:--0}) + do + echo $group_id $index \ + > $DEBUGFS_NET_DIR/fib/nexthop_bucket_activity + done +} + +nexthop_res_num_nhid_buckets() +{ + local group_id=$1; shift + local nhid=$1; shift + + $IP -j nexthop bucket show id $group_id nhid $nhid | jq length +} + +nexthop_res_group_idle_timer_test() +{ + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient buckets 8 idle_timer 4 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + nexthop_bucket_nhid_count_check 10 1 4 2 4 + check_err $? "Group expected to be unbalanced" + + sleep 6 + + nexthop_bucket_nhid_count_check 10 1 2 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after idle timer" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_idle_timer_del_test() +{ + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1,50/2,50/3,1 \ + type resilient buckets 8 idle_timer 6 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1,50/2,150/3,1 type resilient + + nexthop_bucket_nhid_count_check 10 1 4 2 4 3 0 + check_err $? "Group expected to be unbalanced" + + sleep 4 + + # Deletion prompts group replacement. Check that the bucket timers + # are kept. + $IP nexthop delete id 3 + + nexthop_bucket_nhid_count_check 10 1 4 2 4 + check_err $? "Group expected to still be unbalanced" + + sleep 4 + + nexthop_bucket_nhid_count_check 10 1 2 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after idle timer (with delete)" + + $IP nexthop flush &> /dev/null +} + +__nexthop_res_group_increase_timer_test() +{ + local timer=$1; shift + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient buckets 8 $timer 4 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group expected to be unbalanced" + + sleep 2 + $IP nexthop replace id 10 group 1/2,3 type resilient $timer 8 + sleep 4 + + # 6 seconds, past the original timer. + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group still expected to be unbalanced" + + sleep 4 + + # 10 seconds, past the new timer. + nexthop_bucket_nhid_count_check 10 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after $timer increase" + + $IP nexthop flush &> /dev/null +} + +__nexthop_res_group_decrease_timer_test() +{ + local timer=$1; shift + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient buckets 8 $timer 8 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group expected to be unbalanced" + + sleep 2 + $IP nexthop replace id 10 group 1/2,3 type resilient $timer 4 + sleep 4 + + # 6 seconds, past the new timer, before the old timer. + nexthop_bucket_nhid_count_check 10 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after $timer decrease" + + $IP nexthop flush &> /dev/null +} + +__nexthop_res_group_increase_timer_del_test() +{ + local timer=$1; shift + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1,100/2,100/3,1 \ + type resilient buckets 8 $timer 4 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1,100/2,300/3,1 type resilient + + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group expected to be unbalanced" + + sleep 2 + $IP nexthop replace id 10 group 1/2,3 type resilient $timer 8 + sleep 4 + + # 6 seconds, past the original timer. + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group still expected to be unbalanced" + + sleep 4 + + # 10 seconds, past the new timer. + nexthop_bucket_nhid_count_check 10 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after $timer increase" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_increase_idle_timer_test() +{ + __nexthop_res_group_increase_timer_test idle_timer +} + +nexthop_res_group_decrease_idle_timer_test() +{ + __nexthop_res_group_decrease_timer_test idle_timer +} + +nexthop_res_group_unbalanced_timer_test() +{ + local i + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient \ + buckets 8 idle_timer 6 unbalanced_timer 10 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + for i in 1 2; do + sleep 4 + nexthop_bucket_nhid_count_check 10 1 4 2 4 + check_err $? "$i: Group expected to be unbalanced" + nexthop_res_mark_buckets_busy 10 1 + done + + # 3 x sleep 4 > unbalanced timer 10 + sleep 4 + nexthop_bucket_nhid_count_check 10 1 2 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after unbalanced timer" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_unbalanced_timer_del_test() +{ + local i + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1,50/2,50/3,1 type resilient \ + buckets 8 idle_timer 6 unbalanced_timer 10 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1,50/2,150/3,1 type resilient + + # Check that NH delete does not reset unbalanced time. + sleep 4 + $IP nexthop delete id 3 + nexthop_bucket_nhid_count_check 10 1 4 2 4 + check_err $? "1: Group expected to be unbalanced" + nexthop_res_mark_buckets_busy 10 1 + + sleep 4 + nexthop_bucket_nhid_count_check 10 1 4 2 4 + check_err $? "2: Group expected to be unbalanced" + nexthop_res_mark_buckets_busy 10 1 + + # 3 x sleep 4 > unbalanced timer 10 + sleep 4 + nexthop_bucket_nhid_count_check 10 1 2 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after unbalanced timer (with delete)" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_no_unbalanced_timer_test() +{ + local i + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient buckets 8 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + for i in $(seq 3); do + sleep 60 + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "$i: Group expected to be unbalanced" + nexthop_res_mark_buckets_busy 10 1 + done + + log_test "Buckets never force-migrated without unbalanced timer" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_short_unbalanced_timer_test() +{ + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient \ + buckets 8 idle_timer 120 unbalanced_timer 4 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group expected to be unbalanced" + + sleep 5 + + nexthop_bucket_nhid_count_check 10 2 6 + check_err $? "Group expected to be balanced" + + log_test "Bucket migration after unbalanced < idle timer" + + $IP nexthop flush &> /dev/null +} + +nexthop_res_group_increase_unbalanced_timer_test() +{ + __nexthop_res_group_increase_timer_test unbalanced_timer +} + +nexthop_res_group_decrease_unbalanced_timer_test() +{ + __nexthop_res_group_decrease_timer_test unbalanced_timer +} + +nexthop_res_group_force_migrate_busy_test() +{ + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + + RET=0 + + $IP nexthop add id 10 group 1/2 type resilient \ + buckets 8 idle_timer 120 + nexthop_res_mark_buckets_busy 10 1 + $IP nexthop replace id 10 group 1/2,3 type resilient + + nexthop_bucket_nhid_count_check 10 2 6 + check_fail $? "Group expected to be unbalanced" + + $IP nexthop replace id 10 group 2 type resilient + nexthop_bucket_nhid_count_check 10 2 8 + check_err $? "All buckets expected to have migrated" + + log_test "Busy buckets force-migrated when NH removed" + + $IP nexthop flush &> /dev/null +} + nexthop_single_replace_test() { RET=0 @@ -299,6 +811,63 @@ nexthop_single_in_group_replace_err_test() nexthop_resource_set 9999 } +nexthop_single_in_res_group_replace_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 10 group 1/2 type resilient buckets 4 + + $IP nexthop replace id 1 via 192.0.2.4 dev dummy1 + check_err $? "Failed to replace nexthop when should not" + + nexthop_check "id 10" "id 10 group 1/2 type resilient buckets 4 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_bucket_nhid_count_check 10 1 2 2 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 6 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop replace while in resilient group" + + $IP nexthop flush &> /dev/null +} + +nexthop_single_in_res_group_replace_err_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 10 group 1/2 type resilient buckets 4 + + ip netns exec testns1 \ + echo 1 > $DEBUGFS_NET_DIR/fib/fail_nexthop_bucket_replace + $IP nexthop replace id 1 via 192.0.2.4 dev dummy1 &> /dev/null + check_fail $? "Nexthop replacement succeeded when should fail" + + nexthop_check "id 1" "id 1 via 192.0.2.2 dev dummy1 scope link trap" + check_err $? "Unexpected nexthop entry after failure" + + nexthop_check "id 10" "id 10 group 1/2 type resilient buckets 4 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry after failure" + + nexthop_bucket_nhid_count_check 10 1 2 2 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 6 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop replace while in resilient group failure" + + $IP nexthop flush &> /dev/null + ip netns exec testns1 \ + echo 0 > $DEBUGFS_NET_DIR/fib/fail_nexthop_bucket_replace +} + nexthop_single_in_group_delete_test() { RET=0 @@ -346,6 +915,57 @@ nexthop_single_in_group_delete_err_test() nexthop_resource_set 9999 } +nexthop_single_in_res_group_delete_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 10 group 1/2 type resilient buckets 4 + + $IP nexthop del id 1 + nexthop_check "id 10" "id 10 group 2 type resilient buckets 4 idle_timer 120 unbalanced_timer 0 unbalanced_time 0 trap" + check_err $? "Unexpected nexthop group entry" + + nexthop_bucket_nhid_count_check 10 2 4 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 5 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop delete while in resilient group" + + $IP nexthop flush &> /dev/null +} + +nexthop_single_in_res_group_delete_err_test() +{ + RET=0 + + $IP nexthop add id 1 via 192.0.2.2 dev dummy1 + $IP nexthop add id 2 via 192.0.2.3 dev dummy1 + $IP nexthop add id 3 via 192.0.2.4 dev dummy1 + $IP nexthop add id 10 group 1/2/3 type resilient buckets 6 + + ip netns exec testns1 \ + echo 1 > $DEBUGFS_NET_DIR/fib/fail_nexthop_bucket_replace + $IP nexthop del id 1 + + # We failed to replace the two nexthop buckets that were originally + # assigned to nhid 1. + nexthop_bucket_nhid_count_check 10 2 2 3 2 + check_err $? "Wrong nexthop buckets count" + + nexthop_resource_check 8 + check_err $? "Wrong nexthop occupancy" + + log_test "Single nexthop delete while in resilient group failure" + + $IP nexthop flush &> /dev/null + ip netns exec testns1 \ + echo 0 > $DEBUGFS_NET_DIR/fib/fail_nexthop_bucket_replace +} + nexthop_replay_test() { RET=0 -- cgit v1.2.3 From 7028ba8ac9683b7e4f4db4db63a306d7497e7150 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 12 Mar 2021 17:16:19 -0800 Subject: selftests: mptcp: add invert argument for chk_rm_nr Some of the removing testcases used two zeros as arguments for chk_rm_nr like this: chk_rm_nr 0 0. This doesn't mean that no RM_ADDR has been sent. It only means that RM_ADDR had been sent in the opposite direction that chk_rm_nr is checking. This patch added a new argument invert for chk_rm_nr to allow it can check the RM_ADDR from the opposite direction. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 964db9ed544f..15b71ddee615 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -610,11 +610,22 @@ chk_rm_nr() { local rm_addr_nr=$1 local rm_subflow_nr=$2 + local invert=${3:-""} local count local dump_stats + local addr_ns + local subflow_ns + + if [ -z $invert ]; then + addr_ns=$ns1 + subflow_ns=$ns2 + elif [ $invert = "invert" ]; then + addr_ns=$ns2 + subflow_ns=$ns1 + fi printf "%-39s %s" " " "rm " - count=`ip netns exec $ns1 nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'` + count=`ip netns exec $addr_ns nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'` [ -z "$count" ] && count=0 if [ "$count" != "$rm_addr_nr" ]; then echo "[fail] got $count RM_ADDR[s] expected $rm_addr_nr" @@ -625,7 +636,7 @@ chk_rm_nr() fi echo -n " - sf " - count=`ip netns exec $ns2 nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'` + count=`ip netns exec $subflow_ns nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'` [ -z "$count" ] && count=0 if [ "$count" != "$rm_subflow_nr" ]; then echo "[fail] got $count RM_SUBFLOW[s] expected $rm_subflow_nr" @@ -833,7 +844,7 @@ remove_tests() run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow chk_join_nr "remove single address" 1 1 1 chk_add_nr 1 1 - chk_rm_nr 0 0 + chk_rm_nr 1 1 invert # subflow and signal, remove reset @@ -945,7 +956,7 @@ ipv6_tests() run_tests $ns1 $ns2 dead:beef:1::1 0 -1 0 slow chk_join_nr "remove single address IPv6" 1 1 1 chk_add_nr 1 1 - chk_rm_nr 0 0 + chk_rm_nr 1 1 invert # subflow and signal IPv6, remove reset @@ -1088,7 +1099,7 @@ add_addr_ports_tests() run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow chk_join_nr "remove single address with port" 1 1 1 chk_add_nr 1 1 1 - chk_rm_nr 0 0 + chk_rm_nr 1 1 invert # subflow and signal with port, remove reset -- cgit v1.2.3 From f87744ad42446c8510296f11fdc73f6e6f1376cc Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 12 Mar 2021 17:16:20 -0800 Subject: selftests: mptcp: set addr id for removing testcases The removing testcases can only delete the addresses from id 1, this patch added the support for deleting the addresses from any id that user set. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 38 ++++++++++++++++--------- 1 file changed, 24 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 15b71ddee615..6782a891b3e7 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -284,14 +284,19 @@ do_transfer() let rm_nr_ns1=-addr_nr_ns1 if [ $rm_nr_ns1 -lt 8 ]; then counter=1 - sleep 1 - - while [ $counter -le $rm_nr_ns1 ] - do - ip netns exec ${listener_ns} ./pm_nl_ctl del $counter + dump=(`ip netns exec ${listener_ns} ./pm_nl_ctl dump`) + if [ ${#dump[@]} -gt 0 ]; then + id=${dump[1]} sleep 1 - let counter+=1 - done + + while [ $counter -le $rm_nr_ns1 ] + do + ip netns exec ${listener_ns} ./pm_nl_ctl del $id + sleep 1 + let counter+=1 + let id+=1 + done + fi else sleep 1 ip netns exec ${listener_ns} ./pm_nl_ctl flush @@ -318,14 +323,19 @@ do_transfer() let rm_nr_ns2=-addr_nr_ns2 if [ $rm_nr_ns2 -lt 8 ]; then counter=1 - sleep 1 - - while [ $counter -le $rm_nr_ns2 ] - do - ip netns exec ${connector_ns} ./pm_nl_ctl del $counter + dump=(`ip netns exec ${connector_ns} ./pm_nl_ctl dump`) + if [ ${#dump[@]} -gt 0 ]; then + id=${dump[1]} sleep 1 - let counter+=1 - done + + while [ $counter -le $rm_nr_ns2 ] + do + ip netns exec ${connector_ns} ./pm_nl_ctl del $id + sleep 1 + let counter+=1 + let id+=1 + done + fi else sleep 1 ip netns exec ${connector_ns} ./pm_nl_ctl flush -- cgit v1.2.3 From d2c4333a801c73a8bc2e4bde75b573e2d1014436 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 12 Mar 2021 17:16:21 -0800 Subject: selftests: mptcp: add testcases for removing addrs This patch added the testcases for removing a list of addresses. Used the netlink to flush the addresses in the testcases. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 6782a891b3e7..191303b652a6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -890,6 +890,29 @@ remove_tests() chk_join_nr "flush subflows and signal" 3 3 3 chk_add_nr 1 1 chk_rm_nr 2 2 + + # subflows flush + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow id 150 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow + ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow + chk_join_nr "flush subflows" 3 3 3 + chk_rm_nr 3 3 + + # addresses flush + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal id 250 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow + chk_join_nr "flush addresses" 3 3 3 + chk_add_nr 3 3 + chk_rm_nr 3 3 invert } add_tests() -- cgit v1.2.3 From f26b30918dac568425811f71fd21a53ed2307f44 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 14 Mar 2021 14:19:33 +0200 Subject: selftests: netdevsim: Test psample functionality Test various aspects of psample functionality over netdevsim and in particular test that the psample module correctly reports the provided metadata. Example: # ./psample.sh TEST: psample enable / disable [ OK ] TEST: psample group number [ OK ] TEST: psample metadata [ OK ] Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../selftests/drivers/net/netdevsim/psample.sh | 181 +++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/netdevsim/psample.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/netdevsim/psample.sh b/tools/testing/selftests/drivers/net/netdevsim/psample.sh new file mode 100755 index 000000000000..ee10b1a8933c --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/psample.sh @@ -0,0 +1,181 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This test is for checking the psample module. It makes use of netdevsim +# which periodically generates "sampled" packets. + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + psample_enable_test + psample_group_num_test + psample_md_test +" +NETDEVSIM_PATH=/sys/bus/netdevsim/ +DEV_ADDR=1337 +DEV=netdevsim${DEV_ADDR} +DEVLINK_DEV=netdevsim/${DEV} +SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/ +PSAMPLE_DIR=/sys/kernel/debug/netdevsim/$DEV/psample/ +CAPTURE_FILE=$(mktemp) +NUM_NETIFS=0 +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +# Available at https://github.com/Mellanox/libpsample +require_command psample + +psample_capture() +{ + rm -f $CAPTURE_FILE + + timeout 2 ip netns exec testns1 psample &> $CAPTURE_FILE +} + +psample_enable_test() +{ + RET=0 + + echo 1 > $PSAMPLE_DIR/enable + check_err $? "Failed to enable sampling when should not" + + echo 1 > $PSAMPLE_DIR/enable 2>/dev/null + check_fail $? "Sampling enablement succeeded when should fail" + + psample_capture + if [ $(cat $CAPTURE_FILE | wc -l) -eq 0 ]; then + check_err 1 "Failed to capture sampled packets" + fi + + echo 0 > $PSAMPLE_DIR/enable + check_err $? "Failed to disable sampling when should not" + + echo 0 > $PSAMPLE_DIR/enable 2>/dev/null + check_fail $? "Sampling disablement succeeded when should fail" + + psample_capture + if [ $(cat $CAPTURE_FILE | wc -l) -ne 0 ]; then + check_err 1 "Captured sampled packets when should not" + fi + + log_test "psample enable / disable" +} + +psample_group_num_test() +{ + RET=0 + + echo 1234 > $PSAMPLE_DIR/group_num + echo 1 > $PSAMPLE_DIR/enable + + psample_capture + grep -q -e "group 1234" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong group number" + + # New group number should only be used after disable / enable. + echo 4321 > $PSAMPLE_DIR/group_num + + psample_capture + grep -q -e "group 4321" $CAPTURE_FILE + check_fail $? "Group number changed while sampling is active" + + echo 0 > $PSAMPLE_DIR/enable && echo 1 > $PSAMPLE_DIR/enable + + psample_capture + grep -q -e "group 4321" $CAPTURE_FILE + check_err $? "Group number did not change after restarting sampling" + + log_test "psample group number" + + echo 0 > $PSAMPLE_DIR/enable +} + +psample_md_test() +{ + RET=0 + + echo 1 > $PSAMPLE_DIR/enable + + echo 1234 > $PSAMPLE_DIR/in_ifindex + echo 4321 > $PSAMPLE_DIR/out_ifindex + psample_capture + + grep -q -e "in-ifindex 1234" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong in-ifindex" + + grep -q -e "out-ifindex 4321" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong out-ifindex" + + echo 5 > $PSAMPLE_DIR/out_tc + psample_capture + + grep -q -e "out-tc 5" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong out-tc" + + echo $((2**16 - 1)) > $PSAMPLE_DIR/out_tc + psample_capture + + grep -q -e "out-tc " $CAPTURE_FILE + check_fail $? "Sampled packets reported with out-tc when should not" + + echo 1 > $PSAMPLE_DIR/out_tc + echo 10000 > $PSAMPLE_DIR/out_tc_occ_max + psample_capture + + grep -q -e "out-tc-occ " $CAPTURE_FILE + check_err $? "Sampled packets not reported with out-tc-occ when should" + + echo 0 > $PSAMPLE_DIR/out_tc_occ_max + psample_capture + + grep -q -e "out-tc-occ " $CAPTURE_FILE + check_fail $? "Sampled packets reported with out-tc-occ when should not" + + echo 10000 > $PSAMPLE_DIR/latency_max + psample_capture + + grep -q -e "latency " $CAPTURE_FILE + check_err $? "Sampled packets not reported with latency when should" + + echo 0 > $PSAMPLE_DIR/latency_max + psample_capture + + grep -q -e "latency " $CAPTURE_FILE + check_fail $? "Sampled packets reported with latency when should not" + + log_test "psample metadata" + + echo 0 > $PSAMPLE_DIR/enable +} + +setup_prepare() +{ + modprobe netdevsim &> /dev/null + + echo "$DEV_ADDR 1" > ${NETDEVSIM_PATH}/new_device + while [ ! -d $SYSFS_NET_DIR ] ; do :; done + + set -e + + ip netns add testns1 + devlink dev reload $DEVLINK_DEV netns testns1 + + set +e +} + +cleanup() +{ + pre_cleanup + rm -f $CAPTURE_FILE + ip netns del testns1 + echo "$DEV_ADDR" > ${NETDEVSIM_PATH}/del_device + modprobe -r netdevsim &> /dev/null +} + +trap cleanup EXIT + +setup_prepare + +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From bb24d592e66eb059e086595510e0f0b064e4114d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 14 Mar 2021 14:19:40 +0200 Subject: selftests: mlxsw: Add tc sample tests Test that packets are sampled when tc-sample is used and that reported metadata is correct. Two sets of hosts (with and without LAG) are used, since metadata extraction in mlxsw is a bit different when LAG is involved. # ./tc_sample.sh TEST: tc sample rate (forward) [ OK ] TEST: tc sample rate (local receive) [ OK ] TEST: tc sample maximum rate [ OK ] TEST: tc sample group conflict test [ OK ] TEST: tc sample iif [ OK ] TEST: tc sample lag iif [ OK ] TEST: tc sample oif [ OK ] TEST: tc sample lag oif [ OK ] TEST: tc sample out-tc [ OK ] TEST: tc sample out-tc-occ [ OK ] Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../selftests/drivers/net/mlxsw/tc_sample.sh | 492 +++++++++++++++++++++ 1 file changed, 492 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh new file mode 100755 index 000000000000..75d00104f291 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh @@ -0,0 +1,492 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test that packets are sampled when tc-sample is used and that reported +# metadata is correct. Two sets of hosts (with and without LAG) are used, since +# metadata extraction in mlxsw is a bit different when LAG is involved. +# +# +---------------------------------+ +---------------------------------+ +# | H1 (vrf) | | H3 (vrf) | +# | + $h1 | | + $h3_lag | +# | | 192.0.2.1/28 | | | 192.0.2.17/28 | +# | | | | | | +# | | default via 192.0.2.2 | | | default via 192.0.2.18 | +# +----|----------------------------+ +----|----------------------------+ +# | | +# +----|-----------------------------------------|----------------------------+ +# | | 192.0.2.2/28 | 192.0.2.18/28 | +# | + $rp1 + $rp3_lag | +# | | +# | + $rp2 + $rp4_lag | +# | | 198.51.100.2/28 | 198.51.100.18/28 | +# +----|-----------------------------------------|----------------------------+ +# | | +# +----|----------------------------+ +----|----------------------------+ +# | | default via 198.51.100.2 | | | default via 198.51.100.18 | +# | | | | | | +# | | 198.51.100.1/28 | | | 198.51.100.17/28 | +# | + $h2 | | + $h4_lag | +# | H2 (vrf) | | H4 (vrf) | +# +---------------------------------+ +---------------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + tc_sample_rate_test + tc_sample_max_rate_test + tc_sample_group_conflict_test + tc_sample_md_iif_test + tc_sample_md_lag_iif_test + tc_sample_md_oif_test + tc_sample_md_lag_oif_test + tc_sample_md_out_tc_test + tc_sample_md_out_tc_occ_test +" +NUM_NETIFS=8 +CAPTURE_FILE=$(mktemp) +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +# Available at https://github.com/Mellanox/libpsample +require_command psample + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 + + ip -4 route add default vrf v$h1 nexthop via 192.0.2.2 +} + +h1_destroy() +{ + ip -4 route del default vrf v$h1 nexthop via 192.0.2.2 + + simple_if_fini $h1 192.0.2.1/28 +} + +h2_create() +{ + simple_if_init $h2 198.51.100.1/28 + + ip -4 route add default vrf v$h2 nexthop via 198.51.100.2 +} + +h2_destroy() +{ + ip -4 route del default vrf v$h2 nexthop via 198.51.100.2 + + simple_if_fini $h2 198.51.100.1/28 +} + +h3_create() +{ + ip link set dev $h3 down + ip link add name ${h3}_bond type bond mode 802.3ad + ip link set dev $h3 master ${h3}_bond + + simple_if_init ${h3}_bond 192.0.2.17/28 + + ip -4 route add default vrf v${h3}_bond nexthop via 192.0.2.18 +} + +h3_destroy() +{ + ip -4 route del default vrf v${h3}_bond nexthop via 192.0.2.18 + + simple_if_fini ${h3}_bond 192.0.2.17/28 + + ip link set dev $h3 nomaster + ip link del dev ${h3}_bond +} + +h4_create() +{ + ip link set dev $h4 down + ip link add name ${h4}_bond type bond mode 802.3ad + ip link set dev $h4 master ${h4}_bond + + simple_if_init ${h4}_bond 198.51.100.17/28 + + ip -4 route add default vrf v${h4}_bond nexthop via 198.51.100.18 +} + +h4_destroy() +{ + ip -4 route del default vrf v${h4}_bond nexthop via 198.51.100.18 + + simple_if_fini ${h4}_bond 198.51.100.17/28 + + ip link set dev $h4 nomaster + ip link del dev ${h4}_bond +} + +router_create() +{ + ip link set dev $rp1 up + __addr_add_del $rp1 add 192.0.2.2/28 + tc qdisc add dev $rp1 clsact + + ip link set dev $rp2 up + __addr_add_del $rp2 add 198.51.100.2/28 + tc qdisc add dev $rp2 clsact + + ip link add name ${rp3}_bond type bond mode 802.3ad + ip link set dev $rp3 master ${rp3}_bond + __addr_add_del ${rp3}_bond add 192.0.2.18/28 + tc qdisc add dev $rp3 clsact + ip link set dev ${rp3}_bond up + + ip link add name ${rp4}_bond type bond mode 802.3ad + ip link set dev $rp4 master ${rp4}_bond + __addr_add_del ${rp4}_bond add 198.51.100.18/28 + tc qdisc add dev $rp4 clsact + ip link set dev ${rp4}_bond up +} + +router_destroy() +{ + ip link set dev ${rp4}_bond down + tc qdisc del dev $rp4 clsact + __addr_add_del ${rp4}_bond del 198.51.100.18/28 + ip link set dev $rp4 nomaster + ip link del dev ${rp4}_bond + + ip link set dev ${rp3}_bond down + tc qdisc del dev $rp3 clsact + __addr_add_del ${rp3}_bond del 192.0.2.18/28 + ip link set dev $rp3 nomaster + ip link del dev ${rp3}_bond + + tc qdisc del dev $rp2 clsact + __addr_add_del $rp2 del 198.51.100.2/28 + ip link set dev $rp2 down + + tc qdisc del dev $rp1 clsact + __addr_add_del $rp1 del 192.0.2.2/28 + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + h3=${NETIFS[p5]} + rp3=${NETIFS[p6]} + h4=${NETIFS[p7]} + rp4=${NETIFS[p8]} + + vrf_prepare + + h1_create + h2_create + h3_create + h4_create + router_create +} + +cleanup() +{ + pre_cleanup + + rm -f $CAPTURE_FILE + + router_destroy + h4_destroy + h3_destroy + h2_destroy + h1_destroy + + vrf_cleanup +} + +psample_capture_start() +{ + rm -f $CAPTURE_FILE + + psample &> $CAPTURE_FILE & + + sleep 1 +} + +psample_capture_stop() +{ + { kill %% && wait %%; } 2>/dev/null +} + +__tc_sample_rate_test() +{ + local desc=$1; shift + local dip=$1; shift + local pkts pct + + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 32 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B $dip -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + pkts=$(grep -e "group 1 " $CAPTURE_FILE | wc -l) + pct=$((100 * (pkts - 100) / 100)) + (( -25 <= pct && pct <= 25)) + check_err $? "Expected 100 packets, got $pkts packets, which is $pct% off. Required accuracy is +-25%" + + log_test "tc sample rate ($desc)" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_rate_test() +{ + __tc_sample_rate_test "forward" 198.51.100.1 + __tc_sample_rate_test "local receive" 192.0.2.2 +} + +tc_sample_max_rate_test() +{ + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate $((35 * 10 ** 8)) group 1 + check_err $? "Failed to configure sampling rule with max rate" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate $((35 * 10 ** 8 + 1)) \ + group 1 &> /dev/null + check_fail $? "Managed to configure sampling rate above maximum" + + log_test "tc sample maximum rate" +} + +tc_sample_group_conflict_test() +{ + RET=0 + + # Test that two sampling rules cannot be configured on the same port + # with different groups. + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + tc filter add dev $rp1 ingress protocol all pref 2 handle 102 matchall \ + skip_sw action sample rate 1024 group 2 &> /dev/null + check_fail $? "Managed to configure sampling rule with conflicting group" + + log_test "tc sample group conflict test" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_iif_test() +{ + local rp1_ifindex + + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp1_ifindex=$(ip -j -p link show dev $rp1 | jq '.[]["ifindex"]') + grep -q -e "in-ifindex $rp1_ifindex " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected in-ifindex" + + log_test "tc sample iif" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_lag_iif_test() +{ + local rp3_ifindex + + RET=0 + + tc filter add dev $rp3 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v${h3}_bond $MZ ${h3}_bond -c 3200 -d 1msec -p 64 \ + -A 192.0.2.17 -B 198.51.100.17 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp3_ifindex=$(ip -j -p link show dev $rp3 | jq '.[]["ifindex"]') + grep -q -e "in-ifindex $rp3_ifindex " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected in-ifindex" + + log_test "tc sample lag iif" + + tc filter del dev $rp3 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_oif_test() +{ + local rp2_ifindex + + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp2_ifindex=$(ip -j -p link show dev $rp2 | jq '.[]["ifindex"]') + grep -q -e "out-ifindex $rp2_ifindex " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected out-ifindex" + + log_test "tc sample oif" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_lag_oif_test() +{ + local rp4_ifindex + + RET=0 + + tc filter add dev $rp3 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v${h3}_bond $MZ ${h3}_bond -c 3200 -d 1msec -p 64 \ + -A 192.0.2.17 -B 198.51.100.17 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp4_ifindex=$(ip -j -p link show dev $rp4 | jq '.[]["ifindex"]') + grep -q -e "out-ifindex $rp4_ifindex " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected out-ifindex" + + log_test "tc sample lag oif" + + tc filter del dev $rp3 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_out_tc_test() +{ + RET=0 + + # Output traffic class is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + # By default, all the packets should go to the same traffic class (0). + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "out-tc 0 " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected out-tc (0)" + + # Map all priorities to highest traffic class (7) and check reported + # out-tc. + tc qdisc replace dev $rp2 root handle 1: \ + prio bands 3 priomap 0 0 0 0 0 0 0 0 + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "out-tc 7 " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected out-tc (7)" + + log_test "tc sample out-tc" + + tc qdisc del dev $rp2 root handle 1: + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_out_tc_occ_test() +{ + local backlog pct occ + + RET=0 + + # Output traffic class occupancy is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + # Configure a shaper on egress to create congestion. + tc qdisc replace dev $rp2 root handle 1: \ + tbf rate 1Mbit burst 256k limit 1M + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 0 -d 1usec -p 1400 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q & + + # Allow congestion to reach steady state. + sleep 10 + + backlog=$(tc -j -p -s qdisc show dev $rp2 | jq '.[0]["backlog"]') + + # Kill mausezahn. + { kill %% && wait %%; } 2>/dev/null + + psample_capture_stop + + # Record last congestion sample. + occ=$(grep -e "out-tc-occ " $CAPTURE_FILE | tail -n 1 | \ + cut -d ' ' -f 16) + + pct=$((100 * (occ - backlog) / backlog)) + (( -1 <= pct && pct <= 1)) + check_err $? "Recorded a congestion of $backlog bytes, but sampled congestion is $occ bytes, which is $pct% off. Required accuracy is +-5%" + + log_test "tc sample out-tc-occ" + + tc qdisc del dev $rp2 root handle 1: + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS -- cgit v1.2.3 From 508ef28674c1fe6ac388586cb31dc0f0bbc4172c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 2 Nov 2020 19:12:16 +0100 Subject: x86/insn: Add @buf_len param to insn_init() kernel-doc comment It wasn't documented so add it. No functional changes. Signed-off-by: Borislav Petkov Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20210304174237.31945-3-bp@alien8.de --- arch/x86/lib/insn.c | 1 + tools/arch/x86/lib/insn.c | 1 + 2 files changed, 2 insertions(+) (limited to 'tools') diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 435630a6ec97..4d1640dc7882 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -51,6 +51,7 @@ * insn_init() - initialize struct insn * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @buf_len: length of the insn buffer at @kaddr * @x86_64: !0 for 64-bit kernel or 64-bit app */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index 3d9355ed1246..31afbf0a75f7 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -51,6 +51,7 @@ * insn_init() - initialize struct insn * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @buf_len: length of the insn buffer at @kaddr * @x86_64: !0 for 64-bit kernel or 64-bit app */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) -- cgit v1.2.3 From d30c7b820be5c4777fe6c3b0c21f9d0064251e51 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 22 Feb 2021 13:34:40 +0100 Subject: x86/insn: Add a __ignore_sync_check__ marker Add an explicit __ignore_sync_check__ marker which will be used to mark lines which are supposed to be ignored by file synchronization check scripts, its advantage being that it explicitly denotes such lines in the code. Signed-off-by: Borislav Petkov Reviewed-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20210304174237.31945-4-bp@alien8.de --- arch/x86/include/asm/inat.h | 2 +- arch/x86/include/asm/insn.h | 2 +- arch/x86/lib/inat.c | 2 +- arch/x86/lib/insn.c | 6 +++--- tools/arch/x86/include/asm/inat.h | 2 +- tools/arch/x86/include/asm/insn.h | 2 +- tools/arch/x86/lib/inat.c | 2 +- tools/arch/x86/lib/insn.c | 6 +++--- tools/objtool/sync-check.sh | 17 +++++++++++++---- tools/perf/check-headers.sh | 15 +++++++++++---- 10 files changed, 36 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 4cf2ad521f65..b56c5741581a 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -6,7 +6,7 @@ * * Written by Masami Hiramatsu */ -#include +#include /* __ignore_sync_check__ */ /* * Internal bits. Don't use bitmasks directly, because these bits are diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 95a448fbb44c..93f84600ac2c 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -9,7 +9,7 @@ #include /* insn_attr_t is defined in inat.h */ -#include +#include /* __ignore_sync_check__ */ #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index 12539fca75c4..b0f3b2a62ae2 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -4,7 +4,7 @@ * * Written by Masami Hiramatsu */ -#include +#include /* __ignore_sync_check__ */ /* Attribute tables are generated from opcode map */ #include "inat-tables.c" diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 4d1640dc7882..ed1e0335aa14 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -11,10 +11,10 @@ #else #include #endif -#include -#include +#include /*__ignore_sync_check__ */ +#include /* __ignore_sync_check__ */ -#include +#include /* __ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ ({ \ diff --git a/tools/arch/x86/include/asm/inat.h b/tools/arch/x86/include/asm/inat.h index 877827b7c2c3..a61051400311 100644 --- a/tools/arch/x86/include/asm/inat.h +++ b/tools/arch/x86/include/asm/inat.h @@ -6,7 +6,7 @@ * * Written by Masami Hiramatsu */ -#include "inat_types.h" +#include "inat_types.h" /* __ignore_sync_check__ */ /* * Internal bits. Don't use bitmasks directly, because these bits are diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index cc777c185212..7e1239aa7dd9 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -9,7 +9,7 @@ #include /* insn_attr_t is defined in inat.h */ -#include "inat.h" +#include "inat.h" /* __ignore_sync_check__ */ #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) diff --git a/tools/arch/x86/lib/inat.c b/tools/arch/x86/lib/inat.c index 4f5ed49e1b4e..dfbcc6405941 100644 --- a/tools/arch/x86/lib/inat.c +++ b/tools/arch/x86/lib/inat.c @@ -4,7 +4,7 @@ * * Written by Masami Hiramatsu */ -#include "../include/asm/insn.h" +#include "../include/asm/insn.h" /* __ignore_sync_check__ */ /* Attribute tables are generated from opcode map */ #include "inat-tables.c" diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index 31afbf0a75f7..1174c43b3944 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -11,10 +11,10 @@ #else #include #endif -#include "../include/asm/inat.h" -#include "../include/asm/insn.h" +#include "../include/asm/inat.h" /* __ignore_sync_check__ */ +#include "../include/asm/insn.h" /* __ignore_sync_check__ */ -#include "../include/asm/emulate_prefix.h" +#include "../include/asm/emulate_prefix.h" /* __ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ ({ \ diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh index 606a4b5e929f..4bbabaecab14 100755 --- a/tools/objtool/sync-check.sh +++ b/tools/objtool/sync-check.sh @@ -16,11 +16,14 @@ arch/x86/include/asm/emulate_prefix.h arch/x86/lib/x86-opcode-map.txt arch/x86/tools/gen-insn-attr-x86.awk include/linux/static_call_types.h -arch/x86/include/asm/inat.h -I '^#include [\"<]\(asm/\)*inat_types.h[\">]' -arch/x86/include/asm/insn.h -I '^#include [\"<]\(asm/\)*inat.h[\">]' -arch/x86/lib/inat.c -I '^#include [\"<]\(../include/\)*asm/insn.h[\">]' -arch/x86/lib/insn.c -I '^#include [\"<]\(../include/\)*asm/in\(at\|sn\).h[\">]' -I '^#include [\"<]\(../include/\)*asm/emulate_prefix.h[\">]' " + +SYNC_CHECK_FILES=' +arch/x86/include/asm/inat.h +arch/x86/include/asm/insn.h +arch/x86/lib/inat.c +arch/x86/lib/insn.c +' fi check_2 () { @@ -63,3 +66,9 @@ while read -r file_entry; do done <string syscall @@ -129,6 +136,10 @@ for i in $FILES; do check $i -B done +for i in $SYNC_CHECK_FILES; do + check $i '-I "^.*\/\*.*__ignore_sync_check__.*\*\/.*$"' +done + # diff with extra ignore lines check arch/x86/lib/memcpy_64.S '-I "^EXPORT_SYMBOL" -I "^#include " -I"^SYM_FUNC_START\(_LOCAL\)*(memcpy_\(erms\|orig\))"' check arch/x86/lib/memset_64.S '-I "^EXPORT_SYMBOL" -I "^#include " -I"^SYM_FUNC_START\(_LOCAL\)*(memset_\(erms\|orig\))"' @@ -137,10 +148,6 @@ check include/uapi/linux/mman.h '-I "^#include <\(uapi/\)*asm/mman.h>"' check include/linux/build_bug.h '-I "^#\(ifndef\|endif\)\( \/\/\)* static_assert$"' check include/linux/ctype.h '-I "isdigit("' check lib/ctype.c '-I "^EXPORT_SYMBOL" -I "^#include " -B' -check arch/x86/include/asm/inat.h '-I "^#include [\"<]\(asm/\)*inat_types.h[\">]"' -check arch/x86/include/asm/insn.h '-I "^#include [\"<]\(asm/\)*inat.h[\">]"' -check arch/x86/lib/inat.c '-I "^#include [\"<]\(../include/\)*asm/insn.h[\">]"' -check arch/x86/lib/insn.c '-I "^#include [\"<]\(../include/\)*asm/in\(at\|sn\).h[\">]" -I "^#include [\"<]\(../include/\)*asm/emulate_prefix.h[\">]"' # diff non-symmetric files check_2 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl -- cgit v1.2.3 From 93281c4a96572a34504244969b938e035204778d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 3 Nov 2020 17:28:30 +0100 Subject: x86/insn: Add an insn_decode() API Users of the instruction decoder should use this to decode instruction bytes. For that, have insn*() helpers return an int value to denote success/failure. When there's an error fetching the next insn byte and the insn falls short, return -ENODATA to denote that. While at it, make insn_get_opcode() more stricter as to whether what has seen so far is a valid insn and if not. Copy linux/kconfig.h for the tools-version of the decoder so that it can use IS_ENABLED(). Also, cast the INSN_MODE_KERN dummy define value to (enum insn_mode) for tools use of the decoder because perf tool builds with -Werror and errors out with -Werror=sign-compare otherwise. Signed-off-by: Borislav Petkov Acked-by: Masami Hiramatsu Link: https://lkml.kernel.org/r/20210304174237.31945-5-bp@alien8.de --- arch/x86/include/asm/insn.h | 24 +++-- arch/x86/lib/insn.c | 216 +++++++++++++++++++++++++++++-------- tools/arch/x86/include/asm/insn.h | 24 +++-- tools/arch/x86/lib/insn.c | 222 +++++++++++++++++++++++++++++--------- tools/include/linux/kconfig.h | 73 +++++++++++++ 5 files changed, 452 insertions(+), 107 deletions(-) create mode 100644 tools/include/linux/kconfig.h (limited to 'tools') diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 93f84600ac2c..de9fe760c1e7 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -132,13 +132,23 @@ struct insn { #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); -extern void insn_get_prefixes(struct insn *insn); -extern void insn_get_opcode(struct insn *insn); -extern void insn_get_modrm(struct insn *insn); -extern void insn_get_sib(struct insn *insn); -extern void insn_get_displacement(struct insn *insn); -extern void insn_get_immediate(struct insn *insn); -extern void insn_get_length(struct insn *insn); +extern int insn_get_prefixes(struct insn *insn); +extern int insn_get_opcode(struct insn *insn); +extern int insn_get_modrm(struct insn *insn); +extern int insn_get_sib(struct insn *insn); +extern int insn_get_displacement(struct insn *insn); +extern int insn_get_immediate(struct insn *insn); +extern int insn_get_length(struct insn *insn); + +enum insn_mode { + INSN_MODE_32, + INSN_MODE_64, + /* Mode is determined by the current kernel build. */ + INSN_MODE_KERN, + INSN_NUM_MODES, +}; + +extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m); /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index ed1e0335aa14..bb58004497f8 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -14,6 +14,9 @@ #include /*__ignore_sync_check__ */ #include /* __ignore_sync_check__ */ +#include +#include + #include /* __ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ @@ -112,8 +115,12 @@ static void insn_get_emulate_prefix(struct insn *insn) * Populates the @insn->prefixes bitmap, and updates @insn->next_byte * to point to the (first) opcode. No effect if @insn->prefixes.got * is already set. + * + * * Returns: + * 0: on success + * < 0: on error */ -void insn_get_prefixes(struct insn *insn) +int insn_get_prefixes(struct insn *insn) { struct insn_field *prefixes = &insn->prefixes; insn_attr_t attr; @@ -121,7 +128,7 @@ void insn_get_prefixes(struct insn *insn) int i, nb; if (prefixes->got) - return; + return 0; insn_get_emulate_prefix(insn); @@ -231,8 +238,10 @@ vex_end: prefixes->got = 1; + return 0; + err_out: - return; + return -ENODATA; } /** @@ -244,16 +253,25 @@ err_out: * If necessary, first collects any preceding (prefix) bytes. * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got * is already 1. + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_opcode(struct insn *insn) +int insn_get_opcode(struct insn *insn) { struct insn_field *opcode = &insn->opcode; + int pfx_id, ret; insn_byte_t op; - int pfx_id; + if (opcode->got) - return; - if (!insn->prefixes.got) - insn_get_prefixes(insn); + return 0; + + if (!insn->prefixes.got) { + ret = insn_get_prefixes(insn); + if (ret) + return ret; + } /* Get first opcode */ op = get_next(insn_byte_t, insn); @@ -268,9 +286,13 @@ void insn_get_opcode(struct insn *insn) insn->attr = inat_get_avx_attribute(op, m, p); if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && - !inat_is_group(insn->attr))) - insn->attr = 0; /* This instruction is bad */ - goto end; /* VEX has only 1 byte for opcode */ + !inat_is_group(insn->attr))) { + /* This instruction is bad */ + insn->attr = 0; + return -EINVAL; + } + /* VEX has only 1 byte for opcode */ + goto end; } insn->attr = inat_get_opcode_attribute(op); @@ -281,13 +303,18 @@ void insn_get_opcode(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); } - if (inat_must_vex(insn->attr)) - insn->attr = 0; /* This instruction is bad */ + + if (inat_must_vex(insn->attr)) { + /* This instruction is bad */ + insn->attr = 0; + return -EINVAL; + } end: opcode->got = 1; + return 0; err_out: - return; + return -ENODATA; } /** @@ -297,15 +324,25 @@ err_out: * Populates @insn->modrm and updates @insn->next_byte to point past the * ModRM byte, if any. If necessary, first collects the preceding bytes * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_modrm(struct insn *insn) +int insn_get_modrm(struct insn *insn) { struct insn_field *modrm = &insn->modrm; insn_byte_t pfx_id, mod; + int ret; + if (modrm->got) - return; - if (!insn->opcode.got) - insn_get_opcode(insn); + return 0; + + if (!insn->opcode.got) { + ret = insn_get_opcode(insn); + if (ret) + return ret; + } if (inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); @@ -314,17 +351,22 @@ void insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) - insn->attr = 0; /* This is bad */ + if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + /* Bad insn */ + insn->attr = 0; + return -EINVAL; + } } } if (insn->x86_64 && inat_is_force64(insn->attr)) insn->opnd_bytes = 8; + modrm->got = 1; + return 0; err_out: - return; + return -ENODATA; } @@ -338,11 +380,16 @@ err_out: int insn_rip_relative(struct insn *insn) { struct insn_field *modrm = &insn->modrm; + int ret; if (!insn->x86_64) return 0; - if (!modrm->got) - insn_get_modrm(insn); + + if (!modrm->got) { + ret = insn_get_modrm(insn); + if (ret) + return 0; + } /* * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. @@ -356,15 +403,25 @@ int insn_rip_relative(struct insn *insn) * * If necessary, first collects the instruction up to and including the * ModRM byte. + * + * Returns: + * 0: if decoding succeeded + * < 0: otherwise. */ -void insn_get_sib(struct insn *insn) +int insn_get_sib(struct insn *insn) { insn_byte_t modrm; + int ret; if (insn->sib.got) - return; - if (!insn->modrm.got) - insn_get_modrm(insn); + return 0; + + if (!insn->modrm.got) { + ret = insn_get_modrm(insn); + if (ret) + return ret; + } + if (insn->modrm.nbytes) { modrm = insn->modrm.bytes[0]; if (insn->addr_bytes != 2 && @@ -375,8 +432,10 @@ void insn_get_sib(struct insn *insn) } insn->sib.got = 1; + return 0; + err_out: - return; + return -ENODATA; } @@ -387,15 +446,25 @@ err_out: * If necessary, first collects the instruction up to and including the * SIB byte. * Displacement value is sign-expanded. + * + * * Returns: + * 0: if decoding succeeded + * < 0: otherwise. */ -void insn_get_displacement(struct insn *insn) +int insn_get_displacement(struct insn *insn) { insn_byte_t mod, rm, base; + int ret; if (insn->displacement.got) - return; - if (!insn->sib.got) - insn_get_sib(insn); + return 0; + + if (!insn->sib.got) { + ret = insn_get_sib(insn); + if (ret) + return ret; + } + if (insn->modrm.nbytes) { /* * Interpreting the modrm byte: @@ -437,9 +506,10 @@ void insn_get_displacement(struct insn *insn) } out: insn->displacement.got = 1; + return 0; err_out: - return; + return -ENODATA; } /* Decode moffset16/32/64. Return 0 if failed */ @@ -538,20 +608,30 @@ err_out: } /** - * insn_get_immediate() - Get the immediates of instruction + * insn_get_immediate() - Get the immediate in an instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * displacement bytes. * Basically, most of immediates are sign-expanded. Unsigned-value can be - * get by bit masking with ((1 << (nbytes * 8)) - 1) + * computed by bit masking with ((1 << (nbytes * 8)) - 1) + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_immediate(struct insn *insn) +int insn_get_immediate(struct insn *insn) { + int ret; + if (insn->immediate.got) - return; - if (!insn->displacement.got) - insn_get_displacement(insn); + return 0; + + if (!insn->displacement.got) { + ret = insn_get_displacement(insn); + if (ret) + return ret; + } if (inat_has_moffset(insn->attr)) { if (!__get_moffset(insn)) @@ -598,9 +678,10 @@ void insn_get_immediate(struct insn *insn) } done: insn->immediate.got = 1; + return 0; err_out: - return; + return -ENODATA; } /** @@ -609,13 +690,58 @@ err_out: * * If necessary, first collects the instruction up to and including the * immediates bytes. - */ -void insn_get_length(struct insn *insn) + * + * Returns: + * - 0 on success + * - < 0 on error +*/ +int insn_get_length(struct insn *insn) { + int ret; + if (insn->length) - return; - if (!insn->immediate.got) - insn_get_immediate(insn); + return 0; + + if (!insn->immediate.got) { + ret = insn_get_immediate(insn); + if (ret) + return ret; + } + insn->length = (unsigned char)((unsigned long)insn->next_byte - (unsigned long)insn->kaddr); + + return 0; +} + +/** + * insn_decode() - Decode an x86 instruction + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @buf_len: length of the insn buffer at @kaddr + * @m: insn mode, see enum insn_mode + * + * Returns: + * 0: if decoding succeeded + * < 0: otherwise. + */ +int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m) +{ + int ret; + +/* #define INSN_MODE_KERN -1 __ignore_sync_check__ mode is only valid in the kernel */ + + if (m == INSN_MODE_KERN) + insn_init(insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64)); + else + insn_init(insn, kaddr, buf_len, m == INSN_MODE_64); + + ret = insn_get_length(insn); + if (ret) + return ret; + + if (insn_complete(insn)) + return 0; + + return -EINVAL; } diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 7e1239aa7dd9..2fe19b1a8765 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -132,13 +132,23 @@ struct insn { #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); -extern void insn_get_prefixes(struct insn *insn); -extern void insn_get_opcode(struct insn *insn); -extern void insn_get_modrm(struct insn *insn); -extern void insn_get_sib(struct insn *insn); -extern void insn_get_displacement(struct insn *insn); -extern void insn_get_immediate(struct insn *insn); -extern void insn_get_length(struct insn *insn); +extern int insn_get_prefixes(struct insn *insn); +extern int insn_get_opcode(struct insn *insn); +extern int insn_get_modrm(struct insn *insn); +extern int insn_get_sib(struct insn *insn); +extern int insn_get_displacement(struct insn *insn); +extern int insn_get_immediate(struct insn *insn); +extern int insn_get_length(struct insn *insn); + +enum insn_mode { + INSN_MODE_32, + INSN_MODE_64, + /* Mode is determined by the current kernel build. */ + INSN_MODE_KERN, + INSN_NUM_MODES, +}; + +extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m); /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index 1174c43b3944..be2b057f91d4 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -11,10 +11,13 @@ #else #include #endif -#include "../include/asm/inat.h" /* __ignore_sync_check__ */ -#include "../include/asm/insn.h" /* __ignore_sync_check__ */ +#include /*__ignore_sync_check__ */ +#include /* __ignore_sync_check__ */ -#include "../include/asm/emulate_prefix.h" /* __ignore_sync_check__ */ +#include +#include + +#include /* __ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ ({ \ @@ -112,8 +115,12 @@ static void insn_get_emulate_prefix(struct insn *insn) * Populates the @insn->prefixes bitmap, and updates @insn->next_byte * to point to the (first) opcode. No effect if @insn->prefixes.got * is already set. + * + * * Returns: + * 0: on success + * < 0: on error */ -void insn_get_prefixes(struct insn *insn) +int insn_get_prefixes(struct insn *insn) { struct insn_field *prefixes = &insn->prefixes; insn_attr_t attr; @@ -121,7 +128,7 @@ void insn_get_prefixes(struct insn *insn) int i, nb; if (prefixes->got) - return; + return 0; insn_get_emulate_prefix(insn); @@ -231,8 +238,10 @@ vex_end: prefixes->got = 1; + return 0; + err_out: - return; + return -ENODATA; } /** @@ -244,16 +253,25 @@ err_out: * If necessary, first collects any preceding (prefix) bytes. * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got * is already 1. + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_opcode(struct insn *insn) +int insn_get_opcode(struct insn *insn) { struct insn_field *opcode = &insn->opcode; + int pfx_id, ret; insn_byte_t op; - int pfx_id; + if (opcode->got) - return; - if (!insn->prefixes.got) - insn_get_prefixes(insn); + return 0; + + if (!insn->prefixes.got) { + ret = insn_get_prefixes(insn); + if (ret) + return ret; + } /* Get first opcode */ op = get_next(insn_byte_t, insn); @@ -268,9 +286,13 @@ void insn_get_opcode(struct insn *insn) insn->attr = inat_get_avx_attribute(op, m, p); if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && - !inat_is_group(insn->attr))) - insn->attr = 0; /* This instruction is bad */ - goto end; /* VEX has only 1 byte for opcode */ + !inat_is_group(insn->attr))) { + /* This instruction is bad */ + insn->attr = 0; + return -EINVAL; + } + /* VEX has only 1 byte for opcode */ + goto end; } insn->attr = inat_get_opcode_attribute(op); @@ -281,13 +303,18 @@ void insn_get_opcode(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); } - if (inat_must_vex(insn->attr)) - insn->attr = 0; /* This instruction is bad */ + + if (inat_must_vex(insn->attr)) { + /* This instruction is bad */ + insn->attr = 0; + return -EINVAL; + } end: opcode->got = 1; + return 0; err_out: - return; + return -ENODATA; } /** @@ -297,15 +324,25 @@ err_out: * Populates @insn->modrm and updates @insn->next_byte to point past the * ModRM byte, if any. If necessary, first collects the preceding bytes * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_modrm(struct insn *insn) +int insn_get_modrm(struct insn *insn) { struct insn_field *modrm = &insn->modrm; insn_byte_t pfx_id, mod; + int ret; + if (modrm->got) - return; - if (!insn->opcode.got) - insn_get_opcode(insn); + return 0; + + if (!insn->opcode.got) { + ret = insn_get_opcode(insn); + if (ret) + return ret; + } if (inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); @@ -314,17 +351,22 @@ void insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) - insn->attr = 0; /* This is bad */ + if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + /* Bad insn */ + insn->attr = 0; + return -EINVAL; + } } } if (insn->x86_64 && inat_is_force64(insn->attr)) insn->opnd_bytes = 8; + modrm->got = 1; + return 0; err_out: - return; + return -ENODATA; } @@ -338,11 +380,16 @@ err_out: int insn_rip_relative(struct insn *insn) { struct insn_field *modrm = &insn->modrm; + int ret; if (!insn->x86_64) return 0; - if (!modrm->got) - insn_get_modrm(insn); + + if (!modrm->got) { + ret = insn_get_modrm(insn); + if (ret) + return 0; + } /* * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. @@ -356,15 +403,25 @@ int insn_rip_relative(struct insn *insn) * * If necessary, first collects the instruction up to and including the * ModRM byte. + * + * Returns: + * 0: if decoding succeeded + * < 0: otherwise. */ -void insn_get_sib(struct insn *insn) +int insn_get_sib(struct insn *insn) { insn_byte_t modrm; + int ret; if (insn->sib.got) - return; - if (!insn->modrm.got) - insn_get_modrm(insn); + return 0; + + if (!insn->modrm.got) { + ret = insn_get_modrm(insn); + if (ret) + return ret; + } + if (insn->modrm.nbytes) { modrm = insn->modrm.bytes[0]; if (insn->addr_bytes != 2 && @@ -375,8 +432,10 @@ void insn_get_sib(struct insn *insn) } insn->sib.got = 1; + return 0; + err_out: - return; + return -ENODATA; } @@ -387,15 +446,25 @@ err_out: * If necessary, first collects the instruction up to and including the * SIB byte. * Displacement value is sign-expanded. + * + * * Returns: + * 0: if decoding succeeded + * < 0: otherwise. */ -void insn_get_displacement(struct insn *insn) +int insn_get_displacement(struct insn *insn) { insn_byte_t mod, rm, base; + int ret; if (insn->displacement.got) - return; - if (!insn->sib.got) - insn_get_sib(insn); + return 0; + + if (!insn->sib.got) { + ret = insn_get_sib(insn); + if (ret) + return ret; + } + if (insn->modrm.nbytes) { /* * Interpreting the modrm byte: @@ -437,9 +506,10 @@ void insn_get_displacement(struct insn *insn) } out: insn->displacement.got = 1; + return 0; err_out: - return; + return -ENODATA; } /* Decode moffset16/32/64. Return 0 if failed */ @@ -538,20 +608,30 @@ err_out: } /** - * insn_get_immediate() - Get the immediates of instruction + * insn_get_immediate() - Get the immediate in an instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * displacement bytes. * Basically, most of immediates are sign-expanded. Unsigned-value can be - * get by bit masking with ((1 << (nbytes * 8)) - 1) + * computed by bit masking with ((1 << (nbytes * 8)) - 1) + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_immediate(struct insn *insn) +int insn_get_immediate(struct insn *insn) { + int ret; + if (insn->immediate.got) - return; - if (!insn->displacement.got) - insn_get_displacement(insn); + return 0; + + if (!insn->displacement.got) { + ret = insn_get_displacement(insn); + if (ret) + return ret; + } if (inat_has_moffset(insn->attr)) { if (!__get_moffset(insn)) @@ -598,9 +678,10 @@ void insn_get_immediate(struct insn *insn) } done: insn->immediate.got = 1; + return 0; err_out: - return; + return -ENODATA; } /** @@ -609,13 +690,58 @@ err_out: * * If necessary, first collects the instruction up to and including the * immediates bytes. - */ -void insn_get_length(struct insn *insn) + * + * Returns: + * - 0 on success + * - < 0 on error +*/ +int insn_get_length(struct insn *insn) { + int ret; + if (insn->length) - return; - if (!insn->immediate.got) - insn_get_immediate(insn); + return 0; + + if (!insn->immediate.got) { + ret = insn_get_immediate(insn); + if (ret) + return ret; + } + insn->length = (unsigned char)((unsigned long)insn->next_byte - (unsigned long)insn->kaddr); + + return 0; +} + +/** + * insn_decode() - Decode an x86 instruction + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @buf_len: length of the insn buffer at @kaddr + * @m: insn mode, see enum insn_mode + * + * Returns: + * 0: if decoding succeeded + * < 0: otherwise. + */ +int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m) +{ + int ret; + +#define INSN_MODE_KERN (enum insn_mode)-1 /* __ignore_sync_check__ mode is only valid in the kernel */ + + if (m == INSN_MODE_KERN) + insn_init(insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64)); + else + insn_init(insn, kaddr, buf_len, m == INSN_MODE_64); + + ret = insn_get_length(insn); + if (ret) + return ret; + + if (insn_complete(insn)) + return 0; + + return -EINVAL; } diff --git a/tools/include/linux/kconfig.h b/tools/include/linux/kconfig.h new file mode 100644 index 000000000000..1555a0c4f345 --- /dev/null +++ b/tools/include/linux/kconfig.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_KCONFIG_H +#define _TOOLS_LINUX_KCONFIG_H + +/* CONFIG_CC_VERSION_TEXT (Do not delete this comment. See help in Kconfig) */ + +#ifdef CONFIG_CPU_BIG_ENDIAN +#define __BIG_ENDIAN 4321 +#else +#define __LITTLE_ENDIAN 1234 +#endif + +#define __ARG_PLACEHOLDER_1 0, +#define __take_second_arg(__ignored, val, ...) val + +/* + * The use of "&&" / "||" is limited in certain expressions. + * The following enable to calculate "and" / "or" with macro expansion only. + */ +#define __and(x, y) ___and(x, y) +#define ___and(x, y) ____and(__ARG_PLACEHOLDER_##x, y) +#define ____and(arg1_or_junk, y) __take_second_arg(arg1_or_junk y, 0) + +#define __or(x, y) ___or(x, y) +#define ___or(x, y) ____or(__ARG_PLACEHOLDER_##x, y) +#define ____or(arg1_or_junk, y) __take_second_arg(arg1_or_junk 1, y) + +/* + * Helper macros to use CONFIG_ options in C/CPP expressions. Note that + * these only work with boolean and tristate options. + */ + +/* + * Getting something that works in C and CPP for an arg that may or may + * not be defined is tricky. Here, if we have "#define CONFIG_BOOGER 1" + * we match on the placeholder define, insert the "0," for arg1 and generate + * the triplet (0, 1, 0). Then the last step cherry picks the 2nd arg (a one). + * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when + * the last step cherry picks the 2nd arg, we get a zero. + */ +#define __is_defined(x) ___is_defined(x) +#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val) +#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0) + +/* + * IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0 + * otherwise. For boolean options, this is equivalent to + * IS_ENABLED(CONFIG_FOO). + */ +#define IS_BUILTIN(option) __is_defined(option) + +/* + * IS_MODULE(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'm', 0 + * otherwise. + */ +#define IS_MODULE(option) __is_defined(option##_MODULE) + +/* + * IS_REACHABLE(CONFIG_FOO) evaluates to 1 if the currently compiled + * code can call a function defined in code compiled based on CONFIG_FOO. + * This is similar to IS_ENABLED(), but returns false when invoked from + * built-in code when CONFIG_FOO is set to 'm'. + */ +#define IS_REACHABLE(option) __or(IS_BUILTIN(option), \ + __and(IS_MODULE(option), __is_defined(MODULE))) + +/* + * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm', + * 0 otherwise. + */ +#define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option)) + +#endif /* _TOOLS_LINUX_KCONFIG_H */ -- cgit v1.2.3 From c7e41b099be40112d53daccef8553e99e455e0d6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Nov 2020 17:37:06 +0100 Subject: tools/objtool: Convert to insn_decode() Simplify code, no functional changes. Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210304174237.31945-18-bp@alien8.de --- tools/objtool/arch/x86/decode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 549813cff8ab..8380d0b1d933 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -90,7 +90,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, struct list_head *ops_list) { struct insn insn; - int x86_64, sign; + int x86_64, sign, ret; unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0, rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0, sib = 0; @@ -101,10 +101,9 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, if (x86_64 == -1) return -1; - insn_init(&insn, sec->data->d_buf + offset, maxlen, x86_64); - insn_get_length(&insn); - - if (!insn_complete(&insn)) { + ret = insn_decode(&insn, sec->data->d_buf + offset, maxlen, + x86_64 ? INSN_MODE_64 : INSN_MODE_32); + if (ret < 0) { WARN("can't decode instruction at %s:0x%lx", sec->name, offset); return -1; } -- cgit v1.2.3 From 62660b0fd238253aff951479a2adf1f06a231422 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Nov 2020 23:00:16 +0100 Subject: tools/perf: Convert to insn_decode() Simplify code, no functional changes. Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Link: https://lkml.kernel.org/r/20210304174237.31945-20-bp@alien8.de --- tools/perf/arch/x86/tests/insn-x86.c | 9 ++++----- tools/perf/arch/x86/util/archinsn.c | 9 +++++---- .../perf/util/intel-pt-decoder/intel-pt-insn-decoder.c | 17 ++++++++++------- 3 files changed, 19 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/x86/tests/insn-x86.c b/tools/perf/arch/x86/tests/insn-x86.c index 4f75ae990140..0262b0d8ccf5 100644 --- a/tools/perf/arch/x86/tests/insn-x86.c +++ b/tools/perf/arch/x86/tests/insn-x86.c @@ -96,13 +96,12 @@ static int get_branch(const char *branch_str) static int test_data_item(struct test_data *dat, int x86_64) { struct intel_pt_insn intel_pt_insn; + int op, branch, ret; struct insn insn; - int op, branch; - insn_init(&insn, dat->data, MAX_INSN_SIZE, x86_64); - insn_get_length(&insn); - - if (!insn_complete(&insn)) { + ret = insn_decode(&insn, dat->data, MAX_INSN_SIZE, + x86_64 ? INSN_MODE_64 : INSN_MODE_32); + if (ret < 0) { pr_debug("Failed to decode: %s\n", dat->asm_rep); return -1; } diff --git a/tools/perf/arch/x86/util/archinsn.c b/tools/perf/arch/x86/util/archinsn.c index 34d600c51044..546feda08428 100644 --- a/tools/perf/arch/x86/util/archinsn.c +++ b/tools/perf/arch/x86/util/archinsn.c @@ -11,7 +11,7 @@ void arch_fetch_insn(struct perf_sample *sample, struct machine *machine) { struct insn insn; - int len; + int len, ret; bool is64bit = false; if (!sample->ip) @@ -19,8 +19,9 @@ void arch_fetch_insn(struct perf_sample *sample, len = thread__memcpy(thread, machine, sample->insn, sample->ip, sizeof(sample->insn), &is64bit); if (len <= 0) return; - insn_init(&insn, sample->insn, len, is64bit); - insn_get_length(&insn); - if (insn_complete(&insn) && insn.length <= len) + + ret = insn_decode(&insn, sample->insn, len, + is64bit ? INSN_MODE_64 : INSN_MODE_32); + if (ret >= 0 && insn.length <= len) sample->insn_len = insn.length; } diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 2f6cc7eea251..593f20e9774c 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -169,11 +169,13 @@ int intel_pt_get_insn(const unsigned char *buf, size_t len, int x86_64, struct intel_pt_insn *intel_pt_insn) { struct insn insn; + int ret; - insn_init(&insn, buf, len, x86_64); - insn_get_length(&insn); - if (!insn_complete(&insn) || insn.length > len) + ret = insn_decode(&insn, buf, len, + x86_64 ? INSN_MODE_64 : INSN_MODE_32); + if (ret < 0 || insn.length > len) return -1; + intel_pt_insn_decoder(&insn, intel_pt_insn); if (insn.length < INTEL_PT_INSN_BUF_SZ) memcpy(intel_pt_insn->buf, buf, insn.length); @@ -194,12 +196,13 @@ const char *dump_insn(struct perf_insn *x, uint64_t ip __maybe_unused, u8 *inbuf, int inlen, int *lenp) { struct insn insn; - int n, i; + int n, i, ret; int left; - insn_init(&insn, inbuf, inlen, x->is64bit); - insn_get_length(&insn); - if (!insn_complete(&insn) || insn.length > inlen) + ret = insn_decode(&insn, inbuf, inlen, + x->is64bit ? INSN_MODE_64 : INSN_MODE_32); + + if (ret < 0 || insn.length > inlen) return ""; if (lenp) *lenp = insn.length; -- cgit v1.2.3 From 404b639e510b36136ef15b08ca8a022845ed87db Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 22 Nov 2020 18:12:37 +0100 Subject: x86/insn: Remove kernel_insn_init() Now that it is not needed anymore, drop it. Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210304174237.31945-21-bp@alien8.de --- arch/x86/include/asm/insn.h | 11 ----------- tools/arch/x86/include/asm/insn.h | 11 ----------- 2 files changed, 22 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index de9fe760c1e7..5eb3753dce33 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -159,17 +159,6 @@ static inline void insn_get_attribute(struct insn *insn) /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); -/* Init insn for kernel text */ -static inline void kernel_insn_init(struct insn *insn, - const void *kaddr, int buf_len) -{ -#ifdef CONFIG_X86_64 - insn_init(insn, kaddr, buf_len, 1); -#else /* CONFIG_X86_32 */ - insn_init(insn, kaddr, buf_len, 0); -#endif -} - static inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 2fe19b1a8765..5aae785003dc 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -159,17 +159,6 @@ static inline void insn_get_attribute(struct insn *insn) /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); -/* Init insn for kernel text */ -static inline void kernel_insn_init(struct insn *insn, - const void *kaddr, int buf_len) -{ -#ifdef CONFIG_X86_64 - insn_init(insn, kaddr, buf_len, 1); -#else /* CONFIG_X86_32 */ - insn_init(insn, kaddr, buf_len, 0); -#endif -} - static inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) -- cgit v1.2.3 From f935178b5c1c32ff803b15892a8ba85a1280cb01 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Nov 2020 23:19:03 +0100 Subject: x86/insn: Make insn_complete() static ... and move it above the only place it is used. Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210304174237.31945-22-bp@alien8.de --- arch/x86/include/asm/insn.h | 7 ------- arch/x86/lib/insn.c | 7 +++++++ tools/arch/x86/include/asm/insn.h | 7 ------- tools/arch/x86/lib/insn.c | 7 +++++++ 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 5eb3753dce33..f03b6ca7dec6 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -178,13 +178,6 @@ static inline int insn_has_emulate_prefix(struct insn *insn) return !!insn->emulate_prefix_size; } -/* Ensure this instruction is decoded completely */ -static inline int insn_complete(struct insn *insn) -{ - return insn->opcode.got && insn->modrm.got && insn->sib.got && - insn->displacement.got && insn->immediate.got; -} - static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index bb58004497f8..058f19b20465 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -714,6 +714,13 @@ int insn_get_length(struct insn *insn) return 0; } +/* Ensure this instruction is decoded completely */ +static inline int insn_complete(struct insn *insn) +{ + return insn->opcode.got && insn->modrm.got && insn->sib.got && + insn->displacement.got && insn->immediate.got; +} + /** * insn_decode() - Decode an x86 instruction * @insn: &struct insn to be initialized diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 5aae785003dc..c9f3eeebb53b 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -178,13 +178,6 @@ static inline int insn_has_emulate_prefix(struct insn *insn) return !!insn->emulate_prefix_size; } -/* Ensure this instruction is decoded completely */ -static inline int insn_complete(struct insn *insn) -{ - return insn->opcode.got && insn->modrm.got && insn->sib.got && - insn->displacement.got && insn->immediate.got; -} - static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index be2b057f91d4..cd4dedde3265 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -714,6 +714,13 @@ int insn_get_length(struct insn *insn) return 0; } +/* Ensure this instruction is decoded completely */ +static inline int insn_complete(struct insn *insn) +{ + return insn->opcode.got && insn->modrm.got && insn->sib.got && + insn->displacement.got && insn->immediate.got; +} + /** * insn_decode() - Decode an x86 instruction * @insn: &struct insn to be initialized -- cgit v1.2.3 From 2d4177c01b4e7496c7d47b31865f8c85bffb3604 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 13 Mar 2021 14:56:16 +0100 Subject: tools/x86/kcpuid: Add AMD Secure Encryption leaf Add the 0x8000001f leaf's fields. Signed-off-by: Borislav Petkov Acked-by: Feng Tang Link: https://lkml.kernel.org/r/20210313140118.17010-1-bp@alien8.de --- tools/arch/x86/kcpuid/cpuid.csv | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/cpuid.csv b/tools/arch/x86/kcpuid/cpuid.csv index f4a5b85073f4..dd94c07421a8 100644 --- a/tools/arch/x86/kcpuid/cpuid.csv +++ b/tools/arch/x86/kcpuid/cpuid.csv @@ -378,3 +378,13 @@ 0x80000008, 0, EAX, 7:0, phy_adr_bits, Physical Address Bits 0x80000008, 0, EAX, 15:8, lnr_adr_bits, Linear Address Bits 0x80000007, 0, EBX, 9, wbnoinvd, WBNOINVD + +# 8000001F: AMD Secure Encryption +0x8000001F, 0, EAX, 0, sme, Secure Memory Encryption +0x8000001F, 0, EAX, 1, sev, Secure Encrypted Virtualization +0x8000001F, 0, EAX, 2, vmpgflush, VM Page Flush MSR +0x8000001F, 0, EAX, 3, seves, SEV Encrypted State +0x8000001F, 0, EBX, 5:0, c-bit, Page table bit number used to enable memory encryption +0x8000001F, 0, EBX, 11:6, mem_encrypt_physaddr_width, Reduction of physical address space in bits with SME enabled +0x8000001F, 0, ECX, 31:0, num_encrypted_guests, Maximum ASID value that may be used for an SEV-enabled guest +0x8000001F, 0, EDX, 31:0, minimum_sev_asid, Minimum ASID value that must be used for an SEV-enabled, SEV-ES-disabled guest -- cgit v1.2.3 From a7672d1df5737009bd5339b80da429da7ceb9964 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 15 Mar 2021 10:13:22 -0300 Subject: perf evlist: Change the COMM when preparing the workload It was reported that --exclude-perf wasn't working, as tracepoints were appearing in 'perf script' output as having the 'perf' COMM, that is just the window in evlist__prepare_workload() after the fork() and before the execvp() call for workloads specified in the command line. Example: # perf record -e kmem:kmalloc --filter 'bytes_alloc<650 && bytes_alloc>620' --exclude-perf -e kmem:kfree --exclude-perf -aR sleep 30 Then: # perf script perf 15905 [009] 1498.356094: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=(nil) perf 15905 [009] 1498.356116: kmem:kfree: call_site=free_bprm+0x8f ptr=(nil) perf 15905 [009] 1498.356116: kmem:kfree: call_site=do_execveat_common+0x19d ptr=0xffff9cf750421c00 perf 15905 [009] 1498.356138: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=(nil) perf 15905 [009] 1498.356148: kmem:kfree: call_site=free_bprm+0x8f ptr=(nil) perf 15905 [009] 1498.356148: kmem:kfree: call_site=do_execveat_common+0x19d ptr=0xffff9cf750421c00 perf 15905 [009] 1498.356168: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=(nil) perf 15905 [009] 1498.356176: kmem:kfree: call_site=free_bprm+0x8f ptr=(nil) perf 15905 [009] 1498.356348: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=(nil) perf 15905 [014] 1498.356386: kmem:kfree: call_site=security_compute_sid.part.0+0x3b2 ptr=(nil) perf 15905 [014] 1498.356423: kmem:kfree: call_site=load_elf_binary+0x207 ptr=0xffff9cf5b2a34220 perf 15905 [014] 1498.356694: kmem:kfree: call_site=__free_slab+0xb5 ptr=0xffff9cf6d0b3b000 sleep 15905 [014] 1498.356739: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=(nil) Use prctl() to show that that is just the preparation of the workload: # perf script perf-exec 19036 [009] 2199.357582: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=(nil) perf-exec 19036 [009] 2199.357604: kmem:kfree: call_site=free_bprm+0x8f ptr=(nil) perf-exec 19036 [009] 2199.357604: kmem:kfree: call_site=do_execveat_common+0x19d ptr=0xffff9cf786459800 perf-exec 19036 [009] 2199.357630: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=(nil) perf-exec 19036 [000] 2199.358277: kmem:kfree: call_site=__free_slab+0xb5 ptr=0xffff9cf786fb9c00 perf-exec 19036 [000] 2199.358278: kmem:kfree: call_site=__free_slab+0xb5 ptr=0xffff9cf786458200 perf-exec 19036 [000] 2199.358279: kmem:kfree: call_site=__free_slab+0xb5 ptr=0xffff9cf786458600 sleep 19036 [000] 2199.358316: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=(nil) sleep 19036 [000] 2199.358323: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=(nil) sleep 19036 [000] 2199.358330: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=0xffff9cf58be2d000 sleep 19036 [000] 2199.358337: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=0xffff9cf58be2d000 sleep 19036 [000] 2199.358339: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=0xffff9cf58be2d000 sleep 19036 [000] 2199.358341: kmem:kfree: call_site=perf_event_mmap+0x279 ptr=0xffff9cf58be2d000 Reporter: zhanweiw Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212213 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 882cd1f721d9..435bbfd00551 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -1405,6 +1406,13 @@ int evlist__prepare_workload(struct evlist *evlist, struct target *target, const close(go_pipe[1]); fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC); + /* + * Change the name of this process not to confuse --exclude-perf users + * that sees 'perf' in the window up to the execvp() and thinks that + * perf samples are not being excluded. + */ + prctl(PR_SET_NAME, "perf-exec"); + /* * Tell the parent we're ready to go */ -- cgit v1.2.3 From 8efd1634542d9255023d7c2ec2194e8908450b12 Mon Sep 17 00:00:00 2001 From: Shunsuke Nakamura Date: Mon, 8 Mar 2021 19:53:40 +0900 Subject: perf vendor events arm64: Add more common and uarch events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the following events.[1] Common architectural events: - L2I_TLB_REFILL - L2I_TLB - SIMD_INST_RETIRED - SVE_INST_RETIRED Common microarchitectural events: - UOP_SPEC - SVE_MATH_SPEC - FP_SPEC - FP_FMA_SPEC - FP_RECPE_SPEC - FP_CVT_SPEC - ASE_SVE_INT_SPEC - SVE_PRED_SPEC - SVE_MOVPRFX_SPEC - SVE_MOVPRFX_U_SPEC - ASE_SVE_LD_SPEC - ASE_SVE_ST_SPEC - PRF_SPEC - BASE_LD_REG_SPEC - BASE_ST_REG_SPEC - SVE_LDR_REG_SPEC - SVE_STR_REG_SPEC - SVE_LDR_PREG_SPEC - SVE_STR_PREG_SPEC - SVE_PRF_CONTIG_SPEC - ASE_SVE_LD_MULTI_SPEC - ASE_SVE_ST_MULTI_SPEC - SVE_LD_GATHER_SPEC - SVE_ST_SCATTER_SPEC - SVE_PRF_GATHER_SPEC - SVE_LDFF_SPEC - FP_SCALE_OPS_SPEC - FP_FIXED_OPS_SPEC - FP_HP_SCALE_OPS_SPEC - FP_HP_FIXED_OPS_SPEC - FP_SP_SCALE_OPS_SPEC - FP_SP_FIXED_OPS_SPEC - FP_DP_SCALE_OPS_SPEC - FP_DP_FIXED_OPS_SPEC Reference document is at the following: [1] https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_PMU_Events_v1.2.pdf Signed-off-by: Nakamura, Shunsuke/中村 俊介 Reviewed-by: John Garry Tested-by: Masayoshi Mizuma Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: http://lore.kernel.org/lkml/20210308105342.746940-2-nakamura.shun@fujitsu.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/arm64/armv8-common-and-microarch.json | 228 +++++++++++++++++++++ 1 file changed, 228 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json index 75376c7cc072..913fb200ea52 100644 --- a/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json +++ b/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json @@ -209,12 +209,24 @@ "EventName": "L2D_TLB_REFILL", "BriefDescription": "Attributable Level 2 data TLB refill" }, + { + "PublicDescription": "Attributable Level 2 instruction TLB refill.", + "EventCode": "0x2E", + "EventName": "L2I_TLB_REFILL", + "BriefDescription": "Attributable Level 2 instruction TLB refill." + }, { "PublicDescription": "Attributable Level 2 data or unified TLB access", "EventCode": "0x2F", "EventName": "L2D_TLB", "BriefDescription": "Attributable Level 2 data or unified TLB access" }, + { + "PublicDescription": "Attributable Level 2 instruction TLB access.", + "EventCode": "0x30", + "EventName": "L2I_TLB", + "BriefDescription": "Attributable Level 2 instruction TLB access." + }, { "PublicDescription": "Access to another socket in a multi-socket system", "EventCode": "0x31", @@ -244,5 +256,221 @@ "EventCode": "0x37", "EventName": "LL_CACHE_MISS_RD", "BriefDescription": "Last level cache miss, read" + }, + { + "PublicDescription": "SIMD Instruction architecturally executed.", + "EventCode": "0x8000", + "EventName": "SIMD_INST_RETIRED", + "BriefDescription": "SIMD Instruction architecturally executed." + }, + { + "PublicDescription": "Instruction architecturally executed, SVE.", + "EventCode": "0x8002", + "EventName": "SVE_INST_RETIRED", + "BriefDescription": "Instruction architecturally executed, SVE." + }, + { + "PublicDescription": "Microarchitectural operation, Operations speculatively executed.", + "EventCode": "0x8008", + "EventName": "UOP_SPEC", + "BriefDescription": "Microarchitectural operation, Operations speculatively executed." + }, + { + "PublicDescription": "SVE Math accelerator Operations speculatively executed.", + "EventCode": "0x800E", + "EventName": "SVE_MATH_SPEC", + "BriefDescription": "SVE Math accelerator Operations speculatively executed." + }, + { + "PublicDescription": "Floating-point Operations speculatively executed.", + "EventCode": "0x8010", + "EventName": "FP_SPEC", + "BriefDescription": "Floating-point Operations speculatively executed." + }, + { + "PublicDescription": "Floating-point FMA Operations speculatively executed.", + "EventCode": "0x8028", + "EventName": "FP_FMA_SPEC", + "BriefDescription": "Floating-point FMA Operations speculatively executed." + }, + { + "PublicDescription": "Floating-point reciprocal estimate Operations speculatively executed.", + "EventCode": "0x8034", + "EventName": "FP_RECPE_SPEC", + "BriefDescription": "Floating-point reciprocal estimate Operations speculatively executed." + }, + { + "PublicDescription": "floating-point convert Operations speculatively executed.", + "EventCode": "0x8038", + "EventName": "FP_CVT_SPEC", + "BriefDescription": "floating-point convert Operations speculatively executed." + }, + { + "PublicDescription": "Advanced SIMD and SVE integer Operations speculatively executed.", + "EventCode": "0x8043", + "EventName": "ASE_SVE_INT_SPEC", + "BriefDescription": "Advanced SIMD and SVE integer Operations speculatively executed." + }, + { + "PublicDescription": "SVE predicated Operations speculatively executed.", + "EventCode": "0x8074", + "EventName": "SVE_PRED_SPEC", + "BriefDescription": "SVE predicated Operations speculatively executed." + }, + { + "PublicDescription": "SVE MOVPRFX Operations speculatively executed.", + "EventCode": "0x807C", + "EventName": "SVE_MOVPRFX_SPEC", + "BriefDescription": "SVE MOVPRFX Operations speculatively executed." + }, + { + "PublicDescription": "SVE MOVPRFX unfused Operations speculatively executed.", + "EventCode": "0x807F", + "EventName": "SVE_MOVPRFX_U_SPEC", + "BriefDescription": "SVE MOVPRFX unfused Operations speculatively executed." + }, + { + "PublicDescription": "Advanced SIMD and SVE load Operations speculatively executed.", + "EventCode": "0x8085", + "EventName": "ASE_SVE_LD_SPEC", + "BriefDescription": "Advanced SIMD and SVE load Operations speculatively executed." + }, + { + "PublicDescription": "Advanced SIMD and SVE store Operations speculatively executed.", + "EventCode": "0x8086", + "EventName": "ASE_SVE_ST_SPEC", + "BriefDescription": "Advanced SIMD and SVE store Operations speculatively executed." + }, + { + "PublicDescription": "Prefetch Operations speculatively executed.", + "EventCode": "0x8087", + "EventName": "PRF_SPEC", + "BriefDescription": "Prefetch Operations speculatively executed." + }, + { + "PublicDescription": "General-purpose register load Operations speculatively executed.", + "EventCode": "0x8089", + "EventName": "BASE_LD_REG_SPEC", + "BriefDescription": "General-purpose register load Operations speculatively executed." + }, + { + "PublicDescription": "General-purpose register store Operations speculatively executed.", + "EventCode": "0x808A", + "EventName": "BASE_ST_REG_SPEC", + "BriefDescription": "General-purpose register store Operations speculatively executed." + }, + { + "PublicDescription": "SVE unpredicated load register Operations speculatively executed.", + "EventCode": "0x8091", + "EventName": "SVE_LDR_REG_SPEC", + "BriefDescription": "SVE unpredicated load register Operations speculatively executed." + }, + { + "PublicDescription": "SVE unpredicated store register Operations speculatively executed.", + "EventCode": "0x8092", + "EventName": "SVE_STR_REG_SPEC", + "BriefDescription": "SVE unpredicated store register Operations speculatively executed." + }, + { + "PublicDescription": "SVE load predicate register Operations speculatively executed.", + "EventCode": "0x8095", + "EventName": "SVE_LDR_PREG_SPEC", + "BriefDescription": "SVE load predicate register Operations speculatively executed." + }, + { + "PublicDescription": "SVE store predicate register Operations speculatively executed.", + "EventCode": "0x8096", + "EventName": "SVE_STR_PREG_SPEC", + "BriefDescription": "SVE store predicate register Operations speculatively executed." + }, + { + "PublicDescription": "SVE contiguous prefetch element Operations speculatively executed.", + "EventCode": "0x809F", + "EventName": "SVE_PRF_CONTIG_SPEC", + "BriefDescription": "SVE contiguous prefetch element Operations speculatively executed." + }, + { + "PublicDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed.", + "EventCode": "0x80A5", + "EventName": "ASE_SVE_LD_MULTI_SPEC", + "BriefDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed." + }, + { + "PublicDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed.", + "EventCode": "0x80A6", + "EventName": "ASE_SVE_ST_MULTI_SPEC", + "BriefDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed." + }, + { + "PublicDescription": "SVE gather-load Operations speculatively executed.", + "EventCode": "0x80AD", + "EventName": "SVE_LD_GATHER_SPEC", + "BriefDescription": "SVE gather-load Operations speculatively executed." + }, + { + "PublicDescription": "SVE scatter-store Operations speculatively executed.", + "EventCode": "0x80AE", + "EventName": "SVE_ST_SCATTER_SPEC", + "BriefDescription": "SVE scatter-store Operations speculatively executed." + }, + { + "PublicDescription": "SVE gather-prefetch Operations speculatively executed.", + "EventCode": "0x80AF", + "EventName": "SVE_PRF_GATHER_SPEC", + "BriefDescription": "SVE gather-prefetch Operations speculatively executed." + }, + { + "PublicDescription": "SVE First-fault load Operations speculatively executed.", + "EventCode": "0x80BC", + "EventName": "SVE_LDFF_SPEC", + "BriefDescription": "SVE First-fault load Operations speculatively executed." + }, + { + "PublicDescription": "Scalable floating-point element Operations speculatively executed.", + "EventCode": "0x80C0", + "EventName": "FP_SCALE_OPS_SPEC", + "BriefDescription": "Scalable floating-point element Operations speculatively executed." + }, + { + "PublicDescription": "Non-scalable floating-point element Operations speculatively executed.", + "EventCode": "0x80C1", + "EventName": "FP_FIXED_OPS_SPEC", + "BriefDescription": "Non-scalable floating-point element Operations speculatively executed." + }, + { + "PublicDescription": "Scalable half-precision floating-point element Operations speculatively executed.", + "EventCode": "0x80C2", + "EventName": "FP_HP_SCALE_OPS_SPEC", + "BriefDescription": "Scalable half-precision floating-point element Operations speculatively executed." + }, + { + "PublicDescription": "Non-scalable half-precision floating-point element Operations speculatively executed.", + "EventCode": "0x80C3", + "EventName": "FP_HP_FIXED_OPS_SPEC", + "BriefDescription": "Non-scalable half-precision floating-point element Operations speculatively executed." + }, + { + "PublicDescription": "Scalable single-precision floating-point element Operations speculatively executed.", + "EventCode": "0x80C4", + "EventName": "FP_SP_SCALE_OPS_SPEC", + "BriefDescription": "Scalable single-precision floating-point element Operations speculatively executed." + }, + { + "PublicDescription": "Non-scalable single-precision floating-point element Operations speculatively executed.", + "EventCode": "0x80C5", + "EventName": "FP_SP_FIXED_OPS_SPEC", + "BriefDescription": "Non-scalable single-precision floating-point element Operations speculatively executed." + }, + { + "PublicDescription": "Scalable double-precision floating-point element Operations speculatively executed.", + "EventCode": "0x80C6", + "EventName": "FP_DP_SCALE_OPS_SPEC", + "BriefDescription": "Scalable double-precision floating-point element Operations speculatively executed." + }, + { + "PublicDescription": "Non-scalable double-precision floating-point element Operations speculatively executed.", + "EventCode": "0x80C7", + "EventName": "FP_DP_FIXED_OPS_SPEC", + "BriefDescription": "Non-scalable double-precision floating-point element Operations speculatively executed." } ] -- cgit v1.2.3 From 5497b23e870c45486e8caf1116ccbb592443bff3 Mon Sep 17 00:00:00 2001 From: Shunsuke Nakamura Date: Mon, 8 Mar 2021 19:53:41 +0900 Subject: perf vendor events arm64: Add Fujitsu A64FX pmu event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add pmu events for A64FX. Documentation source: https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_PMU_Events_v1.2.pdf Signed-off-by: Nakamura, Shunsuke/中村 俊介 Reviewed-by: John Garry Tested-by: Masayoshi Mizuma Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: http://lore.kernel.org/lkml/20210308105342.746940-3-nakamura.shun@fujitsu.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/arm64/fujitsu/a64fx/branch.json | 8 + .../pmu-events/arch/arm64/fujitsu/a64fx/bus.json | 62 +++++++ .../pmu-events/arch/arm64/fujitsu/a64fx/cache.json | 128 ++++++++++++++ .../pmu-events/arch/arm64/fujitsu/a64fx/cycle.json | 5 + .../arch/arm64/fujitsu/a64fx/exception.json | 29 +++ .../arch/arm64/fujitsu/a64fx/instruction.json | 131 ++++++++++++++ .../arch/arm64/fujitsu/a64fx/memory.json | 8 + .../pmu-events/arch/arm64/fujitsu/a64fx/other.json | 188 ++++++++++++++++++++ .../arch/arm64/fujitsu/a64fx/pipeline.json | 194 +++++++++++++++++++++ .../pmu-events/arch/arm64/fujitsu/a64fx/sve.json | 110 ++++++++++++ tools/perf/pmu-events/arch/arm64/mapfile.csv | 1 + 11 files changed, 864 insertions(+) create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json new file mode 100644 index 000000000000..b011af11bf94 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json @@ -0,0 +1,8 @@ +[ + { + "ArchStdEvent": "BR_MIS_PRED" + }, + { + "ArchStdEvent": "BR_PRED" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json new file mode 100644 index 000000000000..084e88d7df73 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json @@ -0,0 +1,62 @@ +[ + { + "PublicDescription": "This event counts read transactions from tofu controller to measured CMG.", + "EventCode": "0x314", + "EventName": "BUS_READ_TOTAL_TOFU", + "BriefDescription": "This event counts read transactions from tofu controller to measured CMG." + }, + { + "PublicDescription": "This event counts read transactions from PCI controller to measured CMG.", + "EventCode": "0x315", + "EventName": "BUS_READ_TOTAL_PCI", + "BriefDescription": "This event counts read transactions from PCI controller to measured CMG." + }, + { + "PublicDescription": "This event counts read transactions from measured CMG local memory to measured CMG.", + "EventCode": "0x316", + "EventName": "BUS_READ_TOTAL_MEM", + "BriefDescription": "This event counts read transactions from measured CMG local memory to measured CMG." + }, + { + "PublicDescription": "This event counts write transactions from measured CMG to CMG0, if measured CMG is not CMG0.", + "EventCode": "0x318", + "EventName": "BUS_WRITE_TOTAL_CMG0", + "BriefDescription": "This event counts write transactions from measured CMG to CMG0, if measured CMG is not CMG0." + }, + { + "PublicDescription": "This event counts write transactions from measured CMG to CMG1, if measured CMG is not CMG1.", + "EventCode": "0x319", + "EventName": "BUS_WRITE_TOTAL_CMG1", + "BriefDescription": "This event counts write transactions from measured CMG to CMG1, if measured CMG is not CMG1." + }, + { + "PublicDescription": "This event counts write transactions from measured CMG to CMG2, if measured CMG is not CMG2.", + "EventCode": "0x31A", + "EventName": "BUS_WRITE_TOTAL_CMG2", + "BriefDescription": "This event counts write transactions from measured CMG to CMG2, if measured CMG is not CMG2." + }, + { + "PublicDescription": "This event counts write transactions from measured CMG to CMG3, if measured CMG is not CMG3.", + "EventCode": "0x31B", + "EventName": "BUS_WRITE_TOTAL_CMG3", + "BriefDescription": "This event counts write transactions from measured CMG to CMG3, if measured CMG is not CMG3." + }, + { + "PublicDescription": "This event counts write transactions from measured CMG to tofu controller.", + "EventCode": "0x31C", + "EventName": "BUS_WRITE_TOTAL_TOFU", + "BriefDescription": "This event counts write transactions from measured CMG to tofu controller." + }, + { + "PublicDescription": "This event counts write transactions from measured CMG to PCI controller.", + "EventCode": "0x31D", + "EventName": "BUS_WRITE_TOTAL_PCI", + "BriefDescription": "This event counts write transactions from measured CMG to PCI controller." + }, + { + "PublicDescription": "This event counts write transactions from measured CMG to measured CMG local memory.", + "EventCode": "0x31E", + "EventName": "BUS_WRITE_TOTAL_MEM", + "BriefDescription": "This event counts write transactions from measured CMG to measured CMG local memory." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json new file mode 100644 index 000000000000..2e341a951a10 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json @@ -0,0 +1,128 @@ +[ + { + "ArchStdEvent": "L1I_CACHE_REFILL" + }, + { + "ArchStdEvent": "L1I_TLB_REFILL" + }, + { + "ArchStdEvent": "L1D_CACHE_REFILL" + }, + { + "ArchStdEvent": "L1D_CACHE" + }, + { + "ArchStdEvent": "L1D_TLB_REFILL" + }, + { + "ArchStdEvent": "L1I_CACHE" + }, + { + "ArchStdEvent": "L1D_CACHE_WB" + }, + { + "ArchStdEvent": "L2D_CACHE" + }, + { + "ArchStdEvent": "L2D_CACHE_REFILL" + }, + { + "ArchStdEvent": "L2D_CACHE_WB" + }, + { + "ArchStdEvent": "L2D_TLB_REFILL" + }, + { + "ArchStdEvent": "L2I_TLB_REFILL" + }, + { + "ArchStdEvent": "L2D_TLB" + }, + { + "ArchStdEvent": "L2I_TLB" + }, + { + "PublicDescription": "This event counts L1D_CACHE_REFILL caused by software or hardware prefetch.", + "EventCode": "0x49", + "EventName": "L1D_CACHE_REFILL_PRF", + "BriefDescription": "This event counts L1D_CACHE_REFILL caused by software or hardware prefetch." + }, + { + "PublicDescription": "This event counts L2D_CACHE_REFILL caused by software or hardware prefetch.", + "EventCode": "0x59", + "EventName": "L2D_CACHE_REFILL_PRF", + "BriefDescription": "This event counts L2D_CACHE_REFILL caused by software or hardware prefetch." + }, + { + "PublicDescription": "This event counts L1D_CACHE_REFILL caused by demand access.", + "EventCode": "0x200", + "EventName": "L1D_CACHE_REFILL_DM", + "BriefDescription": "This event counts L1D_CACHE_REFILL caused by demand access." + }, + { + "PublicDescription": "This event counts L1D_CACHE_REFILL caused by hardware prefetch.", + "EventCode": "0x202", + "EventName": "L1D_CACHE_REFILL_HWPRF", + "BriefDescription": "This event counts L1D_CACHE_REFILL caused by hardware prefetch." + }, + { + "PublicDescription": "This event counts outstanding L1D cache miss requests per cycle.", + "EventCode": "0x208", + "EventName": "L1_MISS_WAIT", + "BriefDescription": "This event counts outstanding L1D cache miss requests per cycle." + }, + { + "PublicDescription": "This event counts outstanding L1I cache miss requests per cycle.", + "EventCode": "0x209", + "EventName": "L1I_MISS_WAIT", + "BriefDescription": "This event counts outstanding L1I cache miss requests per cycle." + }, + { + "PublicDescription": "This event counts L2D_CACHE_REFILL caused by demand access.", + "EventCode": "0x300", + "EventName": "L2D_CACHE_REFILL_DM", + "BriefDescription": "This event counts L2D_CACHE_REFILL caused by demand access." + }, + { + "PublicDescription": "This event counts L2D_CACHE_REFILL caused by hardware prefetch.", + "EventCode": "0x302", + "EventName": "L2D_CACHE_REFILL_HWPRF", + "BriefDescription": "This event counts L2D_CACHE_REFILL caused by hardware prefetch." + }, + { + "PublicDescription": "This event counts outstanding L2 cache miss requests per cycle.", + "EventCode": "0x308", + "EventName": "L2_MISS_WAIT", + "BriefDescription": "This event counts outstanding L2 cache miss requests per cycle." + }, + { + "PublicDescription": "This event counts the number of times of L2 cache miss.", + "EventCode": "0x309", + "EventName": "L2_MISS_COUNT", + "BriefDescription": "This event counts the number of times of L2 cache miss." + }, + { + "PublicDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch.", + "EventCode": "0x325", + "EventName": "L2D_SWAP_DM", + "BriefDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch." + }, + { + "PublicDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access.", + "EventCode": "0x326", + "EventName": "L2D_CACHE_MIBMCH_PRF", + "BriefDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access." + }, + { + "PublicDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch.", + "EventCode": "0x396", + "EventName": "L2D_CACHE_SWAP_LOCAL", + "BriefDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch." + }, + { + "PublicDescription": "This event counts energy consumption per cycle of L2 cache.", + "EventCode": "0x3E0", + "EventName": "EA_L2", + "BriefDescription": "This event counts energy consumption per cycle of L2 cache." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json new file mode 100644 index 000000000000..b16484628290 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json @@ -0,0 +1,5 @@ +[ + { + "ArchStdEvent": "CPU_CYCLES" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json new file mode 100644 index 000000000000..348749c154c0 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json @@ -0,0 +1,29 @@ +[ + { + "ArchStdEvent": "EXC_TAKEN" + }, + { + "ArchStdEvent": "EXC_UNDEF" + }, + { + "ArchStdEvent": "EXC_SVC" + }, + { + "ArchStdEvent": "EXC_PABORT" + }, + { + "ArchStdEvent": "EXC_DABORT" + }, + { + "ArchStdEvent": "EXC_IRQ" + }, + { + "ArchStdEvent": "EXC_FIQ" + }, + { + "ArchStdEvent": "EXC_SMC" + }, + { + "ArchStdEvent": "EXC_HVC" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json new file mode 100644 index 000000000000..6d258b1080cf --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json @@ -0,0 +1,131 @@ +[ + { + "ArchStdEvent": "SW_INCR" + }, + { + "ArchStdEvent": "INST_RETIRED" + }, + { + "ArchStdEvent": "EXC_RETURN" + }, + { + "ArchStdEvent": "CID_WRITE_RETIRED" + }, + { + "ArchStdEvent": "INST_SPEC" + }, + { + "ArchStdEvent": "LDREX_SPEC" + }, + { + "ArchStdEvent": "STREX_SPEC" + }, + { + "ArchStdEvent": "LD_SPEC" + }, + { + "ArchStdEvent": "ST_SPEC" + }, + { + "ArchStdEvent": "LDST_SPEC" + }, + { + "ArchStdEvent": "DP_SPEC" + }, + { + "ArchStdEvent": "ASE_SPEC" + }, + { + "ArchStdEvent": "VFP_SPEC" + }, + { + "ArchStdEvent": "PC_WRITE_SPEC" + }, + { + "ArchStdEvent": "CRYPTO_SPEC" + }, + { + "ArchStdEvent": "BR_IMMED_SPEC" + }, + { + "ArchStdEvent": "BR_RETURN_SPEC" + }, + { + "ArchStdEvent": "BR_INDIRECT_SPEC" + }, + { + "ArchStdEvent": "ISB_SPEC" + }, + { + "ArchStdEvent": "DSB_SPEC" + }, + { + "ArchStdEvent": "DMB_SPEC" + }, + { + "PublicDescription": "This event counts architecturally executed zero blocking operations due to the 'DC ZVA' instruction.", + "EventCode": "0x9F", + "EventName": "DCZVA_SPEC", + "BriefDescription": "This event counts architecturally executed zero blocking operations due to the 'DC ZVA' instruction." + }, + { + "PublicDescription": "This event counts architecturally executed floating-point move operations.", + "EventCode": "0x105", + "EventName": "FP_MV_SPEC", + "BriefDescription": "This event counts architecturally executed floating-point move operations." + }, + { + "PublicDescription": "This event counts architecturally executed operations that using predicate register.", + "EventCode": "0x108", + "EventName": "PRD_SPEC", + "BriefDescription": "This event counts architecturally executed operations that using predicate register." + }, + { + "PublicDescription": "This event counts architecturally executed inter-element manipulation operations.", + "EventCode": "0x109", + "EventName": "IEL_SPEC", + "BriefDescription": "This event counts architecturally executed inter-element manipulation operations." + }, + { + "PublicDescription": "This event counts architecturally executed inter-register manipulation operations.", + "EventCode": "0x10A", + "EventName": "IREG_SPEC", + "BriefDescription": "This event counts architecturally executed inter-register manipulation operations." + }, + { + "PublicDescription": "This event counts architecturally executed NOSIMD load operations that using SIMD&FP registers.", + "EventCode": "0x112", + "EventName": "FP_LD_SPEC", + "BriefDescription": "This event counts architecturally executed NOSIMD load operations that using SIMD&FP registers." + }, + { + "PublicDescription": "This event counts architecturally executed NOSIMD store operations that using SIMD&FP registers.", + "EventCode": "0x113", + "EventName": "FP_ST_SPEC", + "BriefDescription": "This event counts architecturally executed NOSIMD store operations that using SIMD&FP registers." + }, + { + "PublicDescription": "This event counts architecturally executed SIMD broadcast floating-point load operations.", + "EventCode": "0x11A", + "EventName": "BC_LD_SPEC", + "BriefDescription": "This event counts architecturally executed SIMD broadcast floating-point load operations." + }, + { + "PublicDescription": "This event counts architecturally executed instructions, excluding the MOVPRFX instruction.", + "EventCode": "0x121", + "EventName": "EFFECTIVE_INST_SPEC", + "BriefDescription": "This event counts architecturally executed instructions, excluding the MOVPRFX instruction." + }, + { + "PublicDescription": "This event counts architecturally executed operations that uses 'pre-index' as its addressing mode.", + "EventCode": "0x123", + "EventName": "PRE_INDEX_SPEC", + "BriefDescription": "This event counts architecturally executed operations that uses 'pre-index' as its addressing mode." + }, + { + "PublicDescription": "This event counts architecturally executed operations that uses 'post-index' as its addressing mode.", + "EventCode": "0x124", + "EventName": "POST_INDEX_SPEC", + "BriefDescription": "This event counts architecturally executed operations that uses 'post-index' as its addressing mode." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json new file mode 100644 index 000000000000..c1f6479e92b4 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json @@ -0,0 +1,8 @@ +[ + { + "PublicDescription": "This event counts energy consumption per cycle of CMG local memory.", + "EventCode": "0x3E8", + "EventName": "EA_MEMORY", + "BriefDescription": "This event counts energy consumption per cycle of CMG local memory." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json new file mode 100644 index 000000000000..10c823ac26cc --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json @@ -0,0 +1,188 @@ +[ + { + "PublicDescription": "This event counts the occurrence count of the micro-operation split.", + "EventCode": "0x139", + "EventName": "UOP_SPLIT", + "BriefDescription": "This event counts the occurrence count of the micro-operation split." + }, + { + "PublicDescription": "This event counts every cycle that no operation was committed because the oldest and uncommitted load/store/prefetch operation waits for memory access.", + "EventCode": "0x180", + "EventName": "LD_COMP_WAIT_L2_MISS", + "BriefDescription": "This event counts every cycle that no operation was committed because the oldest and uncommitted load/store/prefetch operation waits for memory access." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for memory access.", + "EventCode": "0x181", + "EventName": "LD_COMP_WAIT_L2_MISS_EX", + "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for memory access." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L2 cache access.", + "EventCode": "0x182", + "EventName": "LD_COMP_WAIT_L1_MISS", + "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L2 cache access." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L2 cache access.", + "EventCode": "0x183", + "EventName": "LD_COMP_WAIT_L1_MISS_EX", + "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L2 cache access." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L1D cache, L2 cache and memory access.", + "EventCode": "0x184", + "EventName": "LD_COMP_WAIT", + "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L1D cache, L2 cache and memory access." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L1D cache, L2 cache and memory access.", + "EventCode": "0x185", + "EventName": "LD_COMP_WAIT_EX", + "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L1D cache, L2 cache and memory access." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed due to the lack of an available prefetch port.", + "EventCode": "0x186", + "EventName": "LD_COMP_WAIT_PFP_BUSY", + "BriefDescription": "This event counts every cycle that no instruction was committed due to the lack of an available prefetch port." + }, + { + "PublicDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by an integer load operation.", + "EventCode": "0x187", + "EventName": "LD_COMP_WAIT_PFP_BUSY_EX", + "BriefDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by an integer load operation." + }, + { + "PublicDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by a software prefetch instruction.", + "EventCode": "0x188", + "EventName": "LD_COMP_WAIT_PFP_BUSY_SWPF", + "BriefDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by a software prefetch instruction." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is an integer or floating-point/SIMD instruction.", + "EventCode": "0x189", + "EventName": "EU_COMP_WAIT", + "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is an integer or floating-point/SIMD instruction." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a floating-point/SIMD instruction.", + "EventCode": "0x18A", + "EventName": "FL_COMP_WAIT", + "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a floating-point/SIMD instruction." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a branch instruction.", + "EventCode": "0x18B", + "EventName": "BR_COMP_WAIT", + "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a branch instruction." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed because the CSE is empty.", + "EventCode": "0x18C", + "EventName": "ROB_EMPTY", + "BriefDescription": "This event counts every cycle that no instruction was committed because the CSE is empty." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed because the CSE is empty and the store port (SP) is full.", + "EventCode": "0x18D", + "EventName": "ROB_EMPTY_STQ_BUSY", + "BriefDescription": "This event counts every cycle that no instruction was committed because the CSE is empty and the store port (SP) is full." + }, + { + "PublicDescription": "This event counts every cycle that the instruction unit is halted by the WFE/WFI instruction.", + "EventCode": "0x18E", + "EventName": "WFE_WFI_CYCLE", + "BriefDescription": "This event counts every cycle that the instruction unit is halted by the WFE/WFI instruction." + }, + { + "PublicDescription": "This event counts every cycle that no instruction was committed, but counts at the time when commits MOVPRFX only.", + "EventCode": "0x190", + "EventName": "_0INST_COMMIT", + "BriefDescription": "This event counts every cycle that no instruction was committed, but counts at the time when commits MOVPRFX only." + }, + { + "PublicDescription": "This event counts every cycle that one instruction is committed.", + "EventCode": "0x191", + "EventName": "_1INST_COMMIT", + "BriefDescription": "This event counts every cycle that one instruction is committed." + }, + { + "PublicDescription": "This event counts every cycle that two instructions are committed.", + "EventCode": "0x192", + "EventName": "_2INST_COMMIT", + "BriefDescription": "This event counts every cycle that two instructions are committed." + }, + { + "PublicDescription": "This event counts every cycle that three instructions are committed.", + "EventCode": "0x193", + "EventName": "_3INST_COMMIT", + "BriefDescription": "This event counts every cycle that three instructions are committed." + }, + { + "PublicDescription": "This event counts every cycle that four instructions are committed.", + "EventCode": "0x194", + "EventName": "_4INST_COMMIT", + "BriefDescription": "This event counts every cycle that four instructions are committed." + }, + { + "PublicDescription": "This event counts every cycle that only any micro-operations are committed.", + "EventCode": "0x198", + "EventName": "UOP_ONLY_COMMIT", + "BriefDescription": "This event counts every cycle that only any micro-operations are committed." + }, + { + "PublicDescription": "This event counts every cycle that only the MOVPRFX instruction is committed.", + "EventCode": "0x199", + "EventName": "SINGLE_MOVPRFX_COMMIT", + "BriefDescription": "This event counts every cycle that only the MOVPRFX instruction is committed." + }, + { + "PublicDescription": "This event counts energy consumption per cycle of core.", + "EventCode": "0x1E0", + "EventName": "EA_CORE", + "BriefDescription": "This event counts energy consumption per cycle of core." + }, + { + "PublicDescription": "This event counts streaming prefetch requests to L1D cache generated by hardware prefetcher.", + "EventCode": "0x230", + "EventName": "L1HWPF_STREAM_PF", + "BriefDescription": "This event counts streaming prefetch requests to L1D cache generated by hardware prefetcher." + }, + { + "PublicDescription": "This event counts allocation type prefetch injection requests to L1D cache generated by hardware prefetcher.", + "EventCode": "0x231", + "EventName": "L1HWPF_INJ_ALLOC_PF", + "BriefDescription": "This event counts allocation type prefetch injection requests to L1D cache generated by hardware prefetcher." + }, + { + "PublicDescription": "This event counts non-allocation type prefetch injection requests to L1D cache generated by hardware prefetcher.", + "EventCode": "0x232", + "EventName": "L1HWPF_INJ_NOALLOC_PF", + "BriefDescription": "This event counts non-allocation type prefetch injection requests to L1D cache generated by hardware prefetcher." + }, + { + "PublicDescription": "This event counts streaming prefetch requests to L2 cache generated by hardware prefecher.", + "EventCode": "0x233", + "EventName": "L2HWPF_STREAM_PF", + "BriefDescription": "This event counts streaming prefetch requests to L2 cache generated by hardware prefecher." + }, + { + "PublicDescription": "This event counts allocation type prefetch injection requests to L2 cache generated by hardware prefetcher.", + "EventCode": "0x234", + "EventName": "L2HWPF_INJ_ALLOC_PF", + "BriefDescription": "This event counts allocation type prefetch injection requests to L2 cache generated by hardware prefetcher." + }, + { + "PublicDescription": "This event counts non-allocation type prefetch injection requests to L2 cache generated by hardware prefetcher.", + "EventCode": "0x235", + "EventName": "L2HWPF_INJ_NOALLOC_PF", + "BriefDescription": "This event counts non-allocation type prefetch injection requests to L2 cache generated by hardware prefetcher." + }, + { + "PublicDescription": "This event counts prefetch requests to L2 cache generated by the other causes.", + "EventCode": "0x236", + "EventName": "L2HWPF_OTHER", + "BriefDescription": "This event counts prefetch requests to L2 cache generated by the other causes." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json new file mode 100644 index 000000000000..dd7c97a9972b --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json @@ -0,0 +1,194 @@ +[ + { + "ArchStdEvent": "STALL_FRONTEND" + }, + { + "ArchStdEvent": "STALL_BACKEND" + }, + { + "PublicDescription": "This event counts valid cycles of EAGA pipeline.", + "EventCode": "0x1A0", + "EventName": "EAGA_VAL", + "BriefDescription": "This event counts valid cycles of EAGA pipeline." + }, + { + "PublicDescription": "This event counts valid cycles of EAGB pipeline.", + "EventCode": "0x1A1", + "EventName": "EAGB_VAL", + "BriefDescription": "This event counts valid cycles of EAGB pipeline." + }, + { + "PublicDescription": "This event counts valid cycles of EXA pipeline.", + "EventCode": "0x1A2", + "EventName": "EXA_VAL", + "BriefDescription": "This event counts valid cycles of EXA pipeline." + }, + { + "PublicDescription": "This event counts valid cycles of EXB pipeline.", + "EventCode": "0x1A3", + "EventName": "EXB_VAL", + "BriefDescription": "This event counts valid cycles of EXB pipeline." + }, + { + "PublicDescription": "This event counts valid cycles of FLA pipeline.", + "EventCode": "0x1A4", + "EventName": "FLA_VAL", + "BriefDescription": "This event counts valid cycles of FLA pipeline." + }, + { + "PublicDescription": "This event counts valid cycles of FLB pipeline.", + "EventCode": "0x1A5", + "EventName": "FLB_VAL", + "BriefDescription": "This event counts valid cycles of FLB pipeline." + }, + { + "PublicDescription": "This event counts valid cycles of PRX pipeline.", + "EventCode": "0x1A6", + "EventName": "PRX_VAL", + "BriefDescription": "This event counts valid cycles of PRX pipeline." + }, + { + "PublicDescription": "This event counts the number of 1's in the predicate bits of request in FLA pipeline, where it is corrected so that it becomes 16 when all bits are 1.", + "EventCode": "0x1B4", + "EventName": "FLA_VAL_PRD_CNT", + "BriefDescription": "This event counts the number of 1's in the predicate bits of request in FLA pipeline, where it is corrected so that it becomes 16 when all bits are 1." + }, + { + "PublicDescription": "This event counts the number of 1's in the predicate bits of request in FLB pipeline, where it is corrected so that it becomes 16 when all bits are 1.", + "EventCode": "0x1B5", + "EventName": "FLB_VAL_PRD_CNT", + "BriefDescription": "This event counts the number of 1's in the predicate bits of request in FLB pipeline, where it is corrected so that it becomes 16 when all bits are 1." + }, + { + "PublicDescription": "This event counts valid cycles of L1D cache pipeline#0.", + "EventCode": "0x240", + "EventName": "L1_PIPE0_VAL", + "BriefDescription": "This event counts valid cycles of L1D cache pipeline#0." + }, + { + "PublicDescription": "This event counts valid cycles of L1D cache pipeline#1.", + "EventCode": "0x241", + "EventName": "L1_PIPE1_VAL", + "BriefDescription": "This event counts valid cycles of L1D cache pipeline#1." + }, + { + "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its sce bit of tagged address is 1.", + "EventCode": "0x250", + "EventName": "L1_PIPE0_VAL_IU_TAG_ADRS_SCE", + "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its sce bit of tagged address is 1." + }, + { + "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its pfe bit of tagged address is 1.", + "EventCode": "0x251", + "EventName": "L1_PIPE0_VAL_IU_TAG_ADRS_PFE", + "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its pfe bit of tagged address is 1." + }, + { + "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its sce bit of tagged address is 1.", + "EventCode": "0x252", + "EventName": "L1_PIPE1_VAL_IU_TAG_ADRS_SCE", + "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its sce bit of tagged address is 1." + }, + { + "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its pfe bit of tagged address is 1.", + "EventCode": "0x253", + "EventName": "L1_PIPE1_VAL_IU_TAG_ADRS_PFE", + "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its pfe bit of tagged address is 1." + }, + { + "PublicDescription": "This event counts completed requests in L1D cache pipeline#0.", + "EventCode": "0x260", + "EventName": "L1_PIPE0_COMP", + "BriefDescription": "This event counts completed requests in L1D cache pipeline#0." + }, + { + "PublicDescription": "This event counts completed requests in L1D cache pipeline#1.", + "EventCode": "0x261", + "EventName": "L1_PIPE1_COMP", + "BriefDescription": "This event counts completed requests in L1D cache pipeline#1." + }, + { + "PublicDescription": "This event counts completed requests in L1I cache pipeline.", + "EventCode": "0x268", + "EventName": "L1I_PIPE_COMP", + "BriefDescription": "This event counts completed requests in L1I cache pipeline." + }, + { + "PublicDescription": "This event counts valid cycles of L1I cache pipeline.", + "EventCode": "0x269", + "EventName": "L1I_PIPE_VAL", + "BriefDescription": "This event counts valid cycles of L1I cache pipeline." + }, + { + "PublicDescription": "This event counts aborted requests in L1D pipelines that due to store-load interlock.", + "EventCode": "0x274", + "EventName": "L1_PIPE_ABORT_STLD_INTLK", + "BriefDescription": "This event counts aborted requests in L1D pipelines that due to store-load interlock." + }, + { + "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its sector cache ID is not 0.", + "EventCode": "0x2A0", + "EventName": "L1_PIPE0_VAL_IU_NOT_SEC0", + "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its sector cache ID is not 0." + }, + { + "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its sector cache ID is not 0.", + "EventCode": "0x2A1", + "EventName": "L1_PIPE1_VAL_IU_NOT_SEC0", + "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its sector cache ID is not 0." + }, + { + "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 2 flows because 2 elements could not be combined.", + "EventCode": "0x2B0", + "EventName": "L1_PIPE_COMP_GATHER_2FLOW", + "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 2 flows because 2 elements could not be combined." + }, + { + "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 1 flow because 2 elements could be combined.", + "EventCode": "0x2B1", + "EventName": "L1_PIPE_COMP_GATHER_1FLOW", + "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 1 flow because 2 elements could be combined." + }, + { + "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 0 flow because both predicate values are 0.", + "EventCode": "0x2B2", + "EventName": "L1_PIPE_COMP_GATHER_0FLOW", + "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 0 flow because both predicate values are 0." + }, + { + "PublicDescription": "This event counts the number of flows of the scatter instructions.", + "EventCode": "0x2B3", + "EventName": "L1_PIPE_COMP_SCATTER_1FLOW", + "BriefDescription": "This event counts the number of flows of the scatter instructions." + }, + { + "PublicDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#0, where it is corrected so that it becomes 16 when all bits are 1.", + "EventCode": "0x2B8", + "EventName": "L1_PIPE0_COMP_PRD_CNT", + "BriefDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#0, where it is corrected so that it becomes 16 when all bits are 1." + }, + { + "PublicDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#1, where it is corrected so that it becomes 16 when all bits are 1.", + "EventCode": "0x2B9", + "EventName": "L1_PIPE1_COMP_PRD_CNT", + "BriefDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#1, where it is corrected so that it becomes 16 when all bits are 1." + }, + { + "PublicDescription": "This event counts valid cycles of L2 cache pipeline.", + "EventCode": "0x330", + "EventName": "L2_PIPE_VAL", + "BriefDescription": "This event counts valid cycles of L2 cache pipeline." + }, + { + "PublicDescription": "This event counts completed requests in L2 cache pipeline.", + "EventCode": "0x350", + "EventName": "L2_PIPE_COMP_ALL", + "BriefDescription": "This event counts completed requests in L2 cache pipeline." + }, + { + "PublicDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access.", + "EventCode": "0x370", + "EventName": "L2_PIPE_COMP_PF_L2MIB_MCH", + "BriefDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access." + } +] diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json new file mode 100644 index 000000000000..dc1b95e42c32 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json @@ -0,0 +1,110 @@ +[ + { + "ArchStdEvent": "SIMD_INST_RETIRED" + }, + { + "ArchStdEvent": "SVE_INST_RETIRED" + }, + { + "ArchStdEvent": "UOP_SPEC" + }, + { + "ArchStdEvent": "SVE_MATH_SPEC" + }, + { + "ArchStdEvent": "FP_SPEC" + }, + { + "ArchStdEvent": "FP_FMA_SPEC" + }, + { + "ArchStdEvent": "FP_RECPE_SPEC" + }, + { + "ArchStdEvent": "FP_CVT_SPEC" + }, + { + "ArchStdEvent": "ASE_SVE_INT_SPEC" + }, + { + "ArchStdEvent": "SVE_PRED_SPEC" + }, + { + "ArchStdEvent": "SVE_MOVPRFX_SPEC" + }, + { + "ArchStdEvent": "SVE_MOVPRFX_U_SPEC" + }, + { + "ArchStdEvent": "ASE_SVE_LD_SPEC" + }, + { + "ArchStdEvent": "ASE_SVE_ST_SPEC" + }, + { + "ArchStdEvent": "PRF_SPEC" + }, + { + "ArchStdEvent": "BASE_LD_REG_SPEC" + }, + { + "ArchStdEvent": "BASE_ST_REG_SPEC" + }, + { + "ArchStdEvent": "SVE_LDR_REG_SPEC" + }, + { + "ArchStdEvent": "SVE_STR_REG_SPEC" + }, + { + "ArchStdEvent": "SVE_LDR_PREG_SPEC" + }, + { + "ArchStdEvent": "SVE_STR_PREG_SPEC" + }, + { + "ArchStdEvent": "SVE_PRF_CONTIG_SPEC" + }, + { + "ArchStdEvent": "ASE_SVE_LD_MULTI_SPEC" + }, + { + "ArchStdEvent": "ASE_SVE_ST_MULTI_SPEC" + }, + { + "ArchStdEvent": "SVE_LD_GATHER_SPEC" + }, + { + "ArchStdEvent": "SVE_ST_SCATTER_SPEC" + }, + { + "ArchStdEvent": "SVE_PRF_GATHER_SPEC" + }, + { + "ArchStdEvent": "SVE_LDFF_SPEC" + }, + { + "ArchStdEvent": "FP_SCALE_OPS_SPEC" + }, + { + "ArchStdEvent": "FP_FIXED_OPS_SPEC" + }, + { + "ArchStdEvent": "FP_HP_SCALE_OPS_SPEC" + }, + { + "ArchStdEvent": "FP_HP_FIXED_OPS_SPEC" + }, + { + "ArchStdEvent": "FP_SP_SCALE_OPS_SPEC" + }, + { + "ArchStdEvent": "FP_SP_FIXED_OPS_SPEC" + }, + { + "ArchStdEvent": "FP_DP_SCALE_OPS_SPEC" + }, + { + "ArchStdEvent": "FP_DP_FIXED_OPS_SPEC" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv index 0d609149b82a..c43591d831b8 100644 --- a/tools/perf/pmu-events/arch/arm64/mapfile.csv +++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv @@ -20,5 +20,6 @@ 0x00000000410fd0c0,v1,arm/cortex-a76-n1,core 0x00000000420f5160,v1,cavium/thunderx2,core 0x00000000430f0af0,v1,cavium/thunderx2,core +0x00000000460f0010,v1,fujitsu/a64fx,core 0x00000000480fd010,v1,hisilicon/hip08,core 0x00000000500f0000,v1,ampere/emag,core -- cgit v1.2.3 From 4a03af3ee399e87934e18ae720dda72c52f0050c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 15 Mar 2021 11:27:24 -0300 Subject: perf stat: Elaborate use cases for the -n/--null command line option The existing text was way too terse, pick the intended usage from the cset that introduced this option. Twitter: https://twitter.com/_monoid/status/1371461130175004672?s=20 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 08a1714494f8..3055aad38d46 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -142,7 +142,10 @@ Do not aggregate counts across all monitored CPUs. -n:: --null:: - null run - don't start any counters +null run - Don't start any counters. + +This can be useful to measure just elapsed wall-clock time - or to assess the +raw overhead of perf stat itself, without running any counters. -v:: --verbose:: -- cgit v1.2.3 From 6859bc0e78c6a699599cbb21404fdb6c8125da74 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Mon, 15 Mar 2021 22:30:47 +0800 Subject: perf stat: Improve readability of shadow stats This adds function convert_unit_double() and selects appropriate unit for shadow stats between K/M/G. $ sudo perf stat -a -- sleep 1 Before: Unit 'M' is selected even the number is very small. Performance counter stats for 'system wide': 4,003.06 msec cpu-clock # 3.998 CPUs utilized 16,179 context-switches # 0.004 M/sec 161 cpu-migrations # 0.040 K/sec 4,699 page-faults # 0.001 M/sec 6,135,801,925 cycles # 1.533 GHz (83.21%) 5,783,308,491 stalled-cycles-frontend # 94.26% frontend cycles idle (83.21%) 4,543,694,050 stalled-cycles-backend # 74.05% backend cycles idle (66.49%) 4,720,130,587 instructions # 0.77 insn per cycle # 1.23 stalled cycles per insn (83.28%) 753,848,078 branches # 188.318 M/sec (83.61%) 37,457,747 branch-misses # 4.97% of all branches (83.48%) 1.001283725 seconds time elapsed After: $ sudo perf stat -a -- sleep 2 Performance counter stats for 'system wide': 8,005.52 msec cpu-clock # 3.999 CPUs utilized 10,715 context-switches # 1.338 K/sec 785 cpu-migrations # 98.057 /sec 102 page-faults # 12.741 /sec 1,948,202,279 cycles # 0.243 GHz 2,816,470,932 stalled-cycles-frontend # 144.57% frontend cycles idle 2,661,172,207 stalled-cycles-backend # 136.60% backend cycles idle 464,172,105 instructions # 0.24 insn per cycle # 6.07 stalled cycles per insn 91,567,662 branches # 11.438 M/sec 7,756,054 branch-misses # 8.47% of all branches 2.002040043 seconds time elapsed v2: o do not change 'sec' to 'cpu-sec'. o use convert_unit_double to implement convert_unit. Signed-off-by: Changbin Du Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210315143047.3867-1-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-shadow.c | 16 +++++++--------- tools/perf/util/units.c | 21 ++++++++++++++------- tools/perf/util/units.h | 1 + 3 files changed, 22 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 6ccf21a72f06..3f800e71126f 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -9,6 +9,7 @@ #include "expr.h" #include "metricgroup.h" #include "cgroup.h" +#include "units.h" #include /* @@ -1270,18 +1271,15 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL, evsel->name, evsel->metric_name, NULL, 1, cpu, out, st); } else if (runtime_stat_n(st, STAT_NSECS, cpu, &rsd) != 0) { - char unit = 'M'; - char unit_buf[10]; + char unit = ' '; + char unit_buf[10] = "/sec"; total = runtime_stat_avg(st, STAT_NSECS, cpu, &rsd); - if (total) - ratio = 1000.0 * avg / total; - if (ratio < 0.001) { - ratio *= 1000; - unit = 'K'; - } - snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit); + ratio = convert_unit_double(1000000000.0 * avg / total, &unit); + + if (unit != ' ') + snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit); print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio); } else if (perf_stat_evsel__is(evsel, SMI_NUM)) { print_smi_cost(config, cpu, out, st, &rsd); diff --git a/tools/perf/util/units.c b/tools/perf/util/units.c index a46762aec4c9..32c39cfe209b 100644 --- a/tools/perf/util/units.c +++ b/tools/perf/util/units.c @@ -33,28 +33,35 @@ unsigned long parse_tag_value(const char *str, struct parse_tag *tags) return (unsigned long) -1; } -unsigned long convert_unit(unsigned long value, char *unit) +double convert_unit_double(double value, char *unit) { *unit = ' '; - if (value > 1000) { - value /= 1000; + if (value > 1000.0) { + value /= 1000.0; *unit = 'K'; } - if (value > 1000) { - value /= 1000; + if (value > 1000.0) { + value /= 1000.0; *unit = 'M'; } - if (value > 1000) { - value /= 1000; + if (value > 1000.0) { + value /= 1000.0; *unit = 'G'; } return value; } +unsigned long convert_unit(unsigned long value, char *unit) +{ + double v = convert_unit_double((double)value, unit); + + return (unsigned long)v; +} + int unit_number__scnprintf(char *buf, size_t size, u64 n) { char unit[4] = "BKMG"; diff --git a/tools/perf/util/units.h b/tools/perf/util/units.h index 99263b6a23f7..ea43e74e3240 100644 --- a/tools/perf/util/units.h +++ b/tools/perf/util/units.h @@ -12,6 +12,7 @@ struct parse_tag { unsigned long parse_tag_value(const char *str, struct parse_tag *tags); +double convert_unit_double(double value, char *unit); unsigned long convert_unit(unsigned long value, char *unit); int unit_number__scnprintf(char *buf, size_t size, u64 n); -- cgit v1.2.3 From 301cddc21a157a3072d789a3097857202e550a24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 Mar 2021 12:32:55 +0100 Subject: objtool/x86: Use asm/nops.h Since the kernel will rely on a single canonical set of NOPs, make sure objtool uses the exact same ones. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210312115749.136357911@infradead.org --- tools/arch/x86/include/asm/nops.h | 81 +++++++++++++++++++++++++++++++++++++++ tools/objtool/arch/x86/decode.c | 13 ++++--- tools/objtool/sync-check.sh | 1 + 3 files changed, 90 insertions(+), 5 deletions(-) create mode 100644 tools/arch/x86/include/asm/nops.h (limited to 'tools') diff --git a/tools/arch/x86/include/asm/nops.h b/tools/arch/x86/include/asm/nops.h new file mode 100644 index 000000000000..c1e5e818ba16 --- /dev/null +++ b/tools/arch/x86/include/asm/nops.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_NOPS_H +#define _ASM_X86_NOPS_H + +/* + * Define nops for use with alternative() and for tracing. + */ + +#ifndef CONFIG_64BIT + +/* + * Generic 32bit nops from GAS: + * + * 1: nop + * 2: movl %esi,%esi + * 3: leal 0x0(%esi),%esi + * 4: leal 0x0(%esi,%eiz,1),%esi + * 5: leal %ds:0x0(%esi,%eiz,1),%esi + * 6: leal 0x0(%esi),%esi + * 7: leal 0x0(%esi,%eiz,1),%esi + * 8: leal %ds:0x0(%esi,%eiz,1),%esi + * + * Except 5 and 8, which are DS prefixed 4 and 7 resp, where GAS would emit 2 + * nop instructions. + */ +#define BYTES_NOP1 0x90 +#define BYTES_NOP2 0x89,0xf6 +#define BYTES_NOP3 0x8d,0x76,0x00 +#define BYTES_NOP4 0x8d,0x74,0x26,0x00 +#define BYTES_NOP5 0x3e,BYTES_NOP4 +#define BYTES_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00 +#define BYTES_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 +#define BYTES_NOP8 0x3e,BYTES_NOP7 + +#else + +/* + * Generic 64bit nops from GAS: + * + * 1: nop + * 2: osp nop + * 3: nopl (%eax) + * 4: nopl 0x00(%eax) + * 5: nopl 0x00(%eax,%eax,1) + * 6: osp nopl 0x00(%eax,%eax,1) + * 7: nopl 0x00000000(%eax) + * 8: nopl 0x00000000(%eax,%eax,1) + */ +#define BYTES_NOP1 0x90 +#define BYTES_NOP2 0x66,BYTES_NOP1 +#define BYTES_NOP3 0x0f,0x1f,0x00 +#define BYTES_NOP4 0x0f,0x1f,0x40,0x00 +#define BYTES_NOP5 0x0f,0x1f,0x44,0x00,0x00 +#define BYTES_NOP6 0x66,BYTES_NOP5 +#define BYTES_NOP7 0x0f,0x1f,0x80,0x00,0x00,0x00,0x00 +#define BYTES_NOP8 0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 + +#endif /* CONFIG_64BIT */ + +#ifdef __ASSEMBLY__ +#define _ASM_MK_NOP(x) .byte x +#else +#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n" +#endif + +#define ASM_NOP1 _ASM_MK_NOP(BYTES_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(BYTES_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(BYTES_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(BYTES_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(BYTES_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(BYTES_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(BYTES_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(BYTES_NOP8) + +#define ASM_NOP_MAX 8 + +#ifndef __ASSEMBLY__ +extern const unsigned char * const x86_nops[]; +#endif + +#endif /* _ASM_X86_NOPS_H */ diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 549813cff8ab..c117bfc60370 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -11,6 +11,9 @@ #include "../../../arch/x86/lib/inat.c" #include "../../../arch/x86/lib/insn.c" +#define CONFIG_64BIT 1 +#include + #include #include #include @@ -596,11 +599,11 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state) const char *arch_nop_insn(int len) { static const char nops[5][5] = { - /* 1 */ { 0x90 }, - /* 2 */ { 0x66, 0x90 }, - /* 3 */ { 0x0f, 0x1f, 0x00 }, - /* 4 */ { 0x0f, 0x1f, 0x40, 0x00 }, - /* 5 */ { 0x0f, 0x1f, 0x44, 0x00, 0x00 }, + { BYTES_NOP1 }, + { BYTES_NOP2 }, + { BYTES_NOP3 }, + { BYTES_NOP4 }, + { BYTES_NOP5 }, }; if (len < 1 || len > 5) { diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh index 606a4b5e929f..d23268683910 100755 --- a/tools/objtool/sync-check.sh +++ b/tools/objtool/sync-check.sh @@ -10,6 +10,7 @@ FILES="include/linux/objtool.h" if [ "$SRCARCH" = "x86" ]; then FILES="$FILES +arch/x86/include/asm/nops.h arch/x86/include/asm/inat_types.h arch/x86/include/asm/orc_types.h arch/x86/include/asm/emulate_prefix.h -- cgit v1.2.3 From 6503b9f29a47cdb4ebd6c36d8bbb018418415c2a Mon Sep 17 00:00:00 2001 From: Manu Bretelle Date: Wed, 10 Mar 2021 10:23:05 -0800 Subject: bpf: Add getter and setter for SO_REUSEPORT through bpf_{g,s}etsockopt Augment the current set of options that are accessible via bpf_{g,s}etsockopt to also support SO_REUSEPORT. Signed-off-by: Manu Bretelle Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210310182305.1910312-1-chantra@fb.com --- net/core/filter.c | 6 ++++++ tools/testing/selftests/bpf/progs/bind4_prog.c | 25 +++++++++++++++++++++++++ tools/testing/selftests/bpf/progs/bind6_prog.c | 25 +++++++++++++++++++++++++ 3 files changed, 56 insertions(+) (limited to 'tools') diff --git a/net/core/filter.c b/net/core/filter.c index b6732000d8a2..10dac9dd5086 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4729,6 +4729,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; + case SO_REUSEPORT: + sk->sk_reuseport = valbool; + break; default: ret = -EINVAL; } @@ -4898,6 +4901,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, case SO_BINDTOIFINDEX: *((int *)optval) = sk->sk_bound_dev_if; break; + case SO_REUSEPORT: + *((int *)optval) = sk->sk_reuseport; + break; default: goto err_clear; } diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index 115a3b0ad984..474c6a62078a 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -57,6 +57,27 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) return 0; } +static __inline int bind_reuseport(struct bpf_sock_addr *ctx) +{ + int val = 1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || !val) + return 1; + val = 0; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || val) + return 1; + + return 0; +} + static __inline int misc_opts(struct bpf_sock_addr *ctx, int opt) { int old, tmp, new = 0xeb9f; @@ -127,6 +148,10 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) if (misc_opts(ctx, SO_MARK) || misc_opts(ctx, SO_PRIORITY)) return 0; + /* Set reuseport and unset */ + if (bind_reuseport(ctx)) + return 0; + ctx->user_ip4 = bpf_htonl(SERV4_REWRITE_IP); ctx->user_port = bpf_htons(SERV4_REWRITE_PORT); diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index 4c0d348034b9..c19cfa869f30 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -63,6 +63,27 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) return 0; } +static __inline int bind_reuseport(struct bpf_sock_addr *ctx) +{ + int val = 1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || !val) + return 1; + val = 0; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || val) + return 1; + + return 0; +} + static __inline int misc_opts(struct bpf_sock_addr *ctx, int opt) { int old, tmp, new = 0xeb9f; @@ -141,6 +162,10 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) if (misc_opts(ctx, SO_MARK) || misc_opts(ctx, SO_PRIORITY)) return 0; + /* Set reuseport and unset */ + if (bind_reuseport(ctx)) + return 0; + ctx->user_ip6[0] = bpf_htonl(SERV6_REWRITE_IP_0); ctx->user_ip6[1] = bpf_htonl(SERV6_REWRITE_IP_1); ctx->user_ip6[2] = bpf_htonl(SERV6_REWRITE_IP_2); -- cgit v1.2.3 From 49ab51b01ec6fd837ae3efe2e0cdb41fcf5cf048 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Feb 2021 15:40:05 -0800 Subject: tools/memory-model: Add access-marking documentation This commit adapts the "Concurrency bugs should fear the big bad data-race detector (part 2)" LWN article (https://lwn.net/Articles/816854/) to kernel-documentation form. This allows more easily updating the material as needed. Suggested-by: Thomas Gleixner [ paulmck: Apply Marco Elver feedback. ] [ paulmck: Update per Akira Yokosawa feedback. ] Reviewed-by: Marco Elver Signed-off-by: Paul E. McKenney --- .../memory-model/Documentation/access-marking.txt | 479 +++++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 tools/memory-model/Documentation/access-marking.txt (limited to 'tools') diff --git a/tools/memory-model/Documentation/access-marking.txt b/tools/memory-model/Documentation/access-marking.txt new file mode 100644 index 000000000000..1ab189f51f55 --- /dev/null +++ b/tools/memory-model/Documentation/access-marking.txt @@ -0,0 +1,479 @@ +MARKING SHARED-MEMORY ACCESSES +============================== + +This document provides guidelines for marking intentionally concurrent +normal accesses to shared memory, that is "normal" as in accesses that do +not use read-modify-write atomic operations. It also describes how to +document these accesses, both with comments and with special assertions +processed by the Kernel Concurrency Sanitizer (KCSAN). This discussion +builds on an earlier LWN article [1]. + + +ACCESS-MARKING OPTIONS +====================== + +The Linux kernel provides the following access-marking options: + +1. Plain C-language accesses (unmarked), for example, "a = b;" + +2. Data-race marking, for example, "data_race(a = b);" + +3. READ_ONCE(), for example, "a = READ_ONCE(b);" + The various forms of atomic_read() also fit in here. + +4. WRITE_ONCE(), for example, "WRITE_ONCE(a, b);" + The various forms of atomic_set() also fit in here. + + +These may be used in combination, as shown in this admittedly improbable +example: + + WRITE_ONCE(a, b + data_race(c + d) + READ_ONCE(e)); + +Neither plain C-language accesses nor data_race() (#1 and #2 above) place +any sort of constraint on the compiler's choice of optimizations [2]. +In contrast, READ_ONCE() and WRITE_ONCE() (#3 and #4 above) restrict the +compiler's use of code-motion and common-subexpression optimizations. +Therefore, if a given access is involved in an intentional data race, +using READ_ONCE() for loads and WRITE_ONCE() for stores is usually +preferable to data_race(), which in turn is usually preferable to plain +C-language accesses. + +KCSAN will complain about many types of data races involving plain +C-language accesses, but marking all accesses involved in a given data +race with one of data_race(), READ_ONCE(), or WRITE_ONCE(), will prevent +KCSAN from complaining. Of course, lack of KCSAN complaints does not +imply correct code. Therefore, please take a thoughtful approach +when responding to KCSAN complaints. Churning the code base with +ill-considered additions of data_race(), READ_ONCE(), and WRITE_ONCE() +is unhelpful. + +In fact, the following sections describe situations where use of +data_race() and even plain C-language accesses is preferable to +READ_ONCE() and WRITE_ONCE(). + + +Use of the data_race() Macro +---------------------------- + +Here are some situations where data_race() should be used instead of +READ_ONCE() and WRITE_ONCE(): + +1. Data-racy loads from shared variables whose values are used only + for diagnostic purposes. + +2. Data-racy reads whose values are checked against marked reload. + +3. Reads whose values feed into error-tolerant heuristics. + +4. Writes setting values that feed into error-tolerant heuristics. + + +Data-Racy Reads for Approximate Diagnostics + +Approximate diagnostics include lockdep reports, monitoring/statistics +(including /proc and /sys output), WARN*()/BUG*() checks whose return +values are ignored, and other situations where reads from shared variables +are not an integral part of the core concurrency design. + +In fact, use of data_race() instead READ_ONCE() for these diagnostic +reads can enable better checking of the remaining accesses implementing +the core concurrency design. For example, suppose that the core design +prevents any non-diagnostic reads from shared variable x from running +concurrently with updates to x. Then using plain C-language writes +to x allows KCSAN to detect reads from x from within regions of code +that fail to exclude the updates. In this case, it is important to use +data_race() for the diagnostic reads because otherwise KCSAN would give +false-positive warnings about these diagnostic reads. + +In theory, plain C-language loads can also be used for this use case. +However, in practice this will have the disadvantage of causing KCSAN +to generate false positives because KCSAN will have no way of knowing +that the resulting data race was intentional. + + +Data-Racy Reads That Are Checked Against Marked Reload + +The values from some reads are not implicitly trusted. They are instead +fed into some operation that checks the full value against a later marked +load from memory, which means that the occasional arbitrarily bogus value +is not a problem. For example, if a bogus value is fed into cmpxchg(), +all that happens is that this cmpxchg() fails, which normally results +in a retry. Unless the race condition that resulted in the bogus value +recurs, this retry will with high probability succeed, so no harm done. + +However, please keep in mind that a data_race() load feeding into +a cmpxchg_relaxed() might still be subject to load fusing on some +architectures. Therefore, it is best to capture the return value from +the failing cmpxchg() for the next iteration of the loop, an approach +that provides the compiler much less scope for mischievous optimizations. +Capturing the return value from cmpxchg() also saves a memory reference +in many cases. + +In theory, plain C-language loads can also be used for this use case. +However, in practice this will have the disadvantage of causing KCSAN +to generate false positives because KCSAN will have no way of knowing +that the resulting data race was intentional. + + +Reads Feeding Into Error-Tolerant Heuristics + +Values from some reads feed into heuristics that can tolerate occasional +errors. Such reads can use data_race(), thus allowing KCSAN to focus on +the other accesses to the relevant shared variables. But please note +that data_race() loads are subject to load fusing, which can result in +consistent errors, which in turn are quite capable of breaking heuristics. +Therefore use of data_race() should be limited to cases where some other +code (such as a barrier() call) will force the occasional reload. + +In theory, plain C-language loads can also be used for this use case. +However, in practice this will have the disadvantage of causing KCSAN +to generate false positives because KCSAN will have no way of knowing +that the resulting data race was intentional. + + +Writes Setting Values Feeding Into Error-Tolerant Heuristics + +The values read into error-tolerant heuristics come from somewhere, +for example, from sysfs. This means that some code in sysfs writes +to this same variable, and these writes can also use data_race(). +After all, if the heuristic can tolerate the occasional bogus value +due to compiler-mangled reads, it can also tolerate the occasional +compiler-mangled write, at least assuming that the proper value is in +place once the write completes. + +Plain C-language stores can also be used for this use case. However, +in kernels built with CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n, this +will have the disadvantage of causing KCSAN to generate false positives +because KCSAN will have no way of knowing that the resulting data race +was intentional. + + +Use of Plain C-Language Accesses +-------------------------------- + +Here are some example situations where plain C-language accesses should +used instead of READ_ONCE(), WRITE_ONCE(), and data_race(): + +1. Accesses protected by mutual exclusion, including strict locking + and sequence locking. + +2. Initialization-time and cleanup-time accesses. This covers a + wide variety of situations, including the uniprocessor phase of + system boot, variables to be used by not-yet-spawned kthreads, + structures not yet published to reference-counted or RCU-protected + data structures, and the cleanup side of any of these situations. + +3. Per-CPU variables that are not accessed from other CPUs. + +4. Private per-task variables, including on-stack variables, some + fields in the task_struct structure, and task-private heap data. + +5. Any other loads for which there is not supposed to be a concurrent + store to that same variable. + +6. Any other stores for which there should be neither concurrent + loads nor concurrent stores to that same variable. + + But note that KCSAN makes two explicit exceptions to this rule + by default, refraining from flagging plain C-language stores: + + a. No matter what. You can override this default by building + with CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n. + + b. When the store writes the value already contained in + that variable. You can override this default by building + with CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n. + + c. When one of the stores is in an interrupt handler and + the other in the interrupted code. You can override this + default by building with CONFIG_KCSAN_INTERRUPT_WATCHER=y. + +Note that it is important to use plain C-language accesses in these cases, +because doing otherwise prevents KCSAN from detecting violations of your +code's synchronization rules. + + +ACCESS-DOCUMENTATION OPTIONS +============================ + +It is important to comment marked accesses so that people reading your +code, yourself included, are reminded of the synchronization design. +However, it is even more important to comment plain C-language accesses +that are intentionally involved in data races. Such comments are +needed to remind people reading your code, again, yourself included, +of how the compiler has been prevented from optimizing those accesses +into concurrency bugs. + +It is also possible to tell KCSAN about your synchronization design. +For example, ASSERT_EXCLUSIVE_ACCESS(foo) tells KCSAN that any +concurrent access to variable foo by any other CPU is an error, even +if that concurrent access is marked with READ_ONCE(). In addition, +ASSERT_EXCLUSIVE_WRITER(foo) tells KCSAN that although it is OK for there +to be concurrent reads from foo from other CPUs, it is an error for some +other CPU to be concurrently writing to foo, even if that concurrent +write is marked with data_race() or WRITE_ONCE(). + +Note that although KCSAN will call out data races involving either +ASSERT_EXCLUSIVE_ACCESS() or ASSERT_EXCLUSIVE_WRITER() on the one hand +and data_race() writes on the other, KCSAN will not report the location +of these data_race() writes. + + +EXAMPLES +======== + +As noted earlier, the goal is to prevent the compiler from destroying +your concurrent algorithm, to help the human reader, and to inform +KCSAN of aspects of your concurrency design. This section looks at a +few examples showing how this can be done. + + +Lock Protection With Lockless Diagnostic Access +----------------------------------------------- + +For example, suppose a shared variable "foo" is read only while a +reader-writer spinlock is read-held, written only while that same +spinlock is write-held, except that it is also read locklessly for +diagnostic purposes. The code might look as follows: + + int foo; + DEFINE_RWLOCK(foo_rwlock); + + void update_foo(int newval) + { + write_lock(&foo_rwlock); + foo = newval; + do_something(newval); + write_unlock(&foo_rwlock); + } + + int read_foo(void) + { + int ret; + + read_lock(&foo_rwlock); + do_something_else(); + ret = foo; + read_unlock(&foo_rwlock); + return ret; + } + + int read_foo_diagnostic(void) + { + return data_race(foo); + } + +The reader-writer lock prevents the compiler from introducing concurrency +bugs into any part of the main algorithm using foo, which means that +the accesses to foo within both update_foo() and read_foo() can (and +should) be plain C-language accesses. One benefit of making them be +plain C-language accesses is that KCSAN can detect any erroneous lockless +reads from or updates to foo. The data_race() in read_foo_diagnostic() +tells KCSAN that data races are expected, and should be silently +ignored. This data_race() also tells the human reading the code that +read_foo_diagnostic() might sometimes return a bogus value. + +However, please note that your kernel must be built with +CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n in order for KCSAN to +detect a buggy lockless write. If you need KCSAN to detect such a +write even if that write did not change the value of foo, you also +need CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n. If you need KCSAN to +detect such a write happening in an interrupt handler running on the +same CPU doing the legitimate lock-protected write, you also need +CONFIG_KCSAN_INTERRUPT_WATCHER=y. With some or all of these Kconfig +options set properly, KCSAN can be quite helpful, although it is not +necessarily a full replacement for hardware watchpoints. On the other +hand, neither are hardware watchpoints a full replacement for KCSAN +because it is not always easy to tell hardware watchpoint to conditionally +trap on accesses. + + +Lock-Protected Writes With Lockless Reads +----------------------------------------- + +For another example, suppose a shared variable "foo" is updated only +while holding a spinlock, but is read locklessly. The code might look +as follows: + + int foo; + DEFINE_SPINLOCK(foo_lock); + + void update_foo(int newval) + { + spin_lock(&foo_lock); + WRITE_ONCE(foo, newval); + ASSERT_EXCLUSIVE_WRITER(foo); + do_something(newval); + spin_unlock(&foo_wlock); + } + + int read_foo(void) + { + do_something_else(); + return READ_ONCE(foo); + } + +Because foo is read locklessly, all accesses are marked. The purpose +of the ASSERT_EXCLUSIVE_WRITER() is to allow KCSAN to check for a buggy +concurrent lockless write. + + +Lockless Reads and Writes +------------------------- + +For another example, suppose a shared variable "foo" is both read and +updated locklessly. The code might look as follows: + + int foo; + + int update_foo(int newval) + { + int ret; + + ret = xchg(&foo, newval); + do_something(newval); + return ret; + } + + int read_foo(void) + { + do_something_else(); + return READ_ONCE(foo); + } + +Because foo is accessed locklessly, all accesses are marked. It does +not make sense to use ASSERT_EXCLUSIVE_WRITER() in this case because +there really can be concurrent lockless writers. KCSAN would +flag any concurrent plain C-language reads from foo, and given +CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n, also any concurrent plain +C-language writes to foo. + + +Lockless Reads and Writes, But With Single-Threaded Initialization +------------------------------------------------------------------ + +For yet another example, suppose that foo is initialized in a +single-threaded manner, but that a number of kthreads are then created +that locklessly and concurrently access foo. Some snippets of this code +might look as follows: + + int foo; + + void initialize_foo(int initval, int nkthreads) + { + int i; + + foo = initval; + ASSERT_EXCLUSIVE_ACCESS(foo); + for (i = 0; i < nkthreads; i++) + kthread_run(access_foo_concurrently, ...); + } + + /* Called from access_foo_concurrently(). */ + int update_foo(int newval) + { + int ret; + + ret = xchg(&foo, newval); + do_something(newval); + return ret; + } + + /* Also called from access_foo_concurrently(). */ + int read_foo(void) + { + do_something_else(); + return READ_ONCE(foo); + } + +The initialize_foo() uses a plain C-language write to foo because there +are not supposed to be concurrent accesses during initialization. The +ASSERT_EXCLUSIVE_ACCESS() allows KCSAN to flag buggy concurrent unmarked +reads, and the ASSERT_EXCLUSIVE_ACCESS() call further allows KCSAN to +flag buggy concurrent writes, even if: (1) Those writes are marked or +(2) The kernel was built with CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=y. + + +Checking Stress-Test Race Coverage +---------------------------------- + +When designing stress tests it is important to ensure that race conditions +of interest really do occur. For example, consider the following code +fragment: + + int foo; + + int update_foo(int newval) + { + return xchg(&foo, newval); + } + + int xor_shift_foo(int shift, int mask) + { + int old, new, newold; + + newold = data_race(foo); /* Checked by cmpxchg(). */ + do { + old = newold; + new = (old << shift) ^ mask; + newold = cmpxchg(&foo, old, new); + } while (newold != old); + return old; + } + + int read_foo(void) + { + return READ_ONCE(foo); + } + +If it is possible for update_foo(), xor_shift_foo(), and read_foo() to be +invoked concurrently, the stress test should force this concurrency to +actually happen. KCSAN can evaluate the stress test when the above code +is modified to read as follows: + + int foo; + + int update_foo(int newval) + { + ASSERT_EXCLUSIVE_ACCESS(foo); + return xchg(&foo, newval); + } + + int xor_shift_foo(int shift, int mask) + { + int old, new, newold; + + newold = data_race(foo); /* Checked by cmpxchg(). */ + do { + old = newold; + new = (old << shift) ^ mask; + ASSERT_EXCLUSIVE_ACCESS(foo); + newold = cmpxchg(&foo, old, new); + } while (newold != old); + return old; + } + + + int read_foo(void) + { + ASSERT_EXCLUSIVE_ACCESS(foo); + return READ_ONCE(foo); + } + +If a given stress-test run does not result in KCSAN complaints from +each possible pair of ASSERT_EXCLUSIVE_ACCESS() invocations, the +stress test needs improvement. If the stress test was to be evaluated +on a regular basis, it would be wise to place the above instances of +ASSERT_EXCLUSIVE_ACCESS() under #ifdef so that they did not result in +false positives when not evaluating the stress test. + + +REFERENCES +========== + +[1] "Concurrency bugs should fear the big bad data-race detector (part 2)" + https://lwn.net/Articles/816854/ + +[2] "Who's afraid of a big bad optimizing compiler?" + https://lwn.net/Articles/793253/ -- cgit v1.2.3 From 0205e9de42911404902728911b03fc1469242419 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Sun, 14 Mar 2021 18:38:38 +0100 Subject: libbpf: Avoid inline hint definition from 'linux/stddef.h' Linux headers might pull 'linux/stddef.h' which defines '__always_inline' as the following: #ifndef __always_inline #define __always_inline inline #endif This becomes an issue if the program picks up the 'linux/stddef.h' definition as the macro now just hints inline to clang. This change now enforces the proper definition for BPF programs regardless of the include order. Signed-off-by: Pedro Tammela Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210314173839.457768-1-pctammela@gmail.com --- tools/lib/bpf/bpf_helpers.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index ae6c975e0b87..53ff81c49dbd 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -29,9 +29,10 @@ */ #define SEC(NAME) __attribute__((section(NAME), used)) -#ifndef __always_inline +/* Avoid 'linux/stddef.h' definition of '__always_inline'. */ +#undef __always_inline #define __always_inline inline __attribute__((always_inline)) -#endif + #ifndef __noinline #define __noinline __attribute__((noinline)) #endif -- cgit v1.2.3 From 23f50b5ac331c8c27c421a7116618355508e8427 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Mon, 15 Mar 2021 14:29:51 +0100 Subject: bpf: selftests: Remove unused 'nospace_err' in tests for batched ops in array maps This seems to be a reminiscent from the hashmap tests. Signed-off-by: Pedro Tammela Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210315132954.603108-1-pctammela@gmail.com --- tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c index f0a64d8ac59a..e42ea1195d18 100644 --- a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c @@ -55,7 +55,6 @@ void test_array_map_batch_ops(void) int map_fd, *keys, *values, *visited; __u32 count, total, total_success; const __u32 max_entries = 10; - bool nospace_err; __u64 batch = 0; int err, step; DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, @@ -90,7 +89,6 @@ void test_array_map_batch_ops(void) * elements each. */ count = step; - nospace_err = false; while (true) { err = bpf_map_lookup_batch(map_fd, total ? &batch : NULL, &batch, @@ -107,9 +105,6 @@ void test_array_map_batch_ops(void) } - if (nospace_err == true) - continue; - CHECK(total != max_entries, "lookup with steps", "total = %u, max_entries = %u\n", total, max_entries); -- cgit v1.2.3 From dde7b3f5f2f458297aeccfd4783e53ab8ca046db Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 13 Mar 2021 13:09:17 -0800 Subject: libbpf: Add explicit padding to bpf_xdp_set_link_opts Adding such anonymous padding fixes the issue with uninitialized portions of bpf_xdp_set_link_opts when using LIBBPF_DECLARE_OPTS macro with inline field initialization: DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts, .old_fd = -1); When such code is compiled in debug mode, compiler is generating code that leaves padding bytes uninitialized, which triggers error inside libbpf APIs that do strict zero initialization checks for OPTS structs. Adding anonymous padding field fixes the issue. Fixes: bd5ca3ef93cd ("libbpf: Add function to set link XDP fd while specifying old program") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210313210920.1959628-2-andrii@kernel.org --- tools/lib/bpf/libbpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 3c35eb401931..3d690d4e785c 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -507,6 +507,7 @@ struct xdp_link_info { struct bpf_xdp_set_link_opts { size_t sz; int old_fd; + size_t :0; }; #define bpf_xdp_set_link_opts__last_field old_fd -- cgit v1.2.3 From 4bbb3583687051ef99966ddaeb1730441b777d40 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 13 Mar 2021 13:09:18 -0800 Subject: bpftool: Fix maybe-uninitialized warnings Somehow when bpftool is compiled in -Og mode, compiler produces new warnings about possibly uninitialized variables. Fix all the reported problems. Fixes: 2119f2189df1 ("bpftool: add C output format option to btf dump subcommand") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210313210920.1959628-3-andrii@kernel.org --- tools/bpf/bpftool/btf.c | 3 +++ tools/bpf/bpftool/main.c | 3 +-- tools/bpf/bpftool/map.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 985610c3f193..62953bbf68b4 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -546,6 +546,7 @@ static int do_dump(int argc, char **argv) NEXT_ARG(); if (argc < 1) { p_err("expecting value for 'format' option\n"); + err = -EINVAL; goto done; } if (strcmp(*argv, "c") == 0) { @@ -555,11 +556,13 @@ static int do_dump(int argc, char **argv) } else { p_err("unrecognized format specifier: '%s', possible values: raw, c", *argv); + err = -EINVAL; goto done; } NEXT_ARG(); } else { p_err("unrecognized option: '%s'", *argv); + err = -EINVAL; goto done; } } diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index b86f450e6fce..d9afb730136a 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -276,7 +276,7 @@ static int do_batch(int argc, char **argv) int n_argc; FILE *fp; char *cp; - int err; + int err = 0; int i; if (argc < 2) { @@ -370,7 +370,6 @@ static int do_batch(int argc, char **argv) } else { if (!json_output) printf("processed %d commands\n", lines); - err = 0; } err_close: if (fp != stdin) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index b400364ee054..09ae0381205b 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -100,7 +100,7 @@ static int do_dump_btf(const struct btf_dumper *d, void *value) { __u32 value_id; - int ret; + int ret = 0; /* start of key-value pair */ jsonw_start_object(d->jw); -- cgit v1.2.3 From 105b842ba4ef2ddc287645cb33f90b960c16075b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 13 Mar 2021 13:09:19 -0800 Subject: selftests/bpf: Fix maybe-uninitialized warning in xdpxceiver test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xsk_ring_prod__reserve() doesn't necessarily set idx in some conditions, so from static analysis point of view compiler is right about the problems like: In file included from xdpxceiver.c:92: xdpxceiver.c: In function ‘xsk_populate_fill_ring’: /data/users/andriin/linux/tools/testing/selftests/bpf/tools/include/bpf/xsk.h:119:20: warning: ‘idx’ may be used uninitialized in this function [-Wmaybe-uninitialized] return &addrs[idx & fill->mask]; ~~~~^~~~~~~~~~~~ xdpxceiver.c:300:6: note: ‘idx’ was declared here u32 idx; ^~~ xdpxceiver.c: In function ‘tx_only’: xdpxceiver.c:596:30: warning: ‘idx’ may be used uninitialized in this function [-Wmaybe-uninitialized] struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix two warnings reported by compiler by pre-initializing variable. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210313210920.1959628-4-andrii@kernel.org --- tools/testing/selftests/bpf/xdpxceiver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 8b0f7fdd9003..1e21a3172687 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -297,7 +297,7 @@ static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) static void xsk_populate_fill_ring(struct xsk_umem_info *umem) { int ret, i; - u32 idx; + u32 idx = 0; ret = xsk_ring_prod__reserve(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx); if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) @@ -584,7 +584,7 @@ static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) { - u32 idx; + u32 idx = 0; unsigned int i; bool tx_invalid_test = stat_test_type == STAT_TEST_TX_INVALID; u32 len = tx_invalid_test ? XSK_UMEM__DEFAULT_FRAME_SIZE + 1 : PKT_SIZE; -- cgit v1.2.3 From 252e3cbf2b623422b359b965b94060e8d68597e1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 13 Mar 2021 13:09:20 -0800 Subject: selftests/bpf: Build everything in debug mode Build selftests, bpftool, and libbpf in debug mode with DWARF data to facilitate easier debugging. In terms of impact on building and running selftests. Build is actually faster now: BEFORE: make -j60 380.21s user 37.87s system 1466% cpu 28.503 total AFTER: make -j60 345.47s user 37.37s system 1599% cpu 23.939 total test_progs runtime seems to be the same: BEFORE: real 1m5.139s user 0m1.600s sys 0m43.977s AFTER: real 1m3.799s user 0m1.721s sys 0m42.420s Huge difference is being able to debug issues throughout test_progs, bpftool, and libbpf without constantly updating 3 Makefiles by hand (including GDB seeing the source code without any extra incantations). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210313210920.1959628-5-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c3999587bc23..d0db2b673c6f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -21,7 +21,7 @@ endif BPF_GCC ?= $(shell command -v bpf-gcc;) SAN_CFLAGS ?= -CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) $(SAN_CFLAGS) \ +CFLAGS += -g -Og -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT) \ -Dbpf_prog_load=bpf_prog_test_load \ @@ -201,6 +201,7 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ CC=$(HOSTCC) LD=$(HOSTLD) \ + EXTRA_CFLAGS='-g -Og' \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install @@ -218,6 +219,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ + EXTRA_CFLAGS='-g -Og' \ DESTDIR=$(SCRATCH_DIR) prefix= all install_headers ifneq ($(BPFOBJ),$(HOST_BPFOBJ)) @@ -225,7 +227,8 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(HOST_BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) \ - OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \ + EXTRA_CFLAGS='-g -Og' \ + OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif -- cgit v1.2.3 From f0b692c4ee2fdd0d4291fe3cbde156792896895e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 16 Mar 2021 17:03:02 +0200 Subject: selftests: mlxsw: Add tc sample tests for new triggers Test that packets are sampled when tc-sample is used with matchall egress binding and flower classifier. Verify that when performing sampling on egress the end-to-end latency is reported as metadata. Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../selftests/drivers/net/mlxsw/tc_sample.sh | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh index 75d00104f291..57b05f042787 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh @@ -41,6 +41,10 @@ ALL_TESTS=" tc_sample_md_lag_oif_test tc_sample_md_out_tc_test tc_sample_md_out_tc_occ_test + tc_sample_md_latency_test + tc_sample_acl_group_conflict_test + tc_sample_acl_rate_test + tc_sample_acl_max_rate_test " NUM_NETIFS=8 CAPTURE_FILE=$(mktemp) @@ -482,6 +486,137 @@ tc_sample_md_out_tc_occ_test() tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall } +tc_sample_md_latency_test() +{ + RET=0 + + # Egress sampling not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp2 egress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "latency " $CAPTURE_FILE + check_err $? "Sampled packets do not have latency attribute" + + log_test "tc sample latency" + + tc filter del dev $rp2 egress protocol all pref 1 handle 101 matchall +} + +tc_sample_acl_group_conflict_test() +{ + RET=0 + + # Test that two flower sampling rules cannot be configured on the same + # port with different groups. + + # Policy-based sampling is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + tc filter add dev $rp1 ingress protocol ip pref 2 handle 102 flower \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule with same group" + + tc filter add dev $rp1 ingress protocol ip pref 3 handle 103 flower \ + skip_sw action sample rate 1024 group 2 &> /dev/null + check_fail $? "Managed to configure sampling rule with conflicting group" + + log_test "tc sample (w/ flower) group conflict test" + + tc filter del dev $rp1 ingress protocol ip pref 2 handle 102 flower + tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower +} + +__tc_sample_acl_rate_test() +{ + local bind=$1; shift + local port=$1; shift + local pkts pct + + RET=0 + + # Policy-based sampling is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $port $bind protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 198.51.100.1 action sample rate 32 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + pkts=$(grep -e "group 1 " $CAPTURE_FILE | wc -l) + pct=$((100 * (pkts - 100) / 100)) + (( -25 <= pct && pct <= 25)) + check_err $? "Expected 100 packets, got $pkts packets, which is $pct% off. Required accuracy is +-25%" + + # Setup a filter that should not match any packet and make sure packets + # are not sampled. + tc filter del dev $port $bind protocol ip pref 1 handle 101 flower + + tc filter add dev $port $bind protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 198.51.100.10 action sample rate 32 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "group 1 " $CAPTURE_FILE + check_fail $? "Sampled packets when should not" + + log_test "tc sample (w/ flower) rate ($bind)" + + tc filter del dev $port $bind protocol ip pref 1 handle 101 flower +} + +tc_sample_acl_rate_test() +{ + __tc_sample_acl_rate_test ingress $rp1 + __tc_sample_acl_rate_test egress $rp2 +} + +tc_sample_acl_max_rate_test() +{ + RET=0 + + # Policy-based sampling is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw action sample rate $((2 ** 24 - 1)) group 1 + check_err $? "Failed to configure sampling rule with max rate" + + tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower + + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw action sample rate $((2 ** 24)) \ + group 1 &> /dev/null + check_fail $? "Managed to configure sampling rate above maximum" + + log_test "tc sample (w/ flower) maximum rate" +} + trap cleanup EXIT setup_prepare -- cgit v1.2.3 From 0f967d9e5a20aeee51bb004295e83c2c913cceda Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 16 Mar 2021 17:03:03 +0200 Subject: selftests: mlxsw: Test egress sampling limitation on Spectrum-1 only Make sure egress sampling configuration only fails on Spectrum-1, given that mlxsw now supports it on Spectrum-{2,3}. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh index 553cb9fad508..b4dbda706c4d 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh @@ -18,6 +18,7 @@ NUM_NETIFS=2 source $lib_dir/tc_common.sh source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh switch_create() { @@ -166,7 +167,8 @@ matchall_sample_egress_test() RET=0 # It is forbidden in mlxsw driver to have matchall with sample action - # bound on egress + # bound on egress. Spectrum-1 specific restriction + [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return tc qdisc add dev $swp1 clsact -- cgit v1.2.3 From ebda107e5f222a086c83ddf6d1ab1da97dd15810 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 16 Mar 2021 15:59:37 +0800 Subject: selftests/bpf: Fix warning comparing pointer to 0 Fix the following coccicheck warnings: ./tools/testing/selftests/bpf/progs/fexit_test.c:77:15-16: WARNING comparing pointer to 0. ./tools/testing/selftests/bpf/progs/fexit_test.c:68:12-13: WARNING comparing pointer to 0. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1615881577-3493-1-git-send-email-jiapeng.chong@linux.alibaba.com --- tools/testing/selftests/bpf/progs/fexit_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/fexit_test.c b/tools/testing/selftests/bpf/progs/fexit_test.c index 0952affb22a6..8f1ccb7302e1 100644 --- a/tools/testing/selftests/bpf/progs/fexit_test.c +++ b/tools/testing/selftests/bpf/progs/fexit_test.c @@ -65,7 +65,7 @@ __u64 test7_result = 0; SEC("fexit/bpf_fentry_test7") int BPF_PROG(test7, struct bpf_fentry_test_t *arg) { - if (arg == 0) + if (!arg) test7_result = 1; return 0; } @@ -74,7 +74,7 @@ __u64 test8_result = 0; SEC("fexit/bpf_fentry_test8") int BPF_PROG(test8, struct bpf_fentry_test_t *arg) { - if (arg->a == 0) + if (!arg->a) test8_result = 1; return 0; } -- cgit v1.2.3 From 56901d483bf14ca729a6c4e06d2b49590b9f1971 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 16 Mar 2021 21:00:48 +0530 Subject: selftests/bpf: Use nanosleep() syscall instead of sleep() in get_cgroup_id Glibc's sleep() switched to clock_nanosleep() from nanosleep(), and thus syscalls:sys_enter_nanosleep tracepoint is not hitting which is causing testcase failure. Instead of depending on glibc sleep(), call nanosleep() systemcall directly. Before: # ./get_cgroup_id_user ... main:FAIL:compare_cgroup_id kern cgid 0 user cgid 483 After: # ./get_cgroup_id_user ... main:PASS:compare_cgroup_id Signed-off-by: Ravi Bangoria Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210316153048.136447-1-ravi.bangoria@linux.ibm.com --- tools/testing/selftests/bpf/get_cgroup_id_user.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c index b8d6aef99db4..99628e1a1e58 100644 --- a/tools/testing/selftests/bpf/get_cgroup_id_user.c +++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c @@ -57,6 +57,10 @@ int main(int argc, char **argv) __u32 key = 0, pid; int exit_code = 1; char buf[256]; + const struct timespec req = { + .tv_sec = 1, + .tv_nsec = 0, + }; cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); if (CHECK(cgroup_fd < 0, "cgroup_setup_and_join", "err %d errno %d\n", cgroup_fd, errno)) @@ -115,7 +119,7 @@ int main(int argc, char **argv) goto close_pmu; /* trigger some syscalls */ - sleep(1); + syscall(__NR_nanosleep, &req, NULL); err = bpf_map_lookup_elem(cgidmap_fd, &key, &kcgid); if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", err, errno)) -- cgit v1.2.3 From 87cb88d3c00275d7dfd695b2002eb53411a53cfa Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 16 Mar 2021 17:55:03 -0700 Subject: perf test: Remove unused argument Remove unused argument from daemon_exit. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210317005505.2794804-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/daemon.sh | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/daemon.sh b/tools/perf/tests/shell/daemon.sh index 5ad3ca8d681b..66ad56b4e0a5 100755 --- a/tools/perf/tests/shell/daemon.sh +++ b/tools/perf/tests/shell/daemon.sh @@ -115,8 +115,7 @@ daemon_start() daemon_exit() { - local base=$1 - local config=$2 + local config=$1 local line=`perf daemon --config ${config} -x: | head -1` local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'` @@ -171,7 +170,7 @@ EOF ${base}/session-time/ack "0" # stop daemon - daemon_exit ${base} ${config} + daemon_exit ${config} rm -rf ${base} rm -f ${config} @@ -288,7 +287,7 @@ EOF done # stop daemon - daemon_exit ${base} ${config} + daemon_exit ${config} rm -rf ${base} rm -f ${config} @@ -333,7 +332,7 @@ EOF fi # stop daemon - daemon_exit ${base} ${config} + daemon_exit ${config} # check that sessions are gone if [ -d "/proc/${pid_size}" ]; then @@ -374,7 +373,7 @@ EOF perf daemon signal --config ${config} # stop daemon - daemon_exit ${base} ${config} + daemon_exit ${config} # count is 2 perf.data for signals and 1 for perf record finished count=`ls ${base}/session-test/ | grep perf.data | wc -l` @@ -420,7 +419,7 @@ EOF fi # stop daemon - daemon_exit ${base} ${config} + daemon_exit ${config} rm -rf ${base} rm -f ${config} @@ -457,7 +456,7 @@ EOF fi # stop daemon - daemon_exit ${base} ${config} + daemon_exit ${config} rm -rf ${base} rm -f ${config} -- cgit v1.2.3 From 078cbb6f75f16a16fb843a431e83c2f92605bb75 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 16 Mar 2021 17:55:04 -0700 Subject: perf test: Cleanup daemon if test is interrupted. Reorder daemon_start and daemon_exit as the trap handler is added in daemon_start referencing daemon_exit. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210317005505.2794804-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/daemon.sh | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/daemon.sh b/tools/perf/tests/shell/daemon.sh index 66ad56b4e0a5..61d13c4c64b8 100755 --- a/tools/perf/tests/shell/daemon.sh +++ b/tools/perf/tests/shell/daemon.sh @@ -98,6 +98,23 @@ check_line_other() fi } +daemon_exit() +{ + local config=$1 + + local line=`perf daemon --config ${config} -x: | head -1` + local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'` + + # Reset trap handler. + trap - SIGINT SIGTERM + + # stop daemon + perf daemon stop --config ${config} + + # ... and wait for the pid to go away + tail --pid=${pid} -f /dev/null +} + daemon_start() { local config=$1 @@ -105,6 +122,9 @@ daemon_start() perf daemon start --config ${config} + # Clean up daemon if interrupted. + trap "echo 'FAILED: Signal caught'; daemon_exit ${config}; exit 1" SIGINT SIGTERM + # wait for the session to ping local state="FAIL" while [ "${state}" != "OK" ]; do @@ -113,20 +133,6 @@ daemon_start() done } -daemon_exit() -{ - local config=$1 - - local line=`perf daemon --config ${config} -x: | head -1` - local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'` - - # stop daemon - perf daemon stop --config ${config} - - # ... and wait for the pid to go away - tail --pid=${pid} -f /dev/null -} - test_list() { echo "test daemon list" -- cgit v1.2.3 From a6cb06ff49fd522aaeecfee7d5952ac6e2ab9d13 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 16 Mar 2021 17:55:05 -0700 Subject: perf test: Add 30s timeout for wait for daemon start. Retry the ping loop upto 600 times, or approximately 30 seconds, to make sure the test does hang at start up. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210317005505.2794804-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/daemon.sh | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/daemon.sh b/tools/perf/tests/shell/daemon.sh index 61d13c4c64b8..ee4a30ca3f57 100755 --- a/tools/perf/tests/shell/daemon.sh +++ b/tools/perf/tests/shell/daemon.sh @@ -127,9 +127,16 @@ daemon_start() # wait for the session to ping local state="FAIL" + local retries=0 while [ "${state}" != "OK" ]; do state=`perf daemon ping --config ${config} --session ${session} | awk '{ print $1 }'` sleep 0.05 + retries=$((${retries} +1)) + if [ ${retries} -ge 600 ]; then + echo "FAILED: Timeout waiting for daemon to ping" + daemon_exit ${config} + exit 1 + fi done } -- cgit v1.2.3 From 0705ef64d1ff52b817e278ca6e28095585ff31e1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 17 Mar 2021 11:33:04 +0100 Subject: tools/insn: Restore the relative include paths for cross building Building perf on ppc causes: In file included from util/intel-pt-decoder/intel-pt-insn-decoder.c:15: util/intel-pt-decoder/../../../arch/x86/lib/insn.c:14:10: fatal error: asm/inat.h: No such file or directory 14 | #include /*__ignore_sync_check__ */ | ^~~~~~~~~~~~ Restore the relative include paths so that the compiler can find the headers. Fixes: 93281c4a9657 ("x86/insn: Add an insn_decode() API") Reported-by: Ian Rogers Reported-by: Stephen Rothwell Signed-off-by: Borislav Petkov Tested-by: Ian Rogers Tested-by: Stephen Rothwell Link: https://lkml.kernel.org/r/20210317150858.02b1bbc8@canb.auug.org.au --- tools/arch/x86/lib/insn.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index cd4dedde3265..c41f95815480 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -11,13 +11,13 @@ #else #include #endif -#include /*__ignore_sync_check__ */ -#include /* __ignore_sync_check__ */ +#include "../include/asm/inat.h" /* __ignore_sync_check__ */ +#include "../include/asm/insn.h" /* __ignore_sync_check__ */ #include #include -#include /* __ignore_sync_check__ */ +#include "../include/asm/emulate_prefix.h" /* __ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ ({ \ -- cgit v1.2.3 From 35f15ab378fa7e1eaa25798076a457523b5ace75 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Mar 2021 12:35:28 +0200 Subject: selftests: forwarding: Add test for dual VxLAN bridge Configure VxLAN with an 802.1ad bridge and VxLAN with an 802.1d bridge at the same time in same switch, verify that traffic passed as expected. Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../selftests/net/forwarding/dual_vxlan_bridge.sh | 366 +++++++++++++++++++++ 1 file changed, 366 insertions(+) create mode 100755 tools/testing/selftests/net/forwarding/dual_vxlan_bridge.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/dual_vxlan_bridge.sh b/tools/testing/selftests/net/forwarding/dual_vxlan_bridge.sh new file mode 100755 index 000000000000..5148d97a5df8 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/dual_vxlan_bridge.sh @@ -0,0 +1,366 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +--------------------+ +----------------------+ +# | H1 (vrf) | | H2 (vrf) | +# | + h1.10 | | + h2.20 | +# | | 192.0.2.1/28 | | | 192.0.2.2/28 | +# | | | | | | +# | + $h1 | | + $h2 | +# | | | | | | +# +----|---------------+ +--|-------------------+ +# | | +# +----|--------------------------------------------------|--------------------+ +# | SW | | | +# | +--|-------------------------------+ +----------------|------------------+ | +# | | + $swp1 BR1 (802.1ad) | | BR2 (802.1d) + $swp2 | | +# | | vid 100 pvid untagged | | | | | +# | | | | + $swp2.20 | | +# | | | | | | +# | | + vx100 (vxlan) | | + vx200 (vxlan) | | +# | | local 192.0.2.17 | | local 192.0.2.17 | | +# | | remote 192.0.2.34 | | remote 192.0.2.50 | | +# | | id 1000 dstport $VXPORT | | id 2000 dstport $VXPORT | | +# | | vid 100 pvid untagged | | | | +# | +--------------------------------- + +-----------------------------------+ | +# | | +# | 192.0.2.32/28 via 192.0.2.18 | +# | 192.0.2.48/28 via 192.0.2.18 | +# | | +# | + $rp1 | +# | | 192.0.2.17/28 | +# +----|-----------------------------------------------------------------------+ +# | +# +----|--------------------------------------------------------+ +# | | VRP2 (vrf) | +# | + $rp2 | +# | 192.0.2.18/28 | +# | | (maybe) HW +# ============================================================================= +# | | (likely) SW +# | + v1 (veth) + v3 (veth) | +# | | 192.0.2.33/28 | 192.0.2.49/28 | +# +----|---------------------------------------|----------------+ +# | | +# +----|------------------------------+ +----|------------------------------+ +# | + v2 (veth) NS1 (netns) | | + v4 (veth) NS2 (netns) | +# | 192.0.2.34/28 | | 192.0.2.50/28 | +# | | | | +# | 192.0.2.16/28 via 192.0.2.33 | | 192.0.2.16/28 via 192.0.2.49 | +# | 192.0.2.50/32 via 192.0.2.33 | | 192.0.2.34/32 via 192.0.2.49 | +# | | | | +# | +-------------------------------+ | | +-------------------------------+ | +# | | BR3 (802.1ad) | | | | BR3 (802.1d) | | +# | | + vx100 (vxlan) | | | | + vx200 (vxlan) | | +# | | local 192.0.2.34 | | | | local 192.0.2.50 | | +# | | remote 192.0.2.17 | | | | remote 192.0.2.17 | | +# | | remote 192.0.2.50 | | | | remote 192.0.2.34 | | +# | | id 1000 dstport $VXPORT | | | | id 2000 dstport $VXPORT | | +# | | vid 100 pvid untagged | | | | | | +# | | | | | | + w1.20 | | +# | | | | | | | | | +# | | + w1 (veth) | | | | + w1 (veth) | | +# | | | vid 100 pvid untagged | | | | | | | +# | +--|----------------------------+ | | +--|----------------------------+ | +# | | | | | | +# | +--|----------------------------+ | | +--|----------------------------+ | +# | | | VW2 (vrf) | | | | | VW2 (vrf) | | +# | | + w2 (veth) | | | | + w2 (veth) | | +# | | | | | | | | | | +# | | | | | | | | | | +# | | + w2.10 | | | | + w2.20 | | +# | | 192.0.2.3/28 | | | | 192.0.2.4/28 | | +# | +-------------------------------+ | | +-------------------------------+ | +# +-----------------------------------+ +-----------------------------------+ + +: ${VXPORT:=4789} +export VXPORT + +: ${ALL_TESTS:=" + ping_ipv4 + "} + +NUM_NETIFS=6 +source lib.sh + +h1_create() +{ + simple_if_init $h1 + tc qdisc add dev $h1 clsact + vlan_create $h1 10 v$h1 192.0.2.1/28 +} + +h1_destroy() +{ + vlan_destroy $h1 10 + tc qdisc del dev $h1 clsact + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + tc qdisc add dev $h2 clsact + vlan_create $h2 20 v$h2 192.0.2.2/28 +} + +h2_destroy() +{ + vlan_destroy $h2 20 + tc qdisc del dev $h2 clsact + simple_if_fini $h2 +} + +rp1_set_addr() +{ + ip address add dev $rp1 192.0.2.17/28 + + ip route add 192.0.2.32/28 nexthop via 192.0.2.18 + ip route add 192.0.2.48/28 nexthop via 192.0.2.18 +} + +rp1_unset_addr() +{ + ip route del 192.0.2.48/28 nexthop via 192.0.2.18 + ip route del 192.0.2.32/28 nexthop via 192.0.2.18 + + ip address del dev $rp1 192.0.2.17/28 +} + +switch_create() +{ + #### BR1 #### + ip link add name br1 type bridge vlan_filtering 1 \ + vlan_protocol 802.1ad vlan_default_pvid 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. + ip link set dev br1 address $(mac_get $swp1) + ip link set dev br1 up + + #### BR2 #### + ip link add name br2 type bridge vlan_filtering 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. + ip link set dev br2 address $(mac_get $swp2) + ip link set dev br2 up + + ip link set dev $rp1 up + rp1_set_addr + + #### VX100 #### + ip link add name vx100 type vxlan id 1000 local 192.0.2.17 \ + dstport "$VXPORT" nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx100 up + + ip link set dev vx100 master br1 + bridge vlan add vid 100 dev vx100 pvid untagged + + ip link set dev $swp1 master br1 + ip link set dev $swp1 up + bridge vlan add vid 100 dev $swp1 pvid untagged + + #### VX200 #### + ip link add name vx200 type vxlan id 2000 local 192.0.2.17 \ + dstport "$VXPORT" nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx200 up + + ip link set dev vx200 master br2 + + ip link set dev $swp2 up + ip link add name $swp2.20 link $swp2 type vlan id 20 + ip link set dev $swp2.20 master br2 + ip link set dev $swp2.20 up + + bridge fdb append dev vx100 00:00:00:00:00:00 dst 192.0.2.34 self + bridge fdb append dev vx200 00:00:00:00:00:00 dst 192.0.2.50 self +} + +switch_destroy() +{ + bridge fdb del dev vx200 00:00:00:00:00:00 dst 192.0.2.50 self + bridge fdb del dev vx100 00:00:00:00:00:00 dst 192.0.2.34 self + + ip link set dev vx200 nomaster + ip link set dev vx200 down + ip link del dev vx200 + + ip link del dev $swp2.20 + ip link set dev $swp2 down + ip link set dev $swp2 nomaster + + bridge vlan del vid 100 dev $swp1 + ip link set dev $swp1 down + ip link set dev $swp1 nomaster + + ip link set dev vx100 nomaster + ip link set dev vx100 down + ip link del dev vx100 + + rp1_unset_addr + ip link set dev $rp1 down + + ip link set dev br2 down + ip link del dev br2 + + ip link set dev br1 down + ip link del dev br1 +} + +vrp2_create() +{ + simple_if_init $rp2 192.0.2.18/28 + __simple_if_init v1 v$rp2 192.0.2.33/28 + __simple_if_init v3 v$rp2 192.0.2.49/28 + tc qdisc add dev v1 clsact +} + +vrp2_destroy() +{ + tc qdisc del dev v1 clsact + __simple_if_fini v3 192.0.2.49/28 + __simple_if_fini v1 192.0.2.33/28 + simple_if_fini $rp2 192.0.2.18/28 +} + +ns_init_common() +{ + local in_if=$1; shift + local in_addr=$1; shift + local other_in_addr=$1; shift + local vxlan_name=$1; shift + local vxlan_id=$1; shift + local vlan_id=$1; shift + local host_addr=$1; shift + local nh_addr=$1; shift + + ip link set dev $in_if up + ip address add dev $in_if $in_addr/28 + tc qdisc add dev $in_if clsact + + ip link add name br3 type bridge vlan_filtering 0 + ip link set dev br3 up + + ip link add name w1 type veth peer name w2 + + ip link set dev w1 master br3 + ip link set dev w1 up + + ip link add name $vxlan_name type vxlan id $vxlan_id local $in_addr \ + dstport "$VXPORT" + ip link set dev $vxlan_name up + bridge fdb append dev $vxlan_name 00:00:00:00:00:00 dst 192.0.2.17 self + bridge fdb append dev $vxlan_name 00:00:00:00:00:00 dst $other_in_addr self + + ip link set dev $vxlan_name master br3 + tc qdisc add dev $vxlan_name clsact + + simple_if_init w2 + vlan_create w2 $vlan_id vw2 $host_addr/28 + + ip route add 192.0.2.16/28 nexthop via $nh_addr + ip route add $other_in_addr/32 nexthop via $nh_addr +} +export -f ns_init_common + +ns1_create() +{ + ip netns add ns1 + ip link set dev v2 netns ns1 + in_ns ns1 \ + ns_init_common v2 192.0.2.34 192.0.2.50 vx100 1000 10 192.0.2.3 \ + 192.0.2.33 + + in_ns ns1 bridge vlan add vid 100 dev vx100 pvid untagged +} + +ns1_destroy() +{ + ip netns exec ns1 ip link set dev v2 netns 1 + ip netns del ns1 +} + +ns2_create() +{ + ip netns add ns2 + ip link set dev v4 netns ns2 + in_ns ns2 \ + ns_init_common v4 192.0.2.50 192.0.2.34 vx200 2000 20 192.0.2.4 \ + 192.0.2.49 + + in_ns ns2 ip link add name w1.20 link w1 type vlan id 20 + in_ns ns2 ip link set dev w1.20 master br3 + in_ns ns2 ip link set dev w1.20 up +} + +ns2_destroy() +{ + ip netns exec ns2 ip link set dev v4 netns 1 + ip netns del ns2 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + rp1=${NETIFS[p5]} + rp2=${NETIFS[p6]} + + vrf_prepare + forwarding_enable + + h1_create + h2_create + switch_create + + ip link add name v1 type veth peer name v2 + ip link add name v3 type veth peer name v4 + vrp2_create + ns1_create + ns2_create + + r1_mac=$(in_ns ns1 mac_get w2) + r2_mac=$(in_ns ns2 mac_get w2) + h2_mac=$(mac_get $h2) +} + +cleanup() +{ + pre_cleanup + + ns2_destroy + ns1_destroy + vrp2_destroy + ip link del dev v3 + ip link del dev v1 + + switch_destroy + h2_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 192.0.2.3 ": local->remote 1 through VxLAN with an 802.1ad bridge" + ping_test $h2 192.0.2.4 ": local->remote 2 through VxLAN with an 802.1d bridge" +} + +test_all() +{ + echo "Running tests with UDP port $VXPORT" + tests_run +} + +trap cleanup EXIT + +setup_prepare +setup_wait +test_all + +exit $EXIT_STATUS -- cgit v1.2.3 From 1724c97d2f9ddcfe2e372f9b02d6efde15b885b0 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 17 Mar 2021 12:35:29 +0200 Subject: selftests: mlxsw: spectrum-2: Remove q_in_vni_veto test q_in_vni_veto.sh is not needed anymore because VxLAN with an 802.1ad bridge and VxLAN with an 802.1d bridge can coexist. Remove the test. Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh | 77 ---------------------- 1 file changed, 77 deletions(-) delete mode 100755 tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh deleted file mode 100755 index 0231205a7147..000000000000 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -lib_dir=$(dirname $0)/../../../../net/forwarding - -VXPORT=4789 - -ALL_TESTS=" - create_dot1d_and_dot1ad_vxlans -" -NUM_NETIFS=2 -source $lib_dir/lib.sh - -setup_prepare() -{ - swp1=${NETIFS[p1]} - swp2=${NETIFS[p2]} - - ip link set dev $swp1 up - ip link set dev $swp2 up -} - -cleanup() -{ - pre_cleanup - - ip link set dev $swp2 down - ip link set dev $swp1 down -} - -create_dot1d_and_dot1ad_vxlans() -{ - RET=0 - - ip link add dev br0 type bridge vlan_filtering 1 vlan_protocol 802.1ad \ - vlan_default_pvid 0 mcast_snooping 0 - ip link set dev br0 up - - ip link add name vx100 type vxlan id 1000 local 192.0.2.17 dstport \ - "$VXPORT" nolearning noudpcsum tos inherit ttl 100 - ip link set dev vx100 up - - ip link set dev $swp1 master br0 - ip link set dev vx100 master br0 - bridge vlan add vid 100 dev vx100 pvid untagged - - ip link add dev br1 type bridge vlan_filtering 0 mcast_snooping 0 - ip link set dev br1 up - - ip link add name vx200 type vxlan id 2000 local 192.0.2.17 dstport \ - "$VXPORT" nolearning noudpcsum tos inherit ttl 100 - ip link set dev vx200 up - - ip link set dev $swp2 master br1 - ip link set dev vx200 master br1 2>/dev/null - check_fail $? "802.1d and 802.1ad VxLANs at the same time not rejected" - - ip link set dev vx200 master br1 2>&1 >/dev/null \ - | grep -q mlxsw_spectrum - check_err $? "802.1d and 802.1ad VxLANs at the same time rejected without extack" - - log_test "create 802.1d and 802.1ad VxLANs" - - ip link del dev vx200 - ip link del dev br1 - ip link del dev vx100 - ip link del dev br0 -} - -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS -- cgit v1.2.3 From 9ae2c26e43248b722e79fe867be38062c9dd1e5f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Mar 2021 13:05:09 -0700 Subject: libbpf: provide NULL and KERNEL_VERSION macros in bpf_helpers.h Given that vmlinux.h is not compatible with headers like stddef.h, NULL poses an annoying problem: it is defined as #define, so is not captured in BTF, so is not emitted into vmlinux.h. This leads to users either sticking to explicit 0, or defining their own NULL (as progs/skb_pkt_end.c does). But it's easy for bpf_helpers.h to provide (conditionally) NULL definition. Similarly, KERNEL_VERSION is another commonly missed macro that came up multiple times. So this patch adds both of them, along with offsetof(), that also is typically defined in stddef.h, just like NULL. This might cause compilation warning for existing BPF applications defining their own NULL and/or KERNEL_VERSION already: progs/skb_pkt_end.c:7:9: warning: 'NULL' macro redefined [-Wmacro-redefined] #define NULL 0 ^ /tmp/linux/tools/testing/selftests/bpf/tools/include/vmlinux.h:4:9: note: previous definition is here #define NULL ((void *)0) ^ It is trivial to fix, though, so long-term benefits outweight temporary inconveniences. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20210317200510.1354627-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf_helpers.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 53ff81c49dbd..cc2e51c64a54 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -40,8 +40,22 @@ #define __weak __attribute__((weak)) #endif +/* When utilizing vmlinux.h with BPF CO-RE, user BPF programs can't include + * any system-level headers (such as stddef.h, linux/version.h, etc), and + * commonly-used macros like NULL and KERNEL_VERSION aren't available through + * vmlinux.h. This just adds unnecessary hurdles and forces users to re-define + * them on their own. So as a convenience, provide such definitions here. + */ +#ifndef NULL +#define NULL ((void *)0) +#endif + +#ifndef KERNEL_VERSION +#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c)) +#endif + /* - * Helper macro to manipulate data structures + * Helper macros to manipulate data structures */ #ifndef offsetof #define offsetof(TYPE, MEMBER) ((unsigned long)&((TYPE *)0)->MEMBER) -- cgit v1.2.3 From c53a3355eb2976fc4eb4d42862db0eea205045a1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Mar 2021 13:05:10 -0700 Subject: selftests/bpf: drop custom NULL #define in skb_pkt_end selftest Now that bpftool generates NULL definition as part of vmlinux.h, drop custom NULL definition in skb_pkt_end.c. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20210317200510.1354627-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/skb_pkt_end.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/skb_pkt_end.c b/tools/testing/selftests/bpf/progs/skb_pkt_end.c index cf6823f42e80..7f2eaa2f89f8 100644 --- a/tools/testing/selftests/bpf/progs/skb_pkt_end.c +++ b/tools/testing/selftests/bpf/progs/skb_pkt_end.c @@ -4,7 +4,6 @@ #include #include -#define NULL 0 #define INLINE __always_inline #define skb_shorter(skb, len) ((void *)(long)(skb)->data + (len) > (void *)(long)skb->data_end) -- cgit v1.2.3 From f706bb59204ba1c47e896b456c97977fc97b7964 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 4 Mar 2021 09:01:55 -0800 Subject: selftests/x86: Add a missing .note.GNU-stack section to thunks_32.S test_syscall_vdso_32 ended up with an executable stacks because the asm was missing the annotation that says that it is modern and doesn't need an executable stack. Add the annotation. This was missed in commit aeaaf005da1d ("selftests/x86: Add missing .note.GNU-stack sections"). Fixes: aeaaf005da1d ("selftests/x86: Add missing .note.GNU-stack sections") Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/487ed5348a43c031b816fa7e9efedb75dc324299.1614877299.git.luto@kernel.org --- tools/testing/selftests/x86/thunks_32.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/x86/thunks_32.S b/tools/testing/selftests/x86/thunks_32.S index a71d92da8f46..f3f56e681e9f 100644 --- a/tools/testing/selftests/x86/thunks_32.S +++ b/tools/testing/selftests/x86/thunks_32.S @@ -45,3 +45,5 @@ call64_from_32: ret .size call64_from_32, .-call64_from_32 + +.section .note.GNU-stack,"",%progbits -- cgit v1.2.3 From e20f67026b5ead2afc5627e98b45e6b65e7fb38c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 15 Mar 2021 13:08:35 +0100 Subject: tools/x86/kcpuid: Check last token too Input lines like 0x8000001E, 0, EAX, 31:0, Extended APIC ID where the short name is missing lead to a segfault because the loop takes the long name for the short name and tokens[5] becomes NULL which explodes later in strcpy(). Check its value too before further processing. Signed-off-by: Borislav Petkov Acked-by: Feng Tang Link: https://lkml.kernel.org/r/20210315125901.30315-1-bp@alien8.de --- tools/arch/x86/kcpuid/kcpuid.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 6048da34fcc6..dae75511fef7 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -324,6 +324,8 @@ static int parse_line(char *line) str = NULL; } tokens[5] = strtok(str, "\n"); + if (!tokens[5]) + goto err_exit; /* index/main-leaf */ index = strtoull(tokens[0], NULL, 0); -- cgit v1.2.3 From f281854fa743f3474b2d0d69533301f48cf0e184 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 15 Mar 2021 13:55:14 +0100 Subject: tools/x86/kcpuid: Add AMD leaf 0x8000001E Contains core IDs, node IDs and other topology info. Signed-off-by: Borislav Petkov Acked-by: Feng Tang Link: https://lkml.kernel.org/r/20210315125901.30315-2-bp@alien8.de --- tools/arch/x86/kcpuid/cpuid.csv | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/kcpuid/cpuid.csv b/tools/arch/x86/kcpuid/cpuid.csv index dd94c07421a8..4f1c4b0c29e9 100644 --- a/tools/arch/x86/kcpuid/cpuid.csv +++ b/tools/arch/x86/kcpuid/cpuid.csv @@ -379,12 +379,22 @@ 0x80000008, 0, EAX, 15:8, lnr_adr_bits, Linear Address Bits 0x80000007, 0, EBX, 9, wbnoinvd, WBNOINVD +# 0x8000001E +# EAX: Extended APIC ID +0x8000001E, 0, EAX, 31:0, extended_apic_id, Extended APIC ID +# EBX: Core Identifiers +0x8000001E, 0, EBX, 7:0, core_id, Identifies the logical core ID +0x8000001E, 0, EBX, 15:8, threads_per_core, The number of threads per core is threads_per_core + 1 +# ECX: Node Identifiers +0x8000001E, 0, ECX, 7:0, node_id, Node ID +0x8000001E, 0, ECX, 10:8, nodes_per_processor, Nodes per processor { 0: 1 node, else reserved } + # 8000001F: AMD Secure Encryption -0x8000001F, 0, EAX, 0, sme, Secure Memory Encryption -0x8000001F, 0, EAX, 1, sev, Secure Encrypted Virtualization -0x8000001F, 0, EAX, 2, vmpgflush, VM Page Flush MSR -0x8000001F, 0, EAX, 3, seves, SEV Encrypted State -0x8000001F, 0, EBX, 5:0, c-bit, Page table bit number used to enable memory encryption -0x8000001F, 0, EBX, 11:6, mem_encrypt_physaddr_width, Reduction of physical address space in bits with SME enabled -0x8000001F, 0, ECX, 31:0, num_encrypted_guests, Maximum ASID value that may be used for an SEV-enabled guest -0x8000001F, 0, EDX, 31:0, minimum_sev_asid, Minimum ASID value that must be used for an SEV-enabled, SEV-ES-disabled guest +0x8000001F, 0, EAX, 0, sme, Secure Memory Encryption +0x8000001F, 0, EAX, 1, sev, Secure Encrypted Virtualization +0x8000001F, 0, EAX, 2, vmpgflush, VM Page Flush MSR +0x8000001F, 0, EAX, 3, seves, SEV Encrypted State +0x8000001F, 0, EBX, 5:0, c-bit, Page table bit number used to enable memory encryption +0x8000001F, 0, EBX, 11:6, mem_encrypt_physaddr_width, Reduction of physical address space in bits with SME enabled +0x8000001F, 0, ECX, 31:0, num_encrypted_guests, Maximum ASID value that may be used for an SEV-enabled guest +0x8000001F, 0, EDX, 31:0, minimum_sev_asid, Minimum ASID value that must be used for an SEV-enabled, SEV-ES-disabled guest -- cgit v1.2.3 From 421d9d1bea6545543c00ffba4c83f369510de9a1 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Mon, 8 Mar 2021 02:24:59 +0000 Subject: tools/latency-collector: Remove unneeded semicolon Fix semicolon.cocci warning: tools/tracing/latency/latency-collector.c:1021:2-3: Unneeded semicolon Link: https://lkml.kernel.org/r/20210308022459.59881-1-vulab@iscas.ac.cn Reviewed-by: Viktor Rosendahl Signed-off-by: Xu Wang Signed-off-by: Steven Rostedt (VMware) --- tools/tracing/latency/latency-collector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c index b69de9263ee6..3a2e6bb781a8 100644 --- a/tools/tracing/latency/latency-collector.c +++ b/tools/tracing/latency/latency-collector.c @@ -1018,7 +1018,7 @@ static long go_to_sleep(const struct entry *req) cond_timedwait(&printstate.cond, &printstate.mutex, &future); if (time_has_passed(&future)) break; - }; + } if (printstate_has_new_req_arrived(req)) delay = -1; @@ -1941,7 +1941,7 @@ static void scan_arguments(int argc, char *argv[]) if (value < 0) { warnx("TIME must be >= 0\n"); show_usage(); - ; + exit(0); } trace_enable = true; use_random_sleep = true; -- cgit v1.2.3 From e14ef4bf011192b61b48c4f3a35b3041140073ff Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:25 -0700 Subject: libbpf: Expose btf_type_by_id() internally btf_type_by_id() is internal-only convenience API returning non-const pointer to struct btf_type. Expose it outside of btf.c for re-use. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210318194036.3521577-2-andrii@kernel.org --- tools/lib/bpf/btf.c | 2 +- tools/lib/bpf/libbpf_internal.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 3aa58f2ac183..e0b0a78b04fe 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -435,7 +435,7 @@ const struct btf *btf__base_btf(const struct btf *btf) } /* internal helper returning non-const pointer to a type */ -static struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id) +struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id) { if (type_id == 0) return &btf_void; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 343f6eb05637..d09860e435c8 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -107,6 +107,11 @@ static inline void *libbpf_reallocarray(void *ptr, size_t nmemb, size_t size) return realloc(ptr, total); } +struct btf; +struct btf_type; + +struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id); + void *btf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t cur_cnt, size_t max_cnt, size_t add_cnt); int btf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); -- cgit v1.2.3 From f36e99a45dbe76949eb99bba413c67eda5cd2591 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:26 -0700 Subject: libbpf: Generalize BTF and BTF.ext type ID and strings iteration Extract and generalize the logic to iterate BTF type ID and string offset fields within BTF types and .BTF.ext data. Expose this internally in libbpf for re-use by bpf_linker. Additionally, complete strings deduplication handling for BTF.ext (e.g., CO-RE access strings), which was previously missing. There previously was no case of deduplicating .BTF.ext data, but bpf_linker is going to use it. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210318194036.3521577-3-andrii@kernel.org --- tools/lib/bpf/btf.c | 393 ++++++++++++++++++++++------------------ tools/lib/bpf/libbpf_internal.h | 7 + 2 files changed, 228 insertions(+), 172 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index e0b0a78b04fe..e137781f9bc6 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -3155,95 +3155,28 @@ done: return d; } -typedef int (*str_off_fn_t)(__u32 *str_off_ptr, void *ctx); - /* * Iterate over all possible places in .BTF and .BTF.ext that can reference * string and pass pointer to it to a provided callback `fn`. */ -static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx) +static int btf_for_each_str_off(struct btf_dedup *d, str_off_visit_fn fn, void *ctx) { - void *line_data_cur, *line_data_end; - int i, j, r, rec_size; - struct btf_type *t; + int i, r; for (i = 0; i < d->btf->nr_types; i++) { - t = btf_type_by_id(d->btf, d->btf->start_id + i); - r = fn(&t->name_off, ctx); + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); + + r = btf_type_visit_str_offs(t, fn, ctx); if (r) return r; - - switch (btf_kind(t)) { - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: { - struct btf_member *m = btf_members(t); - __u16 vlen = btf_vlen(t); - - for (j = 0; j < vlen; j++) { - r = fn(&m->name_off, ctx); - if (r) - return r; - m++; - } - break; - } - case BTF_KIND_ENUM: { - struct btf_enum *m = btf_enum(t); - __u16 vlen = btf_vlen(t); - - for (j = 0; j < vlen; j++) { - r = fn(&m->name_off, ctx); - if (r) - return r; - m++; - } - break; - } - case BTF_KIND_FUNC_PROTO: { - struct btf_param *m = btf_params(t); - __u16 vlen = btf_vlen(t); - - for (j = 0; j < vlen; j++) { - r = fn(&m->name_off, ctx); - if (r) - return r; - m++; - } - break; - } - default: - break; - } } if (!d->btf_ext) return 0; - line_data_cur = d->btf_ext->line_info.info; - line_data_end = d->btf_ext->line_info.info + d->btf_ext->line_info.len; - rec_size = d->btf_ext->line_info.rec_size; - - while (line_data_cur < line_data_end) { - struct btf_ext_info_sec *sec = line_data_cur; - struct bpf_line_info_min *line_info; - __u32 num_info = sec->num_info; - - r = fn(&sec->sec_name_off, ctx); - if (r) - return r; - - line_data_cur += sizeof(struct btf_ext_info_sec); - for (i = 0; i < num_info; i++) { - line_info = line_data_cur; - r = fn(&line_info->file_name_off, ctx); - if (r) - return r; - r = fn(&line_info->line_off, ctx); - if (r) - return r; - line_data_cur += rec_size; - } - } + r = btf_ext_visit_str_offs(d->btf_ext, fn, ctx); + if (r) + return r; return 0; } @@ -4498,15 +4431,18 @@ static int btf_dedup_compact_types(struct btf_dedup *d) * then mapping it to a deduplicated type ID, stored in btf_dedup->hypot_map, * which is populated during compaction phase. */ -static int btf_dedup_remap_type_id(struct btf_dedup *d, __u32 type_id) +static int btf_dedup_remap_type_id(__u32 *type_id, void *ctx) { + struct btf_dedup *d = ctx; __u32 resolved_type_id, new_type_id; - resolved_type_id = resolve_type_id(d, type_id); + resolved_type_id = resolve_type_id(d, *type_id); new_type_id = d->hypot_map[resolved_type_id]; if (new_type_id > BTF_MAX_NR_TYPES) return -EINVAL; - return new_type_id; + + *type_id = new_type_id; + return 0; } /* @@ -4519,109 +4455,25 @@ static int btf_dedup_remap_type_id(struct btf_dedup *d, __u32 type_id) * referenced from any BTF type (e.g., struct fields, func proto args, etc) to * their final deduped type IDs. */ -static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id) +static int btf_dedup_remap_types(struct btf_dedup *d) { - struct btf_type *t = btf_type_by_id(d->btf, type_id); int i, r; - switch (btf_kind(t)) { - case BTF_KIND_INT: - case BTF_KIND_ENUM: - case BTF_KIND_FLOAT: - break; - - case BTF_KIND_FWD: - case BTF_KIND_CONST: - case BTF_KIND_VOLATILE: - case BTF_KIND_RESTRICT: - case BTF_KIND_PTR: - case BTF_KIND_TYPEDEF: - case BTF_KIND_FUNC: - case BTF_KIND_VAR: - r = btf_dedup_remap_type_id(d, t->type); - if (r < 0) - return r; - t->type = r; - break; - - case BTF_KIND_ARRAY: { - struct btf_array *arr_info = btf_array(t); - - r = btf_dedup_remap_type_id(d, arr_info->type); - if (r < 0) - return r; - arr_info->type = r; - r = btf_dedup_remap_type_id(d, arr_info->index_type); - if (r < 0) - return r; - arr_info->index_type = r; - break; - } - - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: { - struct btf_member *member = btf_members(t); - __u16 vlen = btf_vlen(t); - - for (i = 0; i < vlen; i++) { - r = btf_dedup_remap_type_id(d, member->type); - if (r < 0) - return r; - member->type = r; - member++; - } - break; - } - - case BTF_KIND_FUNC_PROTO: { - struct btf_param *param = btf_params(t); - __u16 vlen = btf_vlen(t); + for (i = 0; i < d->btf->nr_types; i++) { + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); - r = btf_dedup_remap_type_id(d, t->type); - if (r < 0) + r = btf_type_visit_type_ids(t, btf_dedup_remap_type_id, d); + if (r) return r; - t->type = r; - - for (i = 0; i < vlen; i++) { - r = btf_dedup_remap_type_id(d, param->type); - if (r < 0) - return r; - param->type = r; - param++; - } - break; - } - - case BTF_KIND_DATASEC: { - struct btf_var_secinfo *var = btf_var_secinfos(t); - __u16 vlen = btf_vlen(t); - - for (i = 0; i < vlen; i++) { - r = btf_dedup_remap_type_id(d, var->type); - if (r < 0) - return r; - var->type = r; - var++; - } - break; - } - - default: - return -EINVAL; } - return 0; -} + if (!d->btf_ext) + return 0; -static int btf_dedup_remap_types(struct btf_dedup *d) -{ - int i, r; + r = btf_ext_visit_type_ids(d->btf_ext, btf_dedup_remap_type_id, d); + if (r) + return r; - for (i = 0; i < d->btf->nr_types; i++) { - r = btf_dedup_remap_type(d, d->btf->start_id + i); - if (r < 0) - return r; - } return 0; } @@ -4675,3 +4527,200 @@ struct btf *libbpf_find_kernel_btf(void) pr_warn("failed to find valid kernel BTF\n"); return ERR_PTR(-ESRCH); } + +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx) +{ + int i, n, err; + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + return 0; + + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + return visit(&t->type, ctx); + + case BTF_KIND_ARRAY: { + struct btf_array *a = btf_array(t); + + err = visit(&a->type, ctx); + err = err ?: visit(&a->index_type, ctx); + return err; + } + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + err = visit(&t->type, ctx); + if (err) + return err; + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_DATASEC: { + struct btf_var_secinfo *m = btf_var_secinfos(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + default: + return -EINVAL; + } +} + +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx) +{ + int i, n, err; + + err = visit(&t->name_off, ctx); + if (err) + return err; + + switch (btf_kind(t)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM: { + struct btf_enum *m = btf_enum(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + default: + break; + } + + return 0; +} + +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_func_info_min *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + return 0; +} + +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + } + + seg = &btf_ext->line_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_line_info_min *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->file_name_off, ctx); + if (err) + return err; + err = visit(&rec->line_off, ctx); + if (err) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->access_str_off, ctx); + if (err) + return err; + } + } + + return 0; +} diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index d09860e435c8..97b6b9cc9839 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -356,4 +356,11 @@ struct bpf_core_relo { enum bpf_core_relo_kind kind; }; +typedef int (*type_id_visit_fn)(__u32 *type_id, void *ctx); +typedef int (*str_off_visit_fn)(__u32 *str_off, void *ctx); +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx); +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx); +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx); +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx); + #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ -- cgit v1.2.3 From 3b029e06f624efa90c9a4354e408acf134adb185 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:27 -0700 Subject: libbpf: Rename internal memory-management helpers Rename btf_add_mem() and btf_ensure_mem() helpers that abstract away details of dynamically resizable memory to use libbpf_ prefix, as they are not BTF-specific. No functional changes. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210318194036.3521577-4-andrii@kernel.org --- tools/lib/bpf/btf.c | 24 ++++++++++++------------ tools/lib/bpf/btf_dump.c | 8 ++++---- tools/lib/bpf/libbpf.c | 4 ++-- tools/lib/bpf/libbpf_internal.h | 6 +++--- 4 files changed, 21 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index e137781f9bc6..c98d39710515 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -142,8 +142,8 @@ static inline __u64 ptr_to_u64(const void *ptr) * On success, memory pointer to the beginning of unused memory is returned. * On error, NULL is returned. */ -void *btf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, - size_t cur_cnt, size_t max_cnt, size_t add_cnt) +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt) { size_t new_cnt; void *new_data; @@ -179,14 +179,14 @@ void *btf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, /* Ensure given dynamically allocated memory region has enough allocated space * to accommodate *need_cnt* elements of size *elem_sz* bytes each */ -int btf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt) +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt) { void *p; if (need_cnt <= *cap_cnt) return 0; - p = btf_add_mem(data, cap_cnt, elem_sz, *cap_cnt, SIZE_MAX, need_cnt - *cap_cnt); + p = libbpf_add_mem(data, cap_cnt, elem_sz, *cap_cnt, SIZE_MAX, need_cnt - *cap_cnt); if (!p) return -ENOMEM; @@ -197,8 +197,8 @@ static int btf_add_type_idx_entry(struct btf *btf, __u32 type_off) { __u32 *p; - p = btf_add_mem((void **)&btf->type_offs, &btf->type_offs_cap, sizeof(__u32), - btf->nr_types, BTF_MAX_NR_TYPES, 1); + p = libbpf_add_mem((void **)&btf->type_offs, &btf->type_offs_cap, sizeof(__u32), + btf->nr_types, BTF_MAX_NR_TYPES, 1); if (!p) return -ENOMEM; @@ -1586,8 +1586,8 @@ err_out: static void *btf_add_str_mem(struct btf *btf, size_t add_sz) { - return btf_add_mem(&btf->strs_data, &btf->strs_data_cap, 1, - btf->hdr->str_len, BTF_MAX_STR_OFFSET, add_sz); + return libbpf_add_mem(&btf->strs_data, &btf->strs_data_cap, 1, + btf->hdr->str_len, BTF_MAX_STR_OFFSET, add_sz); } /* Find an offset in BTF string section that corresponds to a given string *s*. @@ -1683,8 +1683,8 @@ int btf__add_str(struct btf *btf, const char *s) static void *btf_add_type_mem(struct btf *btf, size_t add_sz) { - return btf_add_mem(&btf->types_data, &btf->types_data_cap, 1, - btf->hdr->type_len, UINT_MAX, add_sz); + return libbpf_add_mem(&btf->types_data, &btf->types_data_cap, 1, + btf->hdr->type_len, UINT_MAX, add_sz); } static __u32 btf_type_info(int kind, int vlen, int kflag) @@ -3208,7 +3208,7 @@ static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) len = strlen(s) + 1; new_off = d->strs_len; - p = btf_add_mem(&d->strs_data, &d->strs_cap, 1, new_off, BTF_MAX_STR_OFFSET, len); + p = libbpf_add_mem(&d->strs_data, &d->strs_cap, 1, new_off, BTF_MAX_STR_OFFSET, len); if (!p) return -ENOMEM; @@ -3264,7 +3264,7 @@ static int btf_dedup_strings(struct btf_dedup *d) } if (!d->btf->base_btf) { - s = btf_add_mem(&d->strs_data, &d->strs_cap, 1, d->strs_len, BTF_MAX_STR_OFFSET, 1); + s = libbpf_add_mem(&d->strs_data, &d->strs_cap, 1, d->strs_len, BTF_MAX_STR_OFFSET, 1); if (!s) return -ENOMEM; /* initial empty string */ diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 5e957fcceee6..b5dbd5adc0e8 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -166,11 +166,11 @@ static int btf_dump_resize(struct btf_dump *d) if (last_id <= d->last_id) return 0; - if (btf_ensure_mem((void **)&d->type_states, &d->type_states_cap, - sizeof(*d->type_states), last_id + 1)) + if (libbpf_ensure_mem((void **)&d->type_states, &d->type_states_cap, + sizeof(*d->type_states), last_id + 1)) return -ENOMEM; - if (btf_ensure_mem((void **)&d->cached_names, &d->cached_names_cap, - sizeof(*d->cached_names), last_id + 1)) + if (libbpf_ensure_mem((void **)&d->cached_names, &d->cached_names_cap, + sizeof(*d->cached_names), last_id + 1)) return -ENOMEM; if (d->last_id == 0) { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2f351d3ad3e7..18ba37164e17 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4867,8 +4867,8 @@ static int load_module_btfs(struct bpf_object *obj) goto err_out; } - err = btf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, - sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); + err = libbpf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, + sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); if (err) goto err_out; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 97b6b9cc9839..e0787900eeca 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -112,9 +112,9 @@ struct btf_type; struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id); -void *btf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, - size_t cur_cnt, size_t max_cnt, size_t add_cnt); -int btf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt); +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); static inline bool libbpf_validate_opts(const char *opts, size_t opts_sz, size_t user_sz, -- cgit v1.2.3 From 90d76d3ececc74bf43b2a97f178dadfa1e52be54 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:28 -0700 Subject: libbpf: Extract internal set-of-strings datastructure APIs Extract BTF logic for maintaining a set of strings data structure, used for BTF strings section construction in writable mode, into separate re-usable API. This data structure is going to be used by bpf_linker to maintains ELF STRTAB section, which has the same layout as BTF strings section. Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210318194036.3521577-5-andrii@kernel.org --- tools/lib/bpf/Build | 2 +- tools/lib/bpf/btf.c | 255 ++++++++++++------------------------------------- tools/lib/bpf/strset.c | 176 ++++++++++++++++++++++++++++++++++ tools/lib/bpf/strset.h | 21 ++++ 4 files changed, 259 insertions(+), 195 deletions(-) create mode 100644 tools/lib/bpf/strset.c create mode 100644 tools/lib/bpf/strset.h (limited to 'tools') diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 190366d05588..8136186a453f 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,3 +1,3 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ - btf_dump.o ringbuf.o + btf_dump.o ringbuf.o strset.o diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index c98d39710515..fe087592ad35 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -21,6 +21,7 @@ #include "libbpf.h" #include "libbpf_internal.h" #include "hashmap.h" +#include "strset.h" #define BTF_MAX_NR_TYPES 0x7fffffffU #define BTF_MAX_STR_OFFSET 0x7fffffffU @@ -67,7 +68,7 @@ struct btf { * | | | * hdr | | * types_data----+ | - * strs_data------------------+ + * strset__data(strs_set)-----+ * * +----------+---------+-----------+ * | Header | Types | Strings | @@ -105,20 +106,15 @@ struct btf { */ int start_str_off; + /* only one of strs_data or strs_set can be non-NULL, depending on + * whether BTF is in a modifiable state (strs_set is used) or not + * (strs_data points inside raw_data) + */ void *strs_data; - size_t strs_data_cap; /* used size stored in hdr->str_len */ - - /* lookup index for each unique string in strings section */ - struct hashmap *strs_hash; + /* a set of unique strings */ + struct strset *strs_set; /* whether strings are already deduplicated */ bool strs_deduped; - /* extra indirection layer to make strings hashmap work with stable - * string offsets and ability to transparently choose between - * btf->strs_data or btf_dedup->strs_data as a source of strings. - * This is used for BTF strings dedup to transfer deduplicated strings - * data back to struct btf without re-building strings index. - */ - void **strs_data_ptr; /* BTF object FD, if loaded into kernel */ int fd; @@ -738,7 +734,7 @@ void btf__free(struct btf *btf) */ free(btf->hdr); free(btf->types_data); - free(btf->strs_data); + strset__free(btf->strs_set); } free(btf->raw_data); free(btf->raw_data_swapped); @@ -1246,6 +1242,11 @@ void btf__set_fd(struct btf *btf, int fd) btf->fd = fd; } +static const void *btf_strs_data(const struct btf *btf) +{ + return btf->strs_data ? btf->strs_data : strset__data(btf->strs_set); +} + static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian) { struct btf_header *hdr = btf->hdr; @@ -1286,7 +1287,7 @@ static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endi } p += hdr->type_len; - memcpy(p, btf->strs_data, hdr->str_len); + memcpy(p, btf_strs_data(btf), hdr->str_len); p += hdr->str_len; *size = data_sz; @@ -1320,7 +1321,7 @@ const char *btf__str_by_offset(const struct btf *btf, __u32 offset) if (offset < btf->start_str_off) return btf__str_by_offset(btf->base_btf, offset); else if (offset - btf->start_str_off < btf->hdr->str_len) - return btf->strs_data + (offset - btf->start_str_off); + return btf_strs_data(btf) + (offset - btf->start_str_off); else return NULL; } @@ -1474,25 +1475,6 @@ int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, return 0; } -static size_t strs_hash_fn(const void *key, void *ctx) -{ - const struct btf *btf = ctx; - const char *strs = *btf->strs_data_ptr; - const char *str = strs + (long)key; - - return str_hash(str); -} - -static bool strs_hash_equal_fn(const void *key1, const void *key2, void *ctx) -{ - const struct btf *btf = ctx; - const char *strs = *btf->strs_data_ptr; - const char *str1 = strs + (long)key1; - const char *str2 = strs + (long)key2; - - return strcmp(str1, str2) == 0; -} - static void btf_invalidate_raw_data(struct btf *btf) { if (btf->raw_data) { @@ -1511,10 +1493,9 @@ static void btf_invalidate_raw_data(struct btf *btf) */ static int btf_ensure_modifiable(struct btf *btf) { - void *hdr, *types, *strs, *strs_end, *s; - struct hashmap *hash = NULL; - long off; - int err; + void *hdr, *types; + struct strset *set = NULL; + int err = -ENOMEM; if (btf_is_modifiable(btf)) { /* any BTF modification invalidates raw_data */ @@ -1525,44 +1506,25 @@ static int btf_ensure_modifiable(struct btf *btf) /* split raw data into three memory regions */ hdr = malloc(btf->hdr->hdr_len); types = malloc(btf->hdr->type_len); - strs = malloc(btf->hdr->str_len); - if (!hdr || !types || !strs) + if (!hdr || !types) goto err_out; memcpy(hdr, btf->hdr, btf->hdr->hdr_len); memcpy(types, btf->types_data, btf->hdr->type_len); - memcpy(strs, btf->strs_data, btf->hdr->str_len); - - /* make hashmap below use btf->strs_data as a source of strings */ - btf->strs_data_ptr = &btf->strs_data; /* build lookup index for all strings */ - hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, btf); - if (IS_ERR(hash)) { - err = PTR_ERR(hash); - hash = NULL; + set = strset__new(BTF_MAX_STR_OFFSET, btf->strs_data, btf->hdr->str_len); + if (IS_ERR(set)) { + err = PTR_ERR(set); goto err_out; } - strs_end = strs + btf->hdr->str_len; - for (off = 0, s = strs; s < strs_end; off += strlen(s) + 1, s = strs + off) { - /* hashmap__add() returns EEXIST if string with the same - * content already is in the hash map - */ - err = hashmap__add(hash, (void *)off, (void *)off); - if (err == -EEXIST) - continue; /* duplicate */ - if (err) - goto err_out; - } - /* only when everything was successful, update internal state */ btf->hdr = hdr; btf->types_data = types; btf->types_data_cap = btf->hdr->type_len; - btf->strs_data = strs; - btf->strs_data_cap = btf->hdr->str_len; - btf->strs_hash = hash; + btf->strs_data = NULL; + btf->strs_set = set; /* if BTF was created from scratch, all strings are guaranteed to be * unique and deduplicated */ @@ -1577,17 +1539,10 @@ static int btf_ensure_modifiable(struct btf *btf) return 0; err_out: - hashmap__free(hash); + strset__free(set); free(hdr); free(types); - free(strs); - return -ENOMEM; -} - -static void *btf_add_str_mem(struct btf *btf, size_t add_sz) -{ - return libbpf_add_mem(&btf->strs_data, &btf->strs_data_cap, 1, - btf->hdr->str_len, BTF_MAX_STR_OFFSET, add_sz); + return err; } /* Find an offset in BTF string section that corresponds to a given string *s*. @@ -1598,34 +1553,23 @@ static void *btf_add_str_mem(struct btf *btf, size_t add_sz) */ int btf__find_str(struct btf *btf, const char *s) { - long old_off, new_off, len; - void *p; + int off; if (btf->base_btf) { - int ret; - - ret = btf__find_str(btf->base_btf, s); - if (ret != -ENOENT) - return ret; + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; } /* BTF needs to be in a modifiable state to build string lookup index */ if (btf_ensure_modifiable(btf)) return -ENOMEM; - /* see btf__add_str() for why we do this */ - len = strlen(s) + 1; - p = btf_add_str_mem(btf, len); - if (!p) - return -ENOMEM; - - new_off = btf->hdr->str_len; - memcpy(p, s, len); + off = strset__find_str(btf->strs_set, s); + if (off < 0) + return off; - if (hashmap__find(btf->strs_hash, (void *)new_off, (void **)&old_off)) - return btf->start_str_off + old_off; - - return -ENOENT; + return btf->start_str_off + off; } /* Add a string s to the BTF string section. @@ -1635,50 +1579,24 @@ int btf__find_str(struct btf *btf, const char *s) */ int btf__add_str(struct btf *btf, const char *s) { - long old_off, new_off, len; - void *p; - int err; + int off; if (btf->base_btf) { - int ret; - - ret = btf__find_str(btf->base_btf, s); - if (ret != -ENOENT) - return ret; + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; } if (btf_ensure_modifiable(btf)) return -ENOMEM; - /* Hashmap keys are always offsets within btf->strs_data, so to even - * look up some string from the "outside", we need to first append it - * at the end, so that it can be addressed with an offset. Luckily, - * until btf->hdr->str_len is incremented, that string is just a piece - * of garbage for the rest of BTF code, so no harm, no foul. On the - * other hand, if the string is unique, it's already appended and - * ready to be used, only a simple btf->hdr->str_len increment away. - */ - len = strlen(s) + 1; - p = btf_add_str_mem(btf, len); - if (!p) - return -ENOMEM; + off = strset__add_str(btf->strs_set, s); + if (off < 0) + return off; - new_off = btf->hdr->str_len; - memcpy(p, s, len); + btf->hdr->str_len = strset__data_size(btf->strs_set); - /* Now attempt to add the string, but only if the string with the same - * contents doesn't exist already (HASHMAP_ADD strategy). If such - * string exists, we'll get its offset in old_off (that's old_key). - */ - err = hashmap__insert(btf->strs_hash, (void *)new_off, (void *)new_off, - HASHMAP_ADD, (const void **)&old_off, NULL); - if (err == -EEXIST) - return btf->start_str_off + old_off; /* duplicated string, return existing offset */ - if (err) - return err; - - btf->hdr->str_len += len; /* new unique string, adjust data length */ - return btf->start_str_off + new_off; + return btf->start_str_off + off; } static void *btf_add_type_mem(struct btf *btf, size_t add_sz) @@ -3016,10 +2934,7 @@ struct btf_dedup { /* Various option modifying behavior of algorithm */ struct btf_dedup_opts opts; /* temporary strings deduplication state */ - void *strs_data; - size_t strs_cap; - size_t strs_len; - struct hashmap* strs_hash; + struct strset *strs_set; }; static long hash_combine(long h, long value) @@ -3185,10 +3100,8 @@ static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) { struct btf_dedup *d = ctx; __u32 str_off = *str_off_ptr; - long old_off, new_off, len; const char *s; - void *p; - int err; + int off, err; /* don't touch empty string or string in main BTF */ if (str_off == 0 || str_off < d->btf->start_str_off) @@ -3205,29 +3118,11 @@ static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) return err; } - len = strlen(s) + 1; + off = strset__add_str(d->strs_set, s); + if (off < 0) + return off; - new_off = d->strs_len; - p = libbpf_add_mem(&d->strs_data, &d->strs_cap, 1, new_off, BTF_MAX_STR_OFFSET, len); - if (!p) - return -ENOMEM; - - memcpy(p, s, len); - - /* Now attempt to add the string, but only if the string with the same - * contents doesn't exist already (HASHMAP_ADD strategy). If such - * string exists, we'll get its offset in old_off (that's old_key). - */ - err = hashmap__insert(d->strs_hash, (void *)new_off, (void *)new_off, - HASHMAP_ADD, (const void **)&old_off, NULL); - if (err == -EEXIST) { - *str_off_ptr = d->btf->start_str_off + old_off; - } else if (err) { - return err; - } else { - *str_off_ptr = d->btf->start_str_off + new_off; - d->strs_len += len; - } + *str_off_ptr = d->btf->start_str_off + off; return 0; } @@ -3244,39 +3139,23 @@ static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) */ static int btf_dedup_strings(struct btf_dedup *d) { - char *s; int err; if (d->btf->strs_deduped) return 0; - /* temporarily switch to use btf_dedup's strs_data for strings for hash - * functions; later we'll just transfer hashmap to struct btf as is, - * along the strs_data - */ - d->btf->strs_data_ptr = &d->strs_data; - - d->strs_hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, d->btf); - if (IS_ERR(d->strs_hash)) { - err = PTR_ERR(d->strs_hash); - d->strs_hash = NULL; + d->strs_set = strset__new(BTF_MAX_STR_OFFSET, NULL, 0); + if (IS_ERR(d->strs_set)) { + err = PTR_ERR(d->strs_set); goto err_out; } if (!d->btf->base_btf) { - s = libbpf_add_mem(&d->strs_data, &d->strs_cap, 1, d->strs_len, BTF_MAX_STR_OFFSET, 1); - if (!s) - return -ENOMEM; - /* initial empty string */ - s[0] = 0; - d->strs_len = 1; - /* insert empty string; we won't be looking it up during strings * dedup, but it's good to have it for generic BTF string lookups */ - err = hashmap__insert(d->strs_hash, (void *)0, (void *)0, - HASHMAP_ADD, NULL, NULL); - if (err) + err = strset__add_str(d->strs_set, ""); + if (err < 0) goto err_out; } @@ -3286,28 +3165,16 @@ static int btf_dedup_strings(struct btf_dedup *d) goto err_out; /* replace BTF string data and hash with deduped ones */ - free(d->btf->strs_data); - hashmap__free(d->btf->strs_hash); - d->btf->strs_data = d->strs_data; - d->btf->strs_data_cap = d->strs_cap; - d->btf->hdr->str_len = d->strs_len; - d->btf->strs_hash = d->strs_hash; - /* now point strs_data_ptr back to btf->strs_data */ - d->btf->strs_data_ptr = &d->btf->strs_data; - - d->strs_data = d->strs_hash = NULL; - d->strs_len = d->strs_cap = 0; + strset__free(d->btf->strs_set); + d->btf->hdr->str_len = strset__data_size(d->strs_set); + d->btf->strs_set = d->strs_set; + d->strs_set = NULL; d->btf->strs_deduped = true; return 0; err_out: - free(d->strs_data); - hashmap__free(d->strs_hash); - d->strs_data = d->strs_hash = NULL; - d->strs_len = d->strs_cap = 0; - - /* restore strings pointer for existing d->btf->strs_hash back */ - d->btf->strs_data_ptr = &d->strs_data; + strset__free(d->strs_set); + d->strs_set = NULL; return err; } diff --git a/tools/lib/bpf/strset.c b/tools/lib/bpf/strset.c new file mode 100644 index 000000000000..1fb8b49de1d6 --- /dev/null +++ b/tools/lib/bpf/strset.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Facebook */ +#include +#include +#include +#include +#include +#include "hashmap.h" +#include "libbpf_internal.h" +#include "strset.h" + +struct strset { + void *strs_data; + size_t strs_data_len; + size_t strs_data_cap; + size_t strs_data_max_len; + + /* lookup index for each unique string in strings set */ + struct hashmap *strs_hash; +}; + +static size_t strset_hash_fn(const void *key, void *ctx) +{ + const struct strset *s = ctx; + const char *str = s->strs_data + (long)key; + + return str_hash(str); +} + +static bool strset_equal_fn(const void *key1, const void *key2, void *ctx) +{ + const struct strset *s = ctx; + const char *str1 = s->strs_data + (long)key1; + const char *str2 = s->strs_data + (long)key2; + + return strcmp(str1, str2) == 0; +} + +struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz) +{ + struct strset *set = calloc(1, sizeof(*set)); + struct hashmap *hash; + int err = -ENOMEM; + + if (!set) + return ERR_PTR(-ENOMEM); + + hash = hashmap__new(strset_hash_fn, strset_equal_fn, set); + if (IS_ERR(hash)) + goto err_out; + + set->strs_data_max_len = max_data_sz; + set->strs_hash = hash; + + if (init_data) { + long off; + + set->strs_data = malloc(init_data_sz); + if (!set->strs_data) + goto err_out; + + memcpy(set->strs_data, init_data, init_data_sz); + set->strs_data_len = init_data_sz; + set->strs_data_cap = init_data_sz; + + for (off = 0; off < set->strs_data_len; off += strlen(set->strs_data + off) + 1) { + /* hashmap__add() returns EEXIST if string with the same + * content already is in the hash map + */ + err = hashmap__add(hash, (void *)off, (void *)off); + if (err == -EEXIST) + continue; /* duplicate */ + if (err) + goto err_out; + } + } + + return set; +err_out: + strset__free(set); + return ERR_PTR(err); +} + +void strset__free(struct strset *set) +{ + if (IS_ERR_OR_NULL(set)) + return; + + hashmap__free(set->strs_hash); + free(set->strs_data); +} + +size_t strset__data_size(const struct strset *set) +{ + return set->strs_data_len; +} + +const char *strset__data(const struct strset *set) +{ + return set->strs_data; +} + +static void *strset_add_str_mem(struct strset *set, size_t add_sz) +{ + return libbpf_add_mem(&set->strs_data, &set->strs_data_cap, 1, + set->strs_data_len, set->strs_data_max_len, add_sz); +} + +/* Find string offset that corresponds to a given string *s*. + * Returns: + * - >0 offset into string data, if string is found; + * - -ENOENT, if string is not in the string data; + * - <0, on any other error. + */ +int strset__find_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + + /* see strset__add_str() for why we do this */ + len = strlen(s) + 1; + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + new_off = set->strs_data_len; + memcpy(p, s, len); + + if (hashmap__find(set->strs_hash, (void *)new_off, (void **)&old_off)) + return old_off; + + return -ENOENT; +} + +/* Add a string s to the string data. If the string already exists, return its + * offset within string data. + * Returns: + * - > 0 offset into string data, on success; + * - < 0, on error. + */ +int strset__add_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + int err; + + /* Hashmap keys are always offsets within set->strs_data, so to even + * look up some string from the "outside", we need to first append it + * at the end, so that it can be addressed with an offset. Luckily, + * until set->strs_data_len is incremented, that string is just a piece + * of garbage for the rest of the code, so no harm, no foul. On the + * other hand, if the string is unique, it's already appended and + * ready to be used, only a simple set->strs_data_len increment away. + */ + len = strlen(s) + 1; + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + new_off = set->strs_data_len; + memcpy(p, s, len); + + /* Now attempt to add the string, but only if the string with the same + * contents doesn't exist already (HASHMAP_ADD strategy). If such + * string exists, we'll get its offset in old_off (that's old_key). + */ + err = hashmap__insert(set->strs_hash, (void *)new_off, (void *)new_off, + HASHMAP_ADD, (const void **)&old_off, NULL); + if (err == -EEXIST) + return old_off; /* duplicated string, return existing offset */ + if (err) + return err; + + set->strs_data_len += len; /* new unique string, adjust data length */ + return new_off; +} diff --git a/tools/lib/bpf/strset.h b/tools/lib/bpf/strset.h new file mode 100644 index 000000000000..b6ddf77a83c2 --- /dev/null +++ b/tools/lib/bpf/strset.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* Copyright (c) 2021 Facebook */ +#ifndef __LIBBPF_STRSET_H +#define __LIBBPF_STRSET_H + +#include +#include + +struct strset; + +struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz); +void strset__free(struct strset *set); + +const char *strset__data(const struct strset *set); +size_t strset__data_size(const struct strset *set); + +int strset__find_str(struct strset *set, const char *s); +int strset__add_str(struct strset *set, const char *s); + +#endif /* __LIBBPF_STRSET_H */ -- cgit v1.2.3 From 9af44bc5d4d70b37c9ada24d8e0367b34b805bd3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:29 -0700 Subject: libbpf: Add generic BTF type shallow copy API Add btf__add_type() API that performs shallow copy of a given BTF type from the source BTF into the destination BTF. All the information and type IDs are preserved, but all the strings encountered are added into the destination BTF and corresponding offsets are rewritten. BTF type IDs are assumed to be correct or such that will be (somehow) modified afterwards. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210318194036.3521577-6-andrii@kernel.org --- tools/lib/bpf/btf.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/btf.h | 2 ++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 51 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index fe087592ad35..d30e67e7e1e5 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1629,6 +1629,54 @@ static int btf_commit_type(struct btf *btf, int data_sz) return btf->start_id + btf->nr_types - 1; } +struct btf_pipe { + const struct btf *src; + struct btf *dst; +}; + +static int btf_rewrite_str(__u32 *str_off, void *ctx) +{ + struct btf_pipe *p = ctx; + int off; + + if (!*str_off) /* nothing to do for empty strings */ + return 0; + + off = btf__add_str(p->dst, btf__str_by_offset(p->src, *str_off)); + if (off < 0) + return off; + + *str_off = off; + return 0; +} + +int btf__add_type(struct btf *btf, const struct btf *src_btf, const struct btf_type *src_type) +{ + struct btf_pipe p = { .src = src_btf, .dst = btf }; + struct btf_type *t; + int sz, err; + + sz = btf_type_size(src_type); + if (sz < 0) + return sz; + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return -ENOMEM; + + t = btf_add_type_mem(btf, sz); + if (!t) + return -ENOMEM; + + memcpy(t, src_type, sz); + + err = btf_type_visit_str_offs(t, btf_rewrite_str, &p); + if (err) + return err; + + return btf_commit_type(btf, sz); +} + /* * Append new BTF_KIND_INT type with: * - *name* - non-empty, non-NULL type name; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 029a9cfc8c2d..3b0b17ba94a1 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -93,6 +93,8 @@ LIBBPF_API struct btf *libbpf_find_kernel_btf(void); LIBBPF_API int btf__find_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_str(struct btf *btf, const char *s); +LIBBPF_API int btf__add_type(struct btf *btf, const struct btf *src_btf, + const struct btf_type *src_type); LIBBPF_API int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding); LIBBPF_API int btf__add_float(struct btf *btf, const char *name, size_t byte_sz); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index ec898f464ab9..d31d8c968097 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -354,4 +354,5 @@ LIBBPF_0.3.0 { LIBBPF_0.4.0 { global: btf__add_float; + btf__add_type; } LIBBPF_0.3.0; -- cgit v1.2.3 From faf6ed321cf61fafa17444fe01e7e336b8e89acc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:30 -0700 Subject: libbpf: Add BPF static linker APIs Introduce BPF static linker APIs to libbpf. BPF static linker allows to perform static linking of multiple BPF object files into a single combined resulting object file, preserving all the BPF programs, maps, global variables, etc. Data sections (.bss, .data, .rodata, .maps, maps, etc) with the same name are concatenated together. Similarly, code sections are also concatenated. All the symbols and ELF relocations are also concatenated in their respective ELF sections and are adjusted accordingly to the new object file layout. Static variables and functions are handled correctly as well, adjusting BPF instructions offsets to reflect new variable/function offset within the combined ELF section. Such relocations are referencing STT_SECTION symbols and that stays intact. Data sections in different files can have different alignment requirements, so that is taken care of as well, adjusting sizes and offsets as necessary to satisfy both old and new alignment requirements. DWARF data sections are stripped out, currently. As well as LLLVM_ADDRSIG section, which is ignored by libbpf in bpf_object__open() anyways. So, in a way, BPF static linker is an analogue to `llvm-strip -g`, which is a pretty nice property, especially if resulting .o file is then used to generate BPF skeleton. Original string sections are ignored and instead we construct our own set of unique strings using libbpf-internal `struct strset` API. To reduce the size of the patch, all the .BTF and .BTF.ext processing was moved into a separate patch. The high-level API consists of just 4 functions: - bpf_linker__new() creates an instance of BPF static linker. It accepts output filename and (currently empty) options struct; - bpf_linker__add_file() takes input filename and appends it to the already processed ELF data; it can be called multiple times, one for each BPF ELF object file that needs to be linked in; - bpf_linker__finalize() needs to be called to dump final ELF contents into the output file, specified when bpf_linker was created; after bpf_linker__finalize() is called, no more bpf_linker__add_file() and bpf_linker__finalize() calls are allowed, they will return error; - regardless of whether bpf_linker__finalize() was called or not, bpf_linker__free() will free up all the used resources. Currently, BPF static linker doesn't resolve cross-object file references (extern variables and/or functions). This will be added in the follow up patch set. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210318194036.3521577-7-andrii@kernel.org --- tools/lib/bpf/Build | 2 +- tools/lib/bpf/libbpf.c | 11 +- tools/lib/bpf/libbpf.h | 13 + tools/lib/bpf/libbpf.map | 4 + tools/lib/bpf/libbpf_internal.h | 20 + tools/lib/bpf/linker.c | 1176 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 1215 insertions(+), 11 deletions(-) create mode 100644 tools/lib/bpf/linker.c (limited to 'tools') diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 8136186a453f..9b057cc7650a 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,3 +1,3 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ - btf_dump.o ringbuf.o strset.o + btf_dump.o ringbuf.o strset.o linker.o diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 18ba37164e17..058b643cbcb1 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -55,10 +55,6 @@ #include "libbpf_internal.h" #include "hashmap.h" -#ifndef EM_BPF -#define EM_BPF 247 -#endif - #ifndef BPF_FS_MAGIC #define BPF_FS_MAGIC 0xcafe4a11 #endif @@ -1134,11 +1130,6 @@ static void bpf_object__elf_finish(struct bpf_object *obj) obj->efile.obj_buf_sz = 0; } -/* if libelf is old and doesn't support mmap(), fall back to read() */ -#ifndef ELF_C_READ_MMAP -#define ELF_C_READ_MMAP ELF_C_READ -#endif - static int bpf_object__elf_init(struct bpf_object *obj) { int err = 0; @@ -2807,7 +2798,7 @@ static bool ignore_elf_section(GElf_Shdr *hdr, const char *name) return true; /* ignore .llvm_addrsig section as well */ - if (hdr->sh_type == 0x6FFF4C03 /* SHT_LLVM_ADDRSIG */) + if (hdr->sh_type == SHT_LLVM_ADDRSIG) return true; /* no subprograms will lead to an empty .text section, ignore it */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 3d690d4e785c..a1a424b9b8ff 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -760,6 +760,19 @@ enum libbpf_tristate { TRI_MODULE = 2, }; +struct bpf_linker_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; +}; +#define bpf_linker_opts__last_field sz + +struct bpf_linker; + +LIBBPF_API struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts); +LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filename); +LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker); +LIBBPF_API void bpf_linker__free(struct bpf_linker *linker); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index d31d8c968097..279ae861f568 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -355,4 +355,8 @@ LIBBPF_0.4.0 { global: btf__add_float; btf__add_type; + bpf_linker__add_file; + bpf_linker__finalize; + bpf_linker__free; + bpf_linker__new; } LIBBPF_0.3.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index e0787900eeca..6017902c687e 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -20,6 +20,26 @@ #include "libbpf.h" +#ifndef EM_BPF +#define EM_BPF 247 +#endif + +#ifndef R_BPF_64_64 +#define R_BPF_64_64 1 +#endif +#ifndef R_BPF_64_32 +#define R_BPF_64_32 10 +#endif + +#ifndef SHT_LLVM_ADDRSIG +#define SHT_LLVM_ADDRSIG 0x6FFF4C03 +#endif + +/* if libelf is old and doesn't support mmap(), fall back to read() */ +#ifndef ELF_C_READ_MMAP +#define ELF_C_READ_MMAP ELF_C_READ +#endif + #define BTF_INFO_ENC(kind, kind_flag, vlen) \ ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) #define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type) diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c new file mode 100644 index 000000000000..01b56939a835 --- /dev/null +++ b/tools/lib/bpf/linker.c @@ -0,0 +1,1176 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * BPF static linker + * + * Copyright (c) 2021 Facebook + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "btf.h" +#include "libbpf_internal.h" +#include "strset.h" + +struct src_sec { + const char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + /* positional (not necessarily ELF) index of a matching section in a final object file */ + int dst_id; + /* section data offset in a matching output section */ + int dst_off; + /* whether section is omitted from the final ELF file */ + bool skipped; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; +}; + +struct src_obj { + const char *filename; + int fd; + Elf *elf; + /* Section header strings section index */ + size_t shstrs_sec_idx; + /* SYMTAB section index */ + size_t symtab_sec_idx; + + /* List of sections. Slot zero is unused. */ + struct src_sec *secs; + int sec_cnt; + + /* mapping of symbol indices from src to dst ELF */ + int *sym_map; +}; + +struct dst_sec { + char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; + + /* final output section size */ + int sec_sz; + /* final output contents of the section */ + void *raw_data; + + /* corresponding STT_SECTION symbol index in SYMTAB */ + int sec_sym_idx; +}; + +struct bpf_linker { + char *filename; + int fd; + Elf *elf; + Elf64_Ehdr *elf_hdr; + + /* Output sections metadata */ + struct dst_sec *secs; + int sec_cnt; + + struct strset *strtab_strs; /* STRTAB unique strings */ + size_t strtab_sec_idx; /* STRTAB section index */ + size_t symtab_sec_idx; /* SYMTAB section index */ +}; + +#define pr_warn_elf(fmt, ...) \ +do { \ + libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)); \ +} while (0) + +static int init_output_elf(struct bpf_linker *linker, const char *file); + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, struct src_obj *obj); +static int linker_sanity_check_elf(struct src_obj *obj); +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj); + +void bpf_linker__free(struct bpf_linker *linker) +{ + int i; + + if (!linker) + return; + + free(linker->filename); + + if (linker->elf) + elf_end(linker->elf); + + if (linker->fd >= 0) + close(linker->fd); + + strset__free(linker->strtab_strs); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + free(sec->sec_name); + free(sec->raw_data); + } + free(linker->secs); + + free(linker); +} + +struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts) +{ + struct bpf_linker *linker; + int err; + + if (!OPTS_VALID(opts, bpf_linker_opts)) + return NULL; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn_elf("libelf initialization failed"); + return NULL; + } + + linker = calloc(1, sizeof(*linker)); + if (!linker) + return NULL; + + linker->fd = -1; + + err = init_output_elf(linker, filename); + if (err) + goto err_out; + + return linker; + +err_out: + bpf_linker__free(linker); + return NULL; +} + +static struct dst_sec *add_dst_sec(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *secs = linker->secs, *sec; + size_t new_cnt = linker->sec_cnt ? linker->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + linker->sec_cnt, 0, (new_cnt - linker->sec_cnt) * sizeof(*secs)); + + linker->secs = secs; + linker->sec_cnt = new_cnt; + + sec = &linker->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = strdup(sec_name); + if (!sec->sec_name) + return NULL; + + return sec; +} + +static Elf64_Sym *add_new_sym(struct bpf_linker *linker, size_t *sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms, *sym; + size_t sym_cnt = symtab->sec_sz / sizeof(*sym); + + syms = libbpf_reallocarray(symtab->raw_data, sym_cnt + 1, sizeof(*sym)); + if (!syms) + return NULL; + + sym = &syms[sym_cnt]; + memset(sym, 0, sizeof(*sym)); + + symtab->raw_data = syms; + symtab->sec_sz += sizeof(*sym); + symtab->shdr->sh_size += sizeof(*sym); + symtab->data->d_size += sizeof(*sym); + + if (sym_idx) + *sym_idx = sym_cnt; + + return sym; +} + +static int init_output_elf(struct bpf_linker *linker, const char *file) +{ + int err, str_off; + Elf64_Sym *init_sym; + struct dst_sec *sec; + + linker->filename = strdup(file); + if (!linker->filename) + return -ENOMEM; + + linker->fd = open(file, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (linker->fd < 0) { + err = -errno; + pr_warn("failed to create '%s': %d\n", file, err); + return err; + } + + linker->elf = elf_begin(linker->fd, ELF_C_WRITE, NULL); + if (!linker->elf) { + pr_warn_elf("failed to create ELF object"); + return -EINVAL; + } + + /* ELF header */ + linker->elf_hdr = elf64_newehdr(linker->elf); + if (!linker->elf_hdr){ + pr_warn_elf("failed to create ELF header"); + return -EINVAL; + } + + linker->elf_hdr->e_machine = EM_BPF; + linker->elf_hdr->e_type = ET_REL; +#if __BYTE_ORDER == __LITTLE_ENDIAN + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2LSB; +#elif __BYTE_ORDER == __BIG_ENDIAN + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2MSB; +#else +#error "Unknown __BYTE_ORDER" +#endif + + /* STRTAB */ + /* initialize strset with an empty string to conform to ELF */ + linker->strtab_strs = strset__new(INT_MAX, "", sizeof("")); + if (libbpf_get_error(linker->strtab_strs)) + return libbpf_get_error(linker->strtab_strs); + + sec = add_dst_sec(linker, ".strtab"); + if (!sec) + return -ENOMEM; + + sec->scn = elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create STRTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create STRTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->elf_hdr->e_shstrndx = sec->sec_idx; + linker->strtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_STRTAB; + sec->shdr->sh_flags = SHF_STRINGS; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = 0; + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 1; + sec->shdr->sh_size = sec->sec_sz = 0; + sec->shdr->sh_entsize = 0; + + /* SYMTAB */ + sec = add_dst_sec(linker, ".symtab"); + if (!sec) + return -ENOMEM; + + sec->scn = elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create SYMTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create SYMTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->symtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_SYMTAB; + sec->shdr->sh_flags = 0; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = linker->strtab_sec_idx; + /* sh_info should be one greater than the index of the last local + * symbol (i.e., binding is STB_LOCAL). But why and who cares? + */ + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 8; + sec->shdr->sh_entsize = sizeof(Elf64_Sym); + + /* add the special all-zero symbol */ + init_sym = add_new_sym(linker, NULL); + if (!init_sym) + return -EINVAL; + + init_sym->st_name = 0; + init_sym->st_info = 0; + init_sym->st_other = 0; + init_sym->st_shndx = SHN_UNDEF; + init_sym->st_value = 0; + init_sym->st_size = 0; + + return 0; +} + +int bpf_linker__add_file(struct bpf_linker *linker, const char *filename) +{ + struct src_obj obj = {}; + int err = 0; + + if (!linker->elf) + return -EINVAL; + + err = err ?: linker_load_obj_file(linker, filename, &obj); + err = err ?: linker_append_sec_data(linker, &obj); + err = err ?: linker_append_elf_syms(linker, &obj); + err = err ?: linker_append_elf_relos(linker, &obj); + + /* free up src_obj resources */ + free(obj.secs); + free(obj.sym_map); + if (obj.elf) + elf_end(obj.elf); + if (obj.fd >= 0) + close(obj.fd); + + return err; +} + +static bool is_dwarf_sec_name(const char *name) +{ + /* approximation, but the actual list is too long */ + return strncmp(name, ".debug_", sizeof(".debug_") - 1) == 0; +} + +static bool is_ignored_sec(struct src_sec *sec) +{ + Elf64_Shdr *shdr = sec->shdr; + const char *name = sec->sec_name; + + /* no special handling of .strtab */ + if (shdr->sh_type == SHT_STRTAB) + return true; + + /* ignore .llvm_addrsig section as well */ + if (shdr->sh_type == SHT_LLVM_ADDRSIG) + return true; + + /* no subprograms will lead to an empty .text section, ignore it */ + if (shdr->sh_type == SHT_PROGBITS && shdr->sh_size == 0 && + strcmp(sec->sec_name, ".text") == 0) + return true; + + /* DWARF sections */ + if (is_dwarf_sec_name(sec->sec_name)) + return true; + + if (strncmp(name, ".rel", sizeof(".rel") - 1) == 0) { + name += sizeof(".rel") - 1; + /* DWARF section relocations */ + if (is_dwarf_sec_name(name)) + return true; + + /* .BTF and .BTF.ext don't need relocations */ + if (strcmp(name, BTF_ELF_SEC) == 0 || + strcmp(name, BTF_EXT_ELF_SEC) == 0) + return true; + } + + return false; +} + +static struct src_sec *add_src_sec(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *secs = obj->secs, *sec; + size_t new_cnt = obj->sec_cnt ? obj->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + obj->sec_cnt, 0, (new_cnt - obj->sec_cnt) * sizeof(*secs)); + + obj->secs = secs; + obj->sec_cnt = new_cnt; + + sec = &obj->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = sec_name; + + return sec; +} + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, struct src_obj *obj) +{ +#if __BYTE_ORDER == __LITTLE_ENDIAN + const int host_endianness = ELFDATA2LSB; +#elif __BYTE_ORDER == __BIG_ENDIAN + const int host_endianness = ELFDATA2MSB; +#else +#error "Unknown __BYTE_ORDER" +#endif + int err = 0; + Elf_Scn *scn; + Elf_Data *data; + Elf64_Ehdr *ehdr; + Elf64_Shdr *shdr; + struct src_sec *sec; + + pr_debug("linker: adding object file '%s'...\n", filename); + + obj->filename = filename; + + obj->fd = open(filename, O_RDONLY); + if (obj->fd < 0) { + err = -errno; + pr_warn("failed to open file '%s': %d\n", filename, err); + return err; + } + obj->elf = elf_begin(obj->fd, ELF_C_READ_MMAP, NULL); + if (!obj->elf) { + err = -errno; + pr_warn_elf("failed to parse ELF file '%s'", filename); + return err; + } + + /* Sanity check ELF file high-level properties */ + ehdr = elf64_getehdr(obj->elf); + if (!ehdr) { + err = -errno; + pr_warn_elf("failed to get ELF header for %s", filename); + return err; + } + if (ehdr->e_ident[EI_DATA] != host_endianness) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported byte order of ELF file %s", filename); + return err; + } + if (ehdr->e_type != ET_REL + || ehdr->e_machine != EM_BPF + || ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported kind of ELF file %s", filename); + return err; + } + + if (elf_getshdrstrndx(obj->elf, &obj->shstrs_sec_idx)) { + err = -errno; + pr_warn_elf("failed to get SHSTRTAB section index for %s", filename); + return err; + } + + scn = NULL; + while ((scn = elf_nextscn(obj->elf, scn)) != NULL) { + size_t sec_idx = elf_ndxscn(scn); + const char *sec_name; + + shdr = elf64_getshdr(scn); + if (!shdr) { + err = -errno; + pr_warn_elf("failed to get section #%zu header for %s", + sec_idx, filename); + return err; + } + + sec_name = elf_strptr(obj->elf, obj->shstrs_sec_idx, shdr->sh_name); + if (!sec_name) { + err = -errno; + pr_warn_elf("failed to get section #%zu name for %s", + sec_idx, filename); + return err; + } + + data = elf_getdata(scn, 0); + if (!data) { + err = -errno; + pr_warn_elf("failed to get section #%zu (%s) data from %s", + sec_idx, sec_name, filename); + return err; + } + + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->scn = scn; + sec->shdr = shdr; + sec->data = data; + sec->sec_idx = elf_ndxscn(scn); + + if (is_ignored_sec(sec)) { + sec->skipped = true; + continue; + } + + switch (shdr->sh_type) { + case SHT_SYMTAB: + if (obj->symtab_sec_idx) { + err = -EOPNOTSUPP; + pr_warn("multiple SYMTAB sections found, not supported\n"); + return err; + } + obj->symtab_sec_idx = sec_idx; + break; + case SHT_STRTAB: + /* we'll construct our own string table */ + break; + case SHT_PROGBITS: + if (strcmp(sec_name, BTF_ELF_SEC) == 0) { + sec->skipped = true; + continue; + } + if (strcmp(sec_name, BTF_EXT_ELF_SEC) == 0) { + sec->skipped = true; + continue; + } + + /* data & code */ + break; + case SHT_NOBITS: + /* BSS */ + break; + case SHT_REL: + /* relocations */ + break; + default: + pr_warn("unrecognized section #%zu (%s) in %s\n", + sec_idx, sec_name, filename); + err = -EINVAL; + return err; + } + } + + err = err ?: linker_sanity_check_elf(obj); + + return err; +} + +static bool is_pow_of_2(size_t x) +{ + return x && (x & (x - 1)) == 0; +} + +static int linker_sanity_check_elf(struct src_obj *obj) +{ + struct src_sec *sec, *link_sec; + int i, j, n; + + if (!obj->symtab_sec_idx) { + pr_warn("ELF is missing SYMTAB section in %s\n", obj->filename); + return -EINVAL; + } + if (!obj->shstrs_sec_idx) { + pr_warn("ELF is missing section headers STRTAB section in %s\n", obj->filename); + return -EINVAL; + } + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (sec->sec_name[0] == '\0') { + pr_warn("ELF section #%zu has empty name in %s\n", sec->sec_idx, obj->filename); + return -EINVAL; + } + + if (sec->shdr->sh_addralign && !is_pow_of_2(sec->shdr->sh_addralign)) + return -EINVAL; + if (sec->shdr->sh_addralign != sec->data->d_align) + return -EINVAL; + + if (sec->shdr->sh_size != sec->data->d_size) + return -EINVAL; + + switch (sec->shdr->sh_type) { + case SHT_SYMTAB: { + Elf64_Sym *sym; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { + pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_link]; + if (link_sec->shdr->sh_type != SHT_STRTAB) { + pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + sym = sec->data->d_buf; + for (j = 0; j < n; j++, sym++) { + if (sym->st_shndx + && sym->st_shndx < SHN_LORESERVE + && sym->st_shndx >= obj->sec_cnt) { + pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", + j, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); + return -EINVAL; + } + if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) { + if (sym->st_value != 0) + return -EINVAL; + } + } + break; + } + case SHT_STRTAB: + break; + case SHT_PROGBITS: + if (sec->shdr->sh_flags & SHF_EXECINSTR) { + if (sec->shdr->sh_size % sizeof(struct bpf_insn) != 0) + return -EINVAL; + } + break; + case SHT_NOBITS: + break; + case SHT_REL: { + Elf64_Rel *relo; + struct src_sec *sym_sec; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + /* SHT_REL's sh_link should point to SYMTAB */ + if (sec->shdr->sh_link != obj->symtab_sec_idx) { + pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + /* SHT_REL's sh_info points to relocated section */ + if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { + pr_warn("ELF relo section #%zu points to missing section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_info]; + + /* .rel -> pattern is followed */ + if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 + || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { + pr_warn("ELF relo section #%zu name has invalid name in %s\n", + sec->sec_idx, obj->filename); + return -EINVAL; + } + + /* don't further validate relocations for ignored sections */ + if (link_sec->skipped) + break; + + /* relocatable section is data or instructions */ + if (link_sec->shdr->sh_type != SHT_PROGBITS + && link_sec->shdr->sh_type != SHT_NOBITS) { + pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + + /* check sanity of each relocation */ + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + relo = sec->data->d_buf; + sym_sec = &obj->secs[obj->symtab_sec_idx]; + for (j = 0; j < n; j++, relo++) { + size_t sym_idx = ELF64_R_SYM(relo->r_info); + size_t sym_type = ELF64_R_TYPE(relo->r_info); + + if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32) { + pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", + j, sec->sec_idx, sym_type, obj->filename); + return -EINVAL; + } + + if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { + pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", + j, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + + if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { + if (relo->r_offset % sizeof(struct bpf_insn) != 0) { + pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", + j, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + } + } + break; + } + case SHT_LLVM_ADDRSIG: + break; + default: + pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", + sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + return -EINVAL; + } + } + + return 0; +} + +static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + dst_sec->sec_sz = 0; + dst_sec->sec_idx = 0; + + scn = elf_newscn(linker->elf); + if (!scn) + return -1; + data = elf_newdata(scn); + if (!data) + return -1; + shdr = elf64_getshdr(scn); + if (!shdr) + return -1; + + dst_sec->scn = scn; + dst_sec->shdr = shdr; + dst_sec->data = data; + dst_sec->sec_idx = elf_ndxscn(scn); + + name_off = strset__add_str(linker->strtab_strs, src_sec->sec_name); + if (name_off < 0) + return name_off; + + shdr->sh_name = name_off; + shdr->sh_type = src_sec->shdr->sh_type; + shdr->sh_flags = src_sec->shdr->sh_flags; + shdr->sh_size = 0; + /* sh_link and sh_info have different meaning for different types of + * sections, so we leave it up to the caller code to fill them in, if + * necessary + */ + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = src_sec->shdr->sh_addralign; + shdr->sh_entsize = src_sec->shdr->sh_entsize; + + data->d_type = src_sec->data->d_type; + data->d_size = 0; + data->d_buf = NULL; + data->d_align = src_sec->data->d_align; + data->d_off = 0; + + return 0; +} + +static struct dst_sec *find_dst_sec_by_name(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *sec; + int i; + + for (i = 1; i < linker->sec_cnt; i++) { + sec = &linker->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static bool secs_match(struct dst_sec *dst, struct src_sec *src) +{ + if (dst->shdr->sh_type != src->shdr->sh_type) { + pr_warn("sec %s types mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_flags != src->shdr->sh_flags) { + pr_warn("sec %s flags mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_entsize != src->shdr->sh_entsize) { + pr_warn("sec %s entsize mismatch\n", dst->sec_name); + return false; + } + + return true; +} + +static bool sec_content_is_same(struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + if (dst_sec->sec_sz != src_sec->shdr->sh_size) + return false; + if (memcmp(dst_sec->raw_data, src_sec->data->d_buf, dst_sec->sec_sz) != 0) + return false; + return true; +} + +static int extend_sec(struct dst_sec *dst, struct src_sec *src) +{ + void *tmp; + size_t dst_align = dst->shdr->sh_addralign; + size_t src_align = src->shdr->sh_addralign; + size_t dst_align_sz, dst_final_sz; + + if (dst_align == 0) + dst_align = 1; + if (dst_align < src_align) + dst_align = src_align; + + dst_align_sz = (dst->sec_sz + dst_align - 1) / dst_align * dst_align; + + /* no need to re-align final size */ + dst_final_sz = dst_align_sz + src->shdr->sh_size; + + if (src->shdr->sh_type != SHT_NOBITS) { + tmp = realloc(dst->raw_data, dst_final_sz); + if (!tmp) + return -ENOMEM; + dst->raw_data = tmp; + + /* pad dst section, if it's alignment forced size increase */ + memset(dst->raw_data + dst->sec_sz, 0, dst_align_sz - dst->sec_sz); + /* now copy src data at a properly aligned offset */ + memcpy(dst->raw_data + dst_align_sz, src->data->d_buf, src->shdr->sh_size); + } + + dst->sec_sz = dst_final_sz; + dst->shdr->sh_size = dst_final_sz; + dst->data->d_size = dst_final_sz; + + dst->shdr->sh_addralign = dst_align; + dst->data->d_align = dst_align; + + src->dst_off = dst_align_sz; + + return 0; +} + +static bool is_data_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped) + return false; + return sec->shdr->sh_type == SHT_PROGBITS || sec->shdr->sh_type == SHT_NOBITS; +} + +static bool is_relo_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped) + return false; + return sec->shdr->sh_type == SHT_REL; +} + +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj) +{ + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + + src_sec = &obj->secs[i]; + if (!is_data_sec(src_sec)) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); + if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else { + if (!secs_match(dst_sec, src_sec)) { + pr_warn("ELF sections %s are incompatible\n", src_sec->sec_name); + return -1; + } + + /* "license" and "version" sections are deduped */ + if (strcmp(src_sec->sec_name, "license") == 0 + || strcmp(src_sec->sec_name, "version") == 0) { + if (!sec_content_is_same(dst_sec, src_sec)) { + pr_warn("non-identical contents of section '%s' are not supported\n", src_sec->sec_name); + return -EINVAL; + } + src_sec->skipped = true; + src_sec->dst_id = dst_sec->id; + continue; + } + } + + /* record mapped section index */ + src_sec->dst_id = dst_sec->id; + + err = extend_sec(dst_sec, src_sec); + if (err) + return err; + } + + return 0; +} + +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf, *dst_sym; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize; + int str_sec_idx = symtab->shdr->sh_link; + + obj->sym_map = calloc(n + 1, sizeof(*obj->sym_map)); + if (!obj->sym_map) + return -ENOMEM; + + for (i = 0; i < n; i++, sym++) { + struct src_sec *src_sec = NULL; + struct dst_sec *dst_sec = NULL; + const char *sym_name; + size_t dst_sym_idx; + int name_off; + + /* we already have all-zero initial symbol */ + if (sym->st_name == 0 && sym->st_info == 0 && + sym->st_other == 0 && sym->st_shndx == SHN_UNDEF && + sym->st_value == 0 && sym->st_size ==0) + continue; + + sym_name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!sym_name) { + pr_warn("can't fetch symbol name for symbol #%d in '%s'\n", i, obj->filename); + return -1; + } + + if (sym->st_shndx && sym->st_shndx < SHN_LORESERVE) { + src_sec = &obj->secs[sym->st_shndx]; + if (src_sec->skipped) + continue; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* allow only one STT_SECTION symbol per section */ + if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && dst_sec->sec_sym_idx) { + obj->sym_map[i] = dst_sec->sec_sym_idx; + continue; + } + } + + name_off = strset__add_str(linker->strtab_strs, sym_name); + if (name_off < 0) + return name_off; + + dst_sym = add_new_sym(linker, &dst_sym_idx); + if (!dst_sym) + return -ENOMEM; + + dst_sym->st_name = name_off; + dst_sym->st_info = sym->st_info; + dst_sym->st_other = sym->st_other; + dst_sym->st_shndx = src_sec ? dst_sec->sec_idx : sym->st_shndx; + dst_sym->st_value = (src_sec ? src_sec->dst_off : 0) + sym->st_value; + dst_sym->st_size = sym->st_size; + + obj->sym_map[i] = dst_sym_idx; + + if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && dst_sym) { + dst_sec->sec_sym_idx = dst_sym_idx; + dst_sym->st_value = 0; + } + + } + + return 0; +} + +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *src_symtab = &obj->secs[obj->symtab_sec_idx]; + struct dst_sec *dst_symtab = &linker->secs[linker->symtab_sec_idx]; + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec, *src_linked_sec; + struct dst_sec *dst_sec, *dst_linked_sec; + Elf64_Rel *src_rel, *dst_rel; + int j, n; + + src_sec = &obj->secs[i]; + if (!is_relo_sec(src_sec)) + continue; + + /* shdr->sh_info points to relocatable section */ + src_linked_sec = &obj->secs[src_sec->shdr->sh_info]; + if (src_linked_sec->skipped) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); + if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else if (!secs_match(dst_sec, src_sec)) { + pr_warn("Secs %s are not compatible\n", src_sec->sec_name); + return -1; + } + + /* shdr->sh_link points to SYMTAB */ + dst_sec->shdr->sh_link = linker->symtab_sec_idx; + + /* shdr->sh_info points to relocated section */ + dst_linked_sec = &linker->secs[src_linked_sec->dst_id]; + dst_sec->shdr->sh_info = dst_linked_sec->sec_idx; + + src_sec->dst_id = dst_sec->id; + err = extend_sec(dst_sec, src_sec); + if (err) + return err; + + src_rel = src_sec->data->d_buf; + dst_rel = dst_sec->raw_data + src_sec->dst_off; + n = src_sec->shdr->sh_size / src_sec->shdr->sh_entsize; + for (j = 0; j < n; j++, src_rel++, dst_rel++) { + size_t src_sym_idx = ELF64_R_SYM(src_rel->r_info); + size_t sym_type = ELF64_R_TYPE(src_rel->r_info); + Elf64_Sym *src_sym, *dst_sym; + size_t dst_sym_idx; + + src_sym_idx = ELF64_R_SYM(src_rel->r_info); + src_sym = src_symtab->data->d_buf + sizeof(*src_sym) * src_sym_idx; + + dst_sym_idx = obj->sym_map[src_sym_idx]; + dst_sym = dst_symtab->raw_data + sizeof(*dst_sym) * dst_sym_idx; + dst_rel->r_offset += src_linked_sec->dst_off; + sym_type = ELF64_R_TYPE(src_rel->r_info); + dst_rel->r_info = ELF64_R_INFO(dst_sym_idx, sym_type); + + if (ELF64_ST_TYPE(src_sym->st_info) == STT_SECTION) { + struct src_sec *sec = &obj->secs[src_sym->st_shndx]; + struct bpf_insn *insn; + + if (src_linked_sec->shdr->sh_flags & SHF_EXECINSTR) { + /* calls to the very first static function inside + * .text section at offset 0 will + * reference section symbol, not the + * function symbol. Fix that up, + * otherwise it won't be possible to + * relocate calls to two different + * static functions with the same name + * (rom two different object files) + */ + insn = dst_linked_sec->raw_data + dst_rel->r_offset; + if (insn->code == (BPF_JMP | BPF_CALL)) + insn->imm += sec->dst_off / sizeof(struct bpf_insn); + else + insn->imm += sec->dst_off; + } else { + pr_warn("relocation against STT_SECTION in non-exec section is not supported!\n"); + return -EINVAL; + } + } + + } + } + + return 0; +} + +int bpf_linker__finalize(struct bpf_linker *linker) +{ + struct dst_sec *sec; + size_t strs_sz; + const void *strs; + int err, i; + + if (!linker->elf) + return -EINVAL; + + /* Finalize strings */ + strs_sz = strset__data_size(linker->strtab_strs); + strs = strset__data(linker->strtab_strs); + + sec = &linker->secs[linker->strtab_sec_idx]; + sec->data->d_align = 1; + sec->data->d_off = 0LL; + sec->data->d_buf = (void *)strs; + sec->data->d_type = ELF_T_BYTE; + sec->data->d_size = strs_sz; + sec->shdr->sh_size = strs_sz; + + for (i = 1; i < linker->sec_cnt; i++) { + sec = &linker->secs[i]; + + /* STRTAB is handled specially above */ + if (sec->sec_idx == linker->strtab_sec_idx) + continue; + + sec->data->d_buf = sec->raw_data; + } + + /* Finalize ELF layout */ + if (elf_update(linker->elf, ELF_C_NULL) < 0) { + err = -errno; + pr_warn_elf("failed to finalize ELF layout"); + return err; + } + + /* Write out final ELF contents */ + if (elf_update(linker->elf, ELF_C_WRITE) < 0) { + err = -errno; + pr_warn_elf("failed to write ELF contents"); + return err; + } + + elf_end(linker->elf); + close(linker->fd); + + linker->elf = NULL; + linker->fd = -1; + + return 0; +} -- cgit v1.2.3 From 8fd27bf69b864b1c2a6e64cf5673603f3959a6ef Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:31 -0700 Subject: libbpf: Add BPF static linker BTF and BTF.ext support Add .BTF and .BTF.ext static linking logic. When multiple BPF object files are linked together, their respective .BTF and .BTF.ext sections are merged together. BTF types are not just concatenated, but also deduplicated. .BTF.ext data is grouped by type (func info, line info, core_relos) and target section names, and then all the records are concatenated together, preserving their relative order. All the BTF type ID references and string offsets are updated as necessary, to take into account possibly deduplicated strings and types. BTF DATASEC types are handled specially. Their respective var_secinfos are accumulated separately in special per-section data and then final DATASEC types are emitted at the very end during bpf_linker__finalize() operation, just before emitting final ELF output file. BTF data can also provide "section annotations" for some extern variables. Such concept is missing in ELF, but BTF will have DATASEC types for such special extern datasections (e.g., .kconfig, .ksyms). Such sections are called "ephemeral" internally. Internally linker will keep metadata for each such section, collecting variables information, but those sections won't be emitted into the final ELF file. Also, given LLVM/Clang during compilation emits BTF DATASECS that are incomplete, missing section size and variable offsets for static variables, BPF static linker will initially fix up such DATASECs, using ELF symbols data. The final DATASECs will preserve section sizes and all variable offsets. This is handled correctly by libbpf already, so won't cause any new issues. On the other hand, it's actually a nice property to have a complete BTF data without runtime adjustments done during bpf_object__open() by libbpf. In that sense, BPF static linker is also a BTF normalizer. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210318194036.3521577-8-andrii@kernel.org --- tools/lib/bpf/linker.c | 769 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 767 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 01b56939a835..b4fff912dce2 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -32,12 +32,17 @@ struct src_sec { int dst_off; /* whether section is omitted from the final ELF file */ bool skipped; + /* whether section is an ephemeral section, not mapped to an ELF section */ + bool ephemeral; /* ELF info */ size_t sec_idx; Elf_Scn *scn; Elf64_Shdr *shdr; Elf_Data *data; + + /* corresponding BTF DATASEC type ID */ + int sec_type_id; }; struct src_obj { @@ -49,12 +54,24 @@ struct src_obj { /* SYMTAB section index */ size_t symtab_sec_idx; - /* List of sections. Slot zero is unused. */ + struct btf *btf; + struct btf_ext *btf_ext; + + /* List of sections (including ephemeral). Slot zero is unused. */ struct src_sec *secs; int sec_cnt; /* mapping of symbol indices from src to dst ELF */ int *sym_map; + /* mapping from the src BTF type IDs to dst ones */ + int *btf_type_map; +}; + +/* single .BTF.ext data section */ +struct btf_ext_sec_data { + size_t rec_cnt; + __u32 rec_sz; + void *recs; }; struct dst_sec { @@ -75,6 +92,15 @@ struct dst_sec { /* corresponding STT_SECTION symbol index in SYMTAB */ int sec_sym_idx; + + /* section's DATASEC variable info, emitted on BTF finalization */ + int sec_var_cnt; + struct btf_var_secinfo *sec_vars; + + /* section's .BTF.ext data */ + struct btf_ext_sec_data func_info; + struct btf_ext_sec_data line_info; + struct btf_ext_sec_data core_relo_info; }; struct bpf_linker { @@ -90,6 +116,9 @@ struct bpf_linker { struct strset *strtab_strs; /* STRTAB unique strings */ size_t strtab_sec_idx; /* STRTAB section index */ size_t symtab_sec_idx; /* SYMTAB section index */ + + struct btf *btf; + struct btf_ext *btf_ext; }; #define pr_warn_elf(fmt, ...) \ @@ -101,9 +130,17 @@ static int init_output_elf(struct bpf_linker *linker, const char *file); static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, struct src_obj *obj); static int linker_sanity_check_elf(struct src_obj *obj); +static int linker_sanity_check_btf(struct src_obj *obj); +static int linker_sanity_check_btf_ext(struct src_obj *obj); +static int linker_fixup_btf(struct src_obj *obj); static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj); static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj); static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj); + +static int finalize_btf(struct bpf_linker *linker); +static int finalize_btf_ext(struct bpf_linker *linker); void bpf_linker__free(struct bpf_linker *linker) { @@ -122,11 +159,19 @@ void bpf_linker__free(struct bpf_linker *linker) strset__free(linker->strtab_strs); + btf__free(linker->btf); + btf_ext__free(linker->btf_ext); + for (i = 1; i < linker->sec_cnt; i++) { struct dst_sec *sec = &linker->secs[i]; free(sec->sec_name); free(sec->raw_data); + free(sec->sec_vars); + + free(sec->func_info.recs); + free(sec->line_info.recs); + free(sec->core_relo_info.recs); } free(linker->secs); @@ -335,6 +380,12 @@ static int init_output_elf(struct bpf_linker *linker, const char *file) sec->shdr->sh_addralign = 8; sec->shdr->sh_entsize = sizeof(Elf64_Sym); + /* .BTF */ + linker->btf = btf__new_empty(); + err = libbpf_get_error(linker->btf); + if (err) + return err; + /* add the special all-zero symbol */ init_sym = add_new_sym(linker, NULL); if (!init_sym) @@ -362,8 +413,13 @@ int bpf_linker__add_file(struct bpf_linker *linker, const char *filename) err = err ?: linker_append_sec_data(linker, &obj); err = err ?: linker_append_elf_syms(linker, &obj); err = err ?: linker_append_elf_relos(linker, &obj); + err = err ?: linker_append_btf(linker, &obj); + err = err ?: linker_append_btf_ext(linker, &obj); /* free up src_obj resources */ + free(obj.btf_type_map); + btf__free(obj.btf); + btf_ext__free(obj.btf_ext); free(obj.secs); free(obj.sym_map); if (obj.elf) @@ -555,10 +611,22 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, break; case SHT_PROGBITS: if (strcmp(sec_name, BTF_ELF_SEC) == 0) { + obj->btf = btf__new(data->d_buf, shdr->sh_size); + err = libbpf_get_error(obj->btf); + if (err) { + pr_warn("failed to parse .BTF from %s: %d\n", filename, err); + return err; + } sec->skipped = true; continue; } if (strcmp(sec_name, BTF_EXT_ELF_SEC) == 0) { + obj->btf_ext = btf_ext__new(data->d_buf, shdr->sh_size); + err = libbpf_get_error(obj->btf_ext); + if (err) { + pr_warn("failed to parse .BTF.ext from '%s': %d\n", filename, err); + return err; + } sec->skipped = true; continue; } @@ -580,6 +648,9 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, } err = err ?: linker_sanity_check_elf(obj); + err = err ?: linker_sanity_check_btf(obj); + err = err ?: linker_sanity_check_btf_ext(obj); + err = err ?: linker_fixup_btf(obj); return err; } @@ -753,6 +824,69 @@ static int linker_sanity_check_elf(struct src_obj *obj) return 0; } +static int check_btf_type_id(__u32 *type_id, void *ctx) +{ + struct btf *btf = ctx; + + if (*type_id > btf__get_nr_types(btf)) + return -EINVAL; + + return 0; +} + +static int check_btf_str_off(__u32 *str_off, void *ctx) +{ + struct btf *btf = ctx; + const char *s; + + s = btf__str_by_offset(btf, *str_off); + + if (!s) + return -EINVAL; + + return 0; +} + +static int linker_sanity_check_btf(struct src_obj *obj) +{ + struct btf_type *t; + int i, n, err = 0; + + if (!obj->btf) + return 0; + + n = btf__get_nr_types(obj->btf); + for (i = 1; i <= n; i++) { + t = btf_type_by_id(obj->btf, i); + + err = err ?: btf_type_visit_type_ids(t, check_btf_type_id, obj->btf); + err = err ?: btf_type_visit_str_offs(t, check_btf_str_off, obj->btf); + if (err) + return err; + } + + return 0; +} + +static int linker_sanity_check_btf_ext(struct src_obj *obj) +{ + int err = 0; + + if (!obj->btf_ext) + return 0; + + /* can't use .BTF.ext without .BTF */ + if (!obj->btf) + return -EINVAL; + + err = err ?: btf_ext_visit_type_ids(obj->btf_ext, check_btf_type_id, obj->btf); + err = err ?: btf_ext_visit_str_offs(obj->btf_ext, check_btf_str_off, obj->btf); + if (err) + return err; + + return 0; +} + static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct src_sec *src_sec) { Elf_Scn *scn; @@ -763,6 +897,10 @@ static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct s dst_sec->sec_sz = 0; dst_sec->sec_idx = 0; + /* ephemeral sections are just thin section shells lacking most parts */ + if (src_sec->ephemeral) + return 0; + scn = elf_newscn(linker->elf); if (!scn) return -1; @@ -891,12 +1029,15 @@ static bool is_data_sec(struct src_sec *sec) { if (!sec || sec->skipped) return false; + /* ephemeral sections are data sections, e.g., .kconfig, .ksyms */ + if (sec->ephemeral) + return true; return sec->shdr->sh_type == SHT_PROGBITS || sec->shdr->sh_type == SHT_NOBITS; } static bool is_relo_sec(struct src_sec *sec) { - if (!sec || sec->skipped) + if (!sec || sec->skipped || sec->ephemeral) return false; return sec->shdr->sh_type == SHT_REL; } @@ -945,6 +1086,9 @@ static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj /* record mapped section index */ src_sec->dst_id = dst_sec->id; + if (src_sec->ephemeral) + continue; + err = extend_sec(dst_sec, src_sec); if (err) return err; @@ -1120,6 +1264,339 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob return 0; } +static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *sec; + int i; + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static Elf64_Sym *find_sym_by_name(struct src_obj *obj, size_t sec_idx, + int sym_type, const char *sym_name) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize; + int str_sec_idx = symtab->shdr->sh_link; + const char *name; + + for (i = 0; i < n; i++, sym++) { + if (sym->st_shndx != sec_idx) + continue; + if (ELF64_ST_TYPE(sym->st_info) != sym_type) + continue; + + name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!name) + return NULL; + + if (strcmp(sym_name, name) != 0) + continue; + + return sym; + } + + return NULL; +} + +static int linker_fixup_btf(struct src_obj *obj) +{ + const char *sec_name; + struct src_sec *sec; + int i, j, n, m; + + n = btf__get_nr_types(obj->btf); + for (i = 1; i <= n; i++) { + struct btf_var_secinfo *vi; + struct btf_type *t; + + t = btf_type_by_id(obj->btf, i); + if (btf_kind(t) != BTF_KIND_DATASEC) + continue; + + sec_name = btf__str_by_offset(obj->btf, t->name_off); + sec = find_src_sec_by_name(obj, sec_name); + if (sec) { + /* record actual section size, unless ephemeral */ + if (sec->shdr) + t->size = sec->shdr->sh_size; + } else { + /* BTF can have some sections that are not represented + * in ELF, e.g., .kconfig and .ksyms, which are used + * for special extern variables. Here we'll + * pre-create "section shells" for them to be able to + * keep track of extra per-section metadata later + * (e.g., BTF variables). + */ + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->ephemeral = true; + sec->sec_idx = 0; /* will match UNDEF shndx in ELF */ + } + + /* remember ELF section and its BTF type ID match */ + sec->sec_type_id = i; + + /* fix up variable offsets */ + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + const struct btf_type *vt = btf__type_by_id(obj->btf, vi->type); + const char *var_name = btf__str_by_offset(obj->btf, vt->name_off); + int var_linkage = btf_var(vt)->linkage; + Elf64_Sym *sym; + + /* no need to patch up static or extern vars */ + if (var_linkage != BTF_VAR_GLOBAL_ALLOCATED) + continue; + + sym = find_sym_by_name(obj, sec->sec_idx, STT_OBJECT, var_name); + if (!sym) { + pr_warn("failed to find symbol for variable '%s' in section '%s'\n", var_name, sec_name); + return -ENOENT; + } + + vi->offset = sym->st_value; + } + } + + return 0; +} + +static int remap_type_id(__u32 *type_id, void *ctx) +{ + int *id_map = ctx; + + *type_id = id_map[*type_id]; + + return 0; +} + +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_type *t; + int i, j, n, start_id, id; + + if (!obj->btf) + return 0; + + start_id = btf__get_nr_types(linker->btf) + 1; + n = btf__get_nr_types(obj->btf); + + obj->btf_type_map = calloc(n + 1, sizeof(int)); + if (!obj->btf_type_map) + return -ENOMEM; + + for (i = 1; i <= n; i++) { + t = btf__type_by_id(obj->btf, i); + + /* DATASECs are handled specially below */ + if (btf_kind(t) == BTF_KIND_DATASEC) + continue; + + id = btf__add_type(linker->btf, obj->btf, t); + if (id < 0) { + pr_warn("failed to append BTF type #%d from file '%s'\n", i, obj->filename); + return id; + } + + obj->btf_type_map[i] = id; + } + + /* remap all the types except DATASECs */ + n = btf__get_nr_types(linker->btf); + for (i = start_id; i <= n; i++) { + struct btf_type *dst_t = btf_type_by_id(linker->btf, i); + + if (btf_type_visit_type_ids(dst_t, remap_type_id, obj->btf_type_map)) + return -EINVAL; + } + + /* append DATASEC info */ + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + const struct btf_var_secinfo *src_var; + struct btf_var_secinfo *dst_var; + + src_sec = &obj->secs[i]; + if (!src_sec->sec_type_id || src_sec->skipped) + continue; + dst_sec = &linker->secs[src_sec->dst_id]; + + t = btf__type_by_id(obj->btf, src_sec->sec_type_id); + src_var = btf_var_secinfos(t); + n = btf_vlen(t); + for (j = 0; j < n; j++, src_var++) { + void *sec_vars = dst_sec->sec_vars; + + sec_vars = libbpf_reallocarray(sec_vars, + dst_sec->sec_var_cnt + 1, + sizeof(*dst_sec->sec_vars)); + if (!sec_vars) + return -ENOMEM; + + dst_sec->sec_vars = sec_vars; + dst_sec->sec_var_cnt++; + + dst_var = &dst_sec->sec_vars[dst_sec->sec_var_cnt - 1]; + dst_var->type = obj->btf_type_map[src_var->type]; + dst_var->size = src_var->size; + dst_var->offset = src_sec->dst_off + src_var->offset; + } + } + + return 0; +} + +static void *add_btf_ext_rec(struct btf_ext_sec_data *ext_data, const void *src_rec) +{ + void *tmp; + + tmp = libbpf_reallocarray(ext_data->recs, ext_data->rec_cnt + 1, ext_data->rec_sz); + if (!tmp) + return NULL; + ext_data->recs = tmp; + + tmp += ext_data->rec_cnt * ext_data->rec_sz; + memcpy(tmp, src_rec, ext_data->rec_sz); + + ext_data->rec_cnt++; + + return tmp; +} + +static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_ext_info_sec *ext_sec; + const char *sec_name, *s; + struct src_sec *src_sec; + struct dst_sec *dst_sec; + int rec_sz, str_off, i; + + if (!obj->btf_ext) + return 0; + + rec_sz = obj->btf_ext->func_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->func_info, ext_sec) { + struct bpf_func_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->func_info.rec_sz == 0) + dst_sec->func_info.rec_sz = rec_sz; + if (dst_sec->func_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->func_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->func_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + } + } + + rec_sz = obj->btf_ext->line_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->line_info, ext_sec) { + struct bpf_line_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->line_info.rec_sz == 0) + dst_sec->line_info.rec_sz = rec_sz; + if (dst_sec->line_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->line_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->line_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + + s = btf__str_by_offset(obj->btf, src_rec->file_name_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->file_name_off = str_off; + + s = btf__str_by_offset(obj->btf, src_rec->line_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->line_off = str_off; + + /* dst_rec->line_col is fine */ + } + } + + rec_sz = obj->btf_ext->core_relo_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->core_relo_info, ext_sec) { + struct bpf_core_relo *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->core_relo_info.rec_sz == 0) + dst_sec->core_relo_info.rec_sz = rec_sz; + if (dst_sec->core_relo_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->core_relo_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->core_relo_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + + s = btf__str_by_offset(obj->btf, src_rec->access_str_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->access_str_off = str_off; + + /* dst_rec->kind is fine */ + } + } + + return 0; +} + int bpf_linker__finalize(struct bpf_linker *linker) { struct dst_sec *sec; @@ -1130,6 +1607,10 @@ int bpf_linker__finalize(struct bpf_linker *linker) if (!linker->elf) return -EINVAL; + err = finalize_btf(linker); + if (err) + return err; + /* Finalize strings */ strs_sz = strset__data_size(linker->strtab_strs); strs = strset__data(linker->strtab_strs); @@ -1149,6 +1630,10 @@ int bpf_linker__finalize(struct bpf_linker *linker) if (sec->sec_idx == linker->strtab_sec_idx) continue; + /* special ephemeral sections (.ksyms, .kconfig, etc) */ + if (!sec->scn) + continue; + sec->data->d_buf = sec->raw_data; } @@ -1174,3 +1659,283 @@ int bpf_linker__finalize(struct bpf_linker *linker) return 0; } + +static int emit_elf_data_sec(struct bpf_linker *linker, const char *sec_name, + size_t align, const void *raw_data, size_t raw_sz) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + name_off = strset__add_str(linker->strtab_strs, sec_name); + if (name_off < 0) + return name_off; + + scn = elf_newscn(linker->elf); + if (!scn) + return -ENOMEM; + data = elf_newdata(scn); + if (!data) + return -ENOMEM; + shdr = elf64_getshdr(scn); + if (!shdr) + return -EINVAL; + + shdr->sh_name = name_off; + shdr->sh_type = SHT_PROGBITS; + shdr->sh_flags = 0; + shdr->sh_size = raw_sz; + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = align; + shdr->sh_entsize = 0; + + data->d_type = ELF_T_BYTE; + data->d_size = raw_sz; + data->d_buf = (void *)raw_data; + data->d_align = align; + data->d_off = 0; + + return 0; +} + +static int finalize_btf(struct bpf_linker *linker) +{ + struct btf *btf = linker->btf; + const void *raw_data; + int i, j, id, err; + __u32 raw_sz; + + /* bail out if no BTF data was produced */ + if (btf__get_nr_types(linker->btf) == 0) + return 0; + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (!sec->sec_var_cnt) + continue; + + id = btf__add_datasec(btf, sec->sec_name, sec->sec_sz); + if (id < 0) { + pr_warn("failed to add consolidated BTF type for datasec '%s': %d\n", + sec->sec_name, id); + return id; + } + + for (j = 0; j < sec->sec_var_cnt; j++) { + struct btf_var_secinfo *vi = &sec->sec_vars[j]; + + if (btf__add_datasec_var_info(btf, vi->type, vi->offset, vi->size)) + return -EINVAL; + } + } + + err = finalize_btf_ext(linker); + if (err) { + pr_warn(".BTF.ext generation failed: %d\n", err); + return err; + } + + err = btf__dedup(linker->btf, linker->btf_ext, NULL); + if (err) { + pr_warn("BTF dedup failed: %d\n", err); + return err; + } + + /* Emit .BTF section */ + raw_data = btf__get_raw_data(linker->btf, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF ELF section: %d\n", err); + return err; + } + + /* Emit .BTF.ext section */ + if (linker->btf_ext) { + raw_data = btf_ext__get_raw_data(linker->btf_ext, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_EXT_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF.ext ELF section: %d\n", err); + return err; + } + } + + return 0; +} + +static int emit_btf_ext_data(struct bpf_linker *linker, void *output, + const char *sec_name, struct btf_ext_sec_data *sec_data) +{ + struct btf_ext_info_sec *sec_info; + void *cur = output; + int str_off; + size_t sz; + + if (!sec_data->rec_cnt) + return 0; + + str_off = btf__add_str(linker->btf, sec_name); + if (str_off < 0) + return -ENOMEM; + + sec_info = cur; + sec_info->sec_name_off = str_off; + sec_info->num_info = sec_data->rec_cnt; + cur += sizeof(struct btf_ext_info_sec); + + sz = sec_data->rec_cnt * sec_data->rec_sz; + memcpy(cur, sec_data->recs, sz); + cur += sz; + + return cur - output; +} + +static int finalize_btf_ext(struct bpf_linker *linker) +{ + size_t funcs_sz = 0, lines_sz = 0, core_relos_sz = 0, total_sz = 0; + size_t func_rec_sz = 0, line_rec_sz = 0, core_relo_rec_sz = 0; + struct btf_ext_header *hdr; + void *data, *cur; + int i, err, sz; + + /* validate that all sections have the same .BTF.ext record sizes + * and calculate total data size for each type of data (func info, + * line info, core relos) + */ + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (sec->func_info.rec_cnt) { + if (func_rec_sz == 0) + func_rec_sz = sec->func_info.rec_sz; + if (func_rec_sz != sec->func_info.rec_sz) { + pr_warn("mismatch in func_info record size %zu != %u\n", + func_rec_sz, sec->func_info.rec_sz); + return -EINVAL; + } + + funcs_sz += sizeof(struct btf_ext_info_sec) + func_rec_sz * sec->func_info.rec_cnt; + } + if (sec->line_info.rec_cnt) { + if (line_rec_sz == 0) + line_rec_sz = sec->line_info.rec_sz; + if (line_rec_sz != sec->line_info.rec_sz) { + pr_warn("mismatch in line_info record size %zu != %u\n", + line_rec_sz, sec->line_info.rec_sz); + return -EINVAL; + } + + lines_sz += sizeof(struct btf_ext_info_sec) + line_rec_sz * sec->line_info.rec_cnt; + } + if (sec->core_relo_info.rec_cnt) { + if (core_relo_rec_sz == 0) + core_relo_rec_sz = sec->core_relo_info.rec_sz; + if (core_relo_rec_sz != sec->core_relo_info.rec_sz) { + pr_warn("mismatch in core_relo_info record size %zu != %u\n", + core_relo_rec_sz, sec->core_relo_info.rec_sz); + return -EINVAL; + } + + core_relos_sz += sizeof(struct btf_ext_info_sec) + core_relo_rec_sz * sec->core_relo_info.rec_cnt; + } + } + + if (!funcs_sz && !lines_sz && !core_relos_sz) + return 0; + + total_sz += sizeof(struct btf_ext_header); + if (funcs_sz) { + funcs_sz += sizeof(__u32); /* record size prefix */ + total_sz += funcs_sz; + } + if (lines_sz) { + lines_sz += sizeof(__u32); /* record size prefix */ + total_sz += lines_sz; + } + if (core_relos_sz) { + core_relos_sz += sizeof(__u32); /* record size prefix */ + total_sz += core_relos_sz; + } + + cur = data = calloc(1, total_sz); + if (!data) + return -ENOMEM; + + hdr = cur; + hdr->magic = BTF_MAGIC; + hdr->version = BTF_VERSION; + hdr->flags = 0; + hdr->hdr_len = sizeof(struct btf_ext_header); + cur += sizeof(struct btf_ext_header); + + /* All offsets are in bytes relative to the end of this header */ + hdr->func_info_off = 0; + hdr->func_info_len = funcs_sz; + hdr->line_info_off = funcs_sz; + hdr->line_info_len = lines_sz; + hdr->core_relo_off = funcs_sz + lines_sz;; + hdr->core_relo_len = core_relos_sz; + + if (funcs_sz) { + *(__u32 *)cur = func_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->func_info); + if (sz < 0) + return sz; + + cur += sz; + } + } + + if (lines_sz) { + *(__u32 *)cur = line_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->line_info); + if (sz < 0) + return sz; + + cur += sz; + } + } + + if (core_relos_sz) { + *(__u32 *)cur = core_relo_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->core_relo_info); + if (sz < 0) + return sz; + + cur += sz; + } + } + + linker->btf_ext = btf_ext__new(data, total_sz); + err = libbpf_get_error(linker->btf_ext); + if (err) { + linker->btf_ext = NULL; + pr_warn("failed to parse final .BTF.ext data: %d\n", err); + return err; + } + + return 0; +} -- cgit v1.2.3 From c41226654550b0a8aa75e91ce0a1cdb6ce2316ee Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:32 -0700 Subject: bpftool: Add ability to specify custom skeleton object name Add optional name OBJECT_NAME parameter to `gen skeleton` command to override default object name, normally derived from input file name. This allows much more flexibility during build time. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210318194036.3521577-9-andrii@kernel.org --- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 13 ++++++----- tools/bpf/bpftool/bash-completion/bpftool | 11 +++++++++- tools/bpf/bpftool/gen.c | 29 ++++++++++++++++++++++--- 3 files changed, 44 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 84cf0639696f..d4e7338e22e7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -19,7 +19,7 @@ SYNOPSIS GEN COMMANDS ============= -| **bpftool** **gen skeleton** *FILE* +| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] | **bpftool** **gen help** DESCRIPTION @@ -75,10 +75,13 @@ DESCRIPTION specific maps, programs, etc. As part of skeleton, few custom functions are generated. - Each of them is prefixed with object name, derived from - object file name. I.e., if BPF object file name is - **example.o**, BPF object name will be **example**. The - following custom functions are provided in such case: + Each of them is prefixed with object name. Object name can + either be derived from object file name, i.e., if BPF object + file name is **example.o**, BPF object name will be + **example**. Object name can be also specified explicitly + through **name** *OBJECT_NAME* parameter. The following + custom functions are provided (assuming **example** as + the object name): - **example__open** and **example__open_opts**. These functions are used to instantiate skeleton. It diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index fdffbc64c65c..bf7b4bdbb23a 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -982,7 +982,16 @@ _bpftool() gen) case $command in skeleton) - _filedir + case $prev in + $command) + _filedir + return 0 + ;; + *) + _bpftool_once_attr 'name' + return 0 + ;; + esac ;; *) [[ $prev == $object ]] && \ diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 4033c46d83e7..9bff89a66835 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -273,7 +273,7 @@ static int do_skeleton(int argc, char **argv) char header_guard[MAX_OBJ_NAME_LEN + sizeof("__SKEL_H__")]; size_t i, map_cnt = 0, prog_cnt = 0, file_sz, mmap_sz; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); - char obj_name[MAX_OBJ_NAME_LEN], *obj_data; + char obj_name[MAX_OBJ_NAME_LEN] = "", *obj_data; struct bpf_object *obj = NULL; const char *file, *ident; struct bpf_program *prog; @@ -288,6 +288,28 @@ static int do_skeleton(int argc, char **argv) } file = GET_ARG(); + while (argc) { + if (!REQ_ARGS(2)) + return -1; + + if (is_prefix(*argv, "name")) { + NEXT_ARG(); + + if (obj_name[0] != '\0') { + p_err("object name already specified"); + return -1; + } + + strncpy(obj_name, *argv, MAX_OBJ_NAME_LEN - 1); + obj_name[MAX_OBJ_NAME_LEN - 1] = '\0'; + } else { + p_err("unknown arg %s", *argv); + return -1; + } + + NEXT_ARG(); + } + if (argc) { p_err("extra unknown arguments"); return -1; @@ -310,7 +332,8 @@ static int do_skeleton(int argc, char **argv) p_err("failed to mmap() %s: %s", file, strerror(errno)); goto out; } - get_obj_name(obj_name, file); + if (obj_name[0] == '\0') + get_obj_name(obj_name, file); opts.object_name = obj_name; obj = bpf_object__open_mem(obj_data, file_sz, &opts); if (IS_ERR(obj)) { @@ -599,7 +622,7 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %1$s %2$s skeleton FILE\n" + "Usage: %1$s %2$s skeleton FILE [name OBJECT_NAME]\n" " %1$s %2$s help\n" "\n" " " HELP_SPEC_OPTIONS "\n" -- cgit v1.2.3 From d80b2fcbe0a023619e0fc73112f2a02c2662f6ab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:33 -0700 Subject: bpftool: Add `gen object` command to perform BPF static linking Add `bpftool gen object ...` command to statically link multiple BPF ELF object files into a single output BPF ELF object file. This patch also updates bash completions and man page. Man page gets a short section on `gen object` command, but also updates the skeleton example to show off workflow for BPF application with two .bpf.c files, compiled individually with Clang, then resulting object files are linked together with `gen object`, and then final object file is used to generate usable BPF skeleton. This should help new users understand realistic workflow w.r.t. compiling mutli-file BPF application. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20210318194036.3521577-10-andrii@kernel.org --- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 65 +++++++++++++++++++------ tools/bpf/bpftool/bash-completion/bpftool | 6 ++- tools/bpf/bpftool/gen.c | 45 ++++++++++++++++- 3 files changed, 100 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index d4e7338e22e7..7cd6681137f3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -14,16 +14,37 @@ SYNOPSIS *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } - *COMMAND* := { **skeleton** | **help** } + *COMMAND* := { **object** | **skeleton** | **help** } GEN COMMANDS ============= +| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] | **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] | **bpftool** **gen help** DESCRIPTION =========== + **bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] + Statically link (combine) together one or more *INPUT_FILE*'s + into a single resulting *OUTPUT_FILE*. All the files involved + are BPF ELF object files. + + The rules of BPF static linking are mostly the same as for + user-space object files, but in addition to combining data + and instruction sections, .BTF and .BTF.ext (if present in + any of the input files) data are combined together. .BTF + data is deduplicated, so all the common types across + *INPUT_FILE*'s will only be represented once in the resulting + BTF information. + + BPF static linking allows to partition BPF source code into + individually compiled files that are then linked into + a single resulting BPF object file, which can be used to + generated BPF skeleton (with **gen skeleton** command) or + passed directly into **libbpf** (using **bpf_object__open()** + family of APIs). + **bpftool gen skeleton** *FILE* Generate BPF skeleton C header file for a given *FILE*. @@ -133,26 +154,19 @@ OPTIONS EXAMPLES ======== -**$ cat example.c** +**$ cat example1.bpf.c** :: #include #include #include - #include "bpf_helpers.h" + #include const volatile int param1 = 42; bool global_flag = true; struct { int x; } data = {}; - struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 128); - __type(key, int); - __type(value, long); - } my_map SEC(".maps"); - SEC("raw_tp/sys_enter") int handle_sys_enter(struct pt_regs *ctx) { @@ -164,6 +178,21 @@ EXAMPLES return 0; } +**$ cat example2.bpf.c** + +:: + + #include + #include + #include + + struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 128); + __type(key, int); + __type(value, long); + } my_map SEC(".maps"); + SEC("raw_tp/sys_exit") int handle_sys_exit(struct pt_regs *ctx) { @@ -173,9 +202,17 @@ EXAMPLES } This is example BPF application with two BPF programs and a mix of BPF maps -and global variables. +and global variables. Source code is split across two source code files. -**$ bpftool gen skeleton example.o** +**$ clang -target bpf -g example1.bpf.c -o example1.bpf.o** +**$ clang -target bpf -g example2.bpf.c -o example2.bpf.o** +**$ bpftool gen object example.bpf.o example1.bpf.o example2.bpf.o** + +This set of commands compiles *example1.bpf.c* and *example2.bpf.c* +individually and then statically links respective object files into the final +BPF ELF object file *example.bpf.o*. + +**$ bpftool gen skeleton example.bpf.o name example | tee example.skel.h** :: @@ -230,7 +267,7 @@ and global variables. #endif /* __EXAMPLE_SKEL_H__ */ -**$ cat example_user.c** +**$ cat example.c** :: @@ -273,7 +310,7 @@ and global variables. return err; } -**# ./example_user** +**# ./example** :: diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index bf7b4bdbb23a..d67518bcbd44 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -981,6 +981,10 @@ _bpftool() ;; gen) case $command in + object) + _filedir + return 0 + ;; skeleton) case $prev in $command) @@ -995,7 +999,7 @@ _bpftool() ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'skeleton help' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'object skeleton help' -- "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 9bff89a66835..31ade77f5ef8 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -614,6 +614,47 @@ out: return err; } +static int do_object(int argc, char **argv) +{ + struct bpf_linker *linker; + const char *output_file, *file; + int err = 0; + + if (!REQ_ARGS(2)) { + usage(); + return -1; + } + + output_file = GET_ARG(); + + linker = bpf_linker__new(output_file, NULL); + if (!linker) { + p_err("failed to create BPF linker instance"); + return -1; + } + + while (argc) { + file = GET_ARG(); + + err = bpf_linker__add_file(linker, file); + if (err) { + p_err("failed to link '%s': %s (%d)", file, strerror(err), err); + goto out; + } + } + + err = bpf_linker__finalize(linker); + if (err) { + p_err("failed to finalize ELF file: %s (%d)", strerror(err), err); + goto out; + } + + err = 0; +out: + bpf_linker__free(linker); + return err; +} + static int do_help(int argc, char **argv) { if (json_output) { @@ -622,7 +663,8 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %1$s %2$s skeleton FILE [name OBJECT_NAME]\n" + "Usage: %1$s %2$s object OUTPUT_FILE INPUT_FILE [INPUT_FILE...]\n" + " %1$s %2$s skeleton FILE [name OBJECT_NAME]\n" " %1$s %2$s help\n" "\n" " " HELP_SPEC_OPTIONS "\n" @@ -633,6 +675,7 @@ static int do_help(int argc, char **argv) } static const struct cmd cmds[] = { + { "object", do_object }, { "skeleton", do_skeleton }, { "help", do_help }, { 0 } -- cgit v1.2.3 From cab62c37be057379a2a17b1b2eacd9dcba1e14dc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:34 -0700 Subject: selftests/bpf: Re-generate vmlinux.h and BPF skeletons if bpftool changed Trigger vmlinux.h and BPF skeletons re-generation if detected that bpftool was re-compiled. Otherwise full `make clean` is required to get updated skeletons, if bpftool is modified. Fixes: acbd06206bbb ("selftests/bpf: Add vmlinux.h selftest exercising tracing of syscalls") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210318194036.3521577-11-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index d0db2b673c6f..dbca39f45382 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -232,7 +232,7 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif -$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) | $(BPFTOOL) $(INCLUDE_DIR) +$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) ifeq ($(VMLINUX_H),) $(call msg,GEN,,$@) $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ @@ -357,7 +357,8 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_SKELS): $(TRUNNER_OUTPUT)/%.skel.h: \ $(TRUNNER_OUTPUT)/%.o \ - | $(BPFTOOL) $(TRUNNER_OUTPUT) + $(BPFTOOL) \ + | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) $(Q)$$(BPFTOOL) gen skeleton $$< > $$@ endif -- cgit v1.2.3 From 14137f3c62186799b01eea8a338f90c9cbc57f00 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:35 -0700 Subject: selftests/bpf: Pass all BPF .o's through BPF static linker Pass all individual BPF object files (generated from progs/*.c) through `bpftool gen object` command to validate that BPF static linker doesn't corrupt them. As an additional sanity checks, validate that passing resulting object files through linker again results in identical ELF files. Exact same ELF contents can be guaranteed only after two passes, as after the first pass ELF sections order changes, and thus .BTF.ext data sections order changes. That, in turn, means that strings are added into the final BTF string sections in different order, so .BTF strings data might not be exactly the same. But doing another round of linking afterwards should result in the identical ELF file, which is checked with additional `diff` command. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210318194036.3521577-12-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dbca39f45382..3a7c02add70c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -355,12 +355,13 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS)) -$(TRUNNER_BPF_SKELS): $(TRUNNER_OUTPUT)/%.skel.h: \ - $(TRUNNER_OUTPUT)/%.o \ - $(BPFTOOL) \ - | $(TRUNNER_OUTPUT) +$(TRUNNER_BPF_SKELS): %.skel.h: %.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) - $(Q)$$(BPFTOOL) gen skeleton $$< > $$@ + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked.o) $$< + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked.o) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o) + $(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) + $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked.o) name $$(notdir $$(<:.o=)) > $$@ endif # ensure we set up tests.h header generation rule just once -- cgit v1.2.3 From a0964f526df6facd4e12a4c416185013026eecf9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Mar 2021 12:40:36 -0700 Subject: selftests/bpf: Add multi-file statically linked BPF object file test Add Makefile infra to specify multi-file BPF object files (and derivative skeletons). Add first selftest validating BPF static linker can merge together successfully two independent BPF object files and resulting object and skeleton are correct and usable. Use the same F(F(F(X))) = F(F(X)) identity test on linked object files as for the case of single BPF object files. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210318194036.3521577-13-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 21 ++++++++++-- .../selftests/bpf/prog_tests/static_linked.c | 40 ++++++++++++++++++++++ .../selftests/bpf/progs/test_static_linked1.c | 30 ++++++++++++++++ .../selftests/bpf/progs/test_static_linked2.c | 31 +++++++++++++++++ 4 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/static_linked.c create mode 100644 tools/testing/selftests/bpf/progs/test_static_linked1.c create mode 100644 tools/testing/selftests/bpf/progs/test_static_linked2.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3a7c02add70c..6448c626498f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -303,6 +303,10 @@ endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c +LINKED_SKELS := test_static_linked.skel.h + +test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o + # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. # Parameters: @@ -323,6 +327,7 @@ TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS) TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ $$(filter-out $(SKEL_BLACKLIST), \ $$(TRUNNER_BPF_SRCS))) +TRUNNER_BPF_SKELS_LINKED := $$(addprefix $$(TRUNNER_OUTPUT)/,$(LINKED_SKELS)) TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) # Evaluate rules now with extra TRUNNER_XXX variables above already defined @@ -357,11 +362,20 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_SKELS): %.skel.h: %.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked.o) $$< - $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked.o) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$< + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o) $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o) $(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) - $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked.o) name $$(notdir $$(<:.o=)) > $$@ + $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.o=)) > $$@ + +$(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) + $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.o)) + $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) + $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) + $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked3.o) $$(@:.skel.h=.linked2.o) + $(Q)diff $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) + $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) + $(Q)$$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ endif # ensure we set up tests.h header generation rule just once @@ -383,6 +397,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_BPF_OBJS) \ $(TRUNNER_BPF_SKELS) \ + $(TRUNNER_BPF_SKELS_LINKED) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) diff --git a/tools/testing/selftests/bpf/prog_tests/static_linked.c b/tools/testing/selftests/bpf/prog_tests/static_linked.c new file mode 100644 index 000000000000..46556976dccc --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/static_linked.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include +#include "test_static_linked.skel.h" + +void test_static_linked(void) +{ + int err; + struct test_static_linked* skel; + + skel = test_static_linked__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->rodata->rovar1 = 1; + skel->bss->static_var1 = 2; + skel->bss->static_var11 = 3; + + skel->rodata->rovar2 = 4; + skel->bss->static_var2 = 5; + skel->bss->static_var22 = 6; + + err = test_static_linked__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = test_static_linked__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + usleep(1); + + ASSERT_EQ(skel->bss->var1, 1 * 2 + 2 + 3, "var1"); + ASSERT_EQ(skel->bss->var2, 4 * 3 + 5 + 6, "var2"); + +cleanup: + test_static_linked__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_static_linked1.c b/tools/testing/selftests/bpf/progs/test_static_linked1.c new file mode 100644 index 000000000000..ea1a6c4c7172 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_static_linked1.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include + +/* 8-byte aligned .bss */ +static volatile long static_var1; +static volatile int static_var11; +int var1 = 0; +/* 4-byte aligned .rodata */ +const volatile int rovar1; + +/* same "subprog" name in both files */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 2; +} + +SEC("raw_tp/sys_enter") +int handler1(const void *ctx) +{ + var1 = subprog(rovar1) + static_var1 + static_var11; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; +int VERSION SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_static_linked2.c b/tools/testing/selftests/bpf/progs/test_static_linked2.c new file mode 100644 index 000000000000..54d8d1ab577c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_static_linked2.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include + +/* 4-byte aligned .bss */ +static volatile int static_var2; +static volatile int static_var22; +int var2 = 0; +/* 8-byte aligned .rodata */ +const volatile long rovar2; + +/* same "subprog" name in both files */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 3; +} + +SEC("raw_tp/sys_enter") +int handler2(const void *ctx) +{ + var2 = subprog(rovar2) + static_var2 + static_var22; + + return 0; +} + +/* different name and/or type of the variable doesn't matter */ +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; -- cgit v1.2.3 From 38cb57602369cf194556460a52bd18e53c76e13d Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Fri, 19 Mar 2021 04:59:45 +0530 Subject: selftests: net: forwarding: Fix a typo s/verfied/verified/ Signed-off-by: Bhaskar Chowdhury Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/fib_offload_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/fib_offload_lib.sh b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh index 66496659bea7..e134a5f529c9 100644 --- a/tools/testing/selftests/net/forwarding/fib_offload_lib.sh +++ b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh @@ -224,7 +224,7 @@ fib_ipv4_plen_test() ip -n $ns link set dev dummy1 up # Add two routes with the same key and different prefix length and - # make sure both are in hardware. It can be verfied that both are + # make sure both are in hardware. It can be verified that both are # sharing the same leaf by checking the /proc/net/fib_trie ip -n $ns route add 192.0.2.0/24 dev dummy1 ip -n $ns route add 192.0.2.0/25 dev dummy1 -- cgit v1.2.3 From 4284f7acb78bfb0e0c26a2b78e2b2c3d68fccd6f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 18 Mar 2021 12:43:01 -0700 Subject: selftests/sgx: Improve error detection and messages The SGX device file (/dev/sgx_enclave) is unusual in that it requires execute permissions. It has to be both "chmod +x" *and* be on a filesystem without 'noexec'. In the future, udev and systemd should get updates to set up systems automatically. But, for now, nobody's systems do this automatically, and everybody gets error messages like this when running ./test_sgx: 0x0000000000000000 0x0000000000002000 0x03 0x0000000000002000 0x0000000000001000 0x05 0x0000000000003000 0x0000000000003000 0x03 mmap() failed, errno=1. That isn't very user friendly, even for forgetful kernel developers. Further, the test case is rather haphazard about its use of fprintf() versus perror(). Improve the error messages. Use perror() where possible. Lastly, do some sanity checks on opening and mmap()ing the device file so that we can get a decent error message out to the user. Now, if your user doesn't have permission, you'll get the following: $ ls -l /dev/sgx_enclave crw------- 1 root root 10, 126 Mar 18 11:29 /dev/sgx_enclave $ ./test_sgx Unable to open /dev/sgx_enclave: Permission denied If you then 'chown dave:dave /dev/sgx_enclave' (or whatever), but you leave execute permissions off, you'll get: $ ls -l /dev/sgx_enclave crw------- 1 dave dave 10, 126 Mar 18 11:29 /dev/sgx_enclave $ ./test_sgx no execute permissions on device file If you fix that with "chmod ug+x /dev/sgx" but you leave /dev as noexec, you'll get this: $ mount | grep "/dev .*noexec" udev on /dev type devtmpfs (rw,nosuid,noexec,...) $ ./test_sgx ERROR: mmap for exec: Operation not permitted mmap() succeeded for PROT_READ, but failed for PROT_EXEC check that user has execute permissions on /dev/sgx_enclave and that /dev does not have noexec set: 'mount | grep "/dev .*noexec"' That can be fixed with: mount -o remount,noexec /devESC Hopefully, the combination of better error messages and the search engines indexing this message will help people fix their systems until we do this properly. [ bp: Improve error messages more. ] Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Signed-off-by: Borislav Petkov Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20210318194301.11D9A984@viggo.jf.intel.com --- tools/testing/selftests/sgx/load.c | 69 ++++++++++++++++++++++++++++++-------- tools/testing/selftests/sgx/main.c | 2 +- 2 files changed, 56 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c index 9d43b75aaa55..f441ac34b4d4 100644 --- a/tools/testing/selftests/sgx/load.c +++ b/tools/testing/selftests/sgx/load.c @@ -45,19 +45,19 @@ static bool encl_map_bin(const char *path, struct encl *encl) fd = open(path, O_RDONLY); if (fd == -1) { - perror("open()"); + perror("enclave executable open()"); return false; } ret = stat(path, &sb); if (ret) { - perror("stat()"); + perror("enclave executable stat()"); goto err; } bin = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (bin == MAP_FAILED) { - perror("mmap()"); + perror("enclave executable mmap()"); goto err; } @@ -90,8 +90,7 @@ static bool encl_ioc_create(struct encl *encl) ioc.src = (unsigned long)secs; rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_CREATE, &ioc); if (rc) { - fprintf(stderr, "SGX_IOC_ENCLAVE_CREATE failed: errno=%d\n", - errno); + perror("SGX_IOC_ENCLAVE_CREATE failed"); munmap((void *)secs->base, encl->encl_size); return false; } @@ -116,31 +115,72 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_ADD_PAGES, &ioc); if (rc < 0) { - fprintf(stderr, "SGX_IOC_ENCLAVE_ADD_PAGES failed: errno=%d.\n", - errno); + perror("SGX_IOC_ENCLAVE_ADD_PAGES failed"); return false; } return true; } + + bool encl_load(const char *path, struct encl *encl) { + const char device_path[] = "/dev/sgx_enclave"; Elf64_Phdr *phdr_tbl; off_t src_offset; Elf64_Ehdr *ehdr; + struct stat sb; + void *ptr; int i, j; int ret; + int fd = -1; memset(encl, 0, sizeof(*encl)); - ret = open("/dev/sgx_enclave", O_RDWR); - if (ret < 0) { - fprintf(stderr, "Unable to open /dev/sgx_enclave\n"); + fd = open(device_path, O_RDWR); + if (fd < 0) { + perror("Unable to open /dev/sgx_enclave"); + goto err; + } + + ret = stat(device_path, &sb); + if (ret) { + perror("device file stat()"); + goto err; + } + + /* + * This just checks if the /dev file has these permission + * bits set. It does not check that the current user is + * the owner or in the owning group. + */ + if (!(sb.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { + fprintf(stderr, "no execute permissions on device file %s\n", device_path); + goto err; + } + + ptr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0); + if (ptr == (void *)-1) { + perror("mmap for read"); + goto err; + } + munmap(ptr, PAGE_SIZE); + +#define ERR_MSG \ +"mmap() succeeded for PROT_READ, but failed for PROT_EXEC.\n" \ +" Check that current user has execute permissions on %s and \n" \ +" that /dev does not have noexec set: mount | grep \"/dev .*noexec\"\n" \ +" If so, remount it executable: mount -o remount,exec /dev\n\n" + + ptr = mmap(NULL, PAGE_SIZE, PROT_EXEC, MAP_SHARED, fd, 0); + if (ptr == (void *)-1) { + fprintf(stderr, ERR_MSG, device_path); goto err; } + munmap(ptr, PAGE_SIZE); - encl->fd = ret; + encl->fd = fd; if (!encl_map_bin(path, encl)) goto err; @@ -217,6 +257,8 @@ bool encl_load(const char *path, struct encl *encl) return true; err: + if (fd != -1) + close(fd); encl_delete(encl); return false; } @@ -229,7 +271,7 @@ static bool encl_map_area(struct encl *encl) area = mmap(NULL, encl_size * 2, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (area == MAP_FAILED) { - perror("mmap"); + perror("reservation mmap()"); return false; } @@ -268,8 +310,7 @@ bool encl_build(struct encl *encl) ioc.sigstruct = (uint64_t)&encl->sigstruct; ret = ioctl(encl->fd, SGX_IOC_ENCLAVE_INIT, &ioc); if (ret) { - fprintf(stderr, "SGX_IOC_ENCLAVE_INIT failed: errno=%d\n", - errno); + perror("SGX_IOC_ENCLAVE_INIT failed"); return false; } diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index 724cec700926..b117bb86a73f 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -195,7 +195,7 @@ int main(int argc, char *argv[], char *envp[]) addr = mmap((void *)encl.encl_base + seg->offset, seg->size, seg->prot, MAP_SHARED | MAP_FIXED, encl.fd, 0); if (addr == MAP_FAILED) { - fprintf(stderr, "mmap() failed, errno=%d.\n", errno); + perror("mmap() segment failed"); exit(KSFT_FAIL); } } -- cgit v1.2.3 From ea24b19562fe5f72c78319dbb347b701818956d9 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 19 Mar 2021 19:21:17 +0000 Subject: libbpf: Add explicit padding to btf_dump_emit_type_decl_opts Similar to https://lore.kernel.org/bpf/20210313210920.1959628-2-andrii@kernel.org/ When DECLARE_LIBBPF_OPTS is used with inline field initialization, e.g: DECLARE_LIBBPF_OPTS(btf_dump_emit_type_decl_opts, opts, .field_name = var_ident, .indent_level = 2, .strip_mods = strip_mods, ); and compiled in debug mode, the compiler generates code which leaves the padding uninitialized and triggers errors within libbpf APIs which require strict zero initialization of OPTS structs. Adding anonymous padding field fixes the issue. Fixes: 9f81654eebe8 ("libbpf: Expose BTF-to-C type declaration emitting API") Suggested-by: Andrii Nakryiko Signed-off-by: KP Singh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210319192117.2310658-1-kpsingh@kernel.org --- tools/lib/bpf/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 3b0b17ba94a1..b54f1c3ebd57 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -176,6 +176,7 @@ struct btf_dump_emit_type_decl_opts { int indent_level; /* strip all the const/volatile/restrict mods */ bool strip_mods; + size_t :0; }; #define btf_dump_emit_type_decl_opts__last_field strip_mods -- cgit v1.2.3 From b7e23e54a9c7e9fc4aa177bfc3c73a29325614ad Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Thu, 18 Mar 2021 21:22:23 +0100 Subject: pm-graph: Fix typo "accesible" Trivial fix. Signed-off-by: Ricardo Ribalda Signed-off-by: Rafael J. Wysocki --- tools/power/pm-graph/sleepgraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py index 81f4b8abbdf7..ffd50953a024 100755 --- a/tools/power/pm-graph/sleepgraph.py +++ b/tools/power/pm-graph/sleepgraph.py @@ -6819,7 +6819,7 @@ if __name__ == '__main__': sysvals.outdir = val sysvals.notestrun = True if(os.path.isdir(val) == False): - doError('%s is not accesible' % val) + doError('%s is not accessible' % val) elif(arg == '-filter'): try: val = next(args) -- cgit v1.2.3 From 040accb3cd4ac4a8d151413f569b7ba6d918a19c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Feb 2021 12:37:46 -0800 Subject: torture: Abstract jitter.sh start/stop into scripts This commit creates jitterstart.sh and jitterstop.sh scripts that handle the starting and stopping of the jitter.sh scripts. These must be sourced using the bash "." command to allow the generated script to wait on the backgrounded jitter.sh scripts. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/jitterstart.sh | 37 ++++++++++++++++++++++ .../testing/selftests/rcutorture/bin/jitterstop.sh | 23 ++++++++++++++ tools/testing/selftests/rcutorture/bin/kvm.sh | 7 ++-- 3 files changed, 62 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/rcutorture/bin/jitterstart.sh create mode 100644 tools/testing/selftests/rcutorture/bin/jitterstop.sh (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/jitterstart.sh b/tools/testing/selftests/rcutorture/bin/jitterstart.sh new file mode 100644 index 000000000000..3d710ad291c3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/jitterstart.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Start up the specified number of jitter.sh scripts in the background. +# +# Usage: . jitterstart.sh n jittering-dir duration [ sleepmax [ spinmax ] ] +# +# n: Number of jitter.sh scripts to start up. +# jittering-dir: Directory in which to put "jittering" file. +# duration: Time to run in seconds. +# sleepmax: Maximum microseconds to sleep, defaults to one second. +# spinmax: Maximum microseconds to spin, defaults to one millisecond. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +jitter_n=$1 +if test -z "$jitter_n" +then + echo jitterstart.sh: Missing count of jitter.sh scripts to start. + exit 33 +fi +jittering_dir=$2 +if test -z "$jittering_dir" +then + echo jitterstart.sh: Missing directory in which to place jittering file. + exit 34 +fi +shift +shift + +touch ${jittering_dir}/jittering +for ((jitter_i = 1; jitter_i <= $jitter_n; jitter_i++)) +do + jitter.sh $jitter_i "${jittering_dir}/jittering" "$@" & +done diff --git a/tools/testing/selftests/rcutorture/bin/jitterstop.sh b/tools/testing/selftests/rcutorture/bin/jitterstop.sh new file mode 100644 index 000000000000..576a4cf4b79a --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/jitterstop.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Remove the "jittering" file, signaling the jitter.sh scripts to stop, +# then wait for them to terminate. +# +# Usage: . jitterstop.sh jittering-dir +# +# jittering-dir: Directory containing "jittering" file. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +jittering_dir=$1 +if test -z "$jittering_dir" +then + echo jitterstop.sh: Missing directory in which to place jittering file. + exit 34 +fi + +rm -f ${jittering_dir}/jittering +wait diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index a2ee3f2fff3c..d6973e4a2ecf 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -502,15 +502,12 @@ function dump(first, pastlast, batchnum) print "if test -n \"$needqemurun\"" print "then" print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; - print "\ttouch " rd "jittering" - for (j = 0; j < njitter; j++) - print "\tjitter.sh " j " " rd "jittering " dur " " ja[2] " " ja[3] "&" + print "\t. jitterstart.sh " njitter " " rd " " dur " " ja[2] " " ja[3] print "\twhile ls $runfiles > /dev/null 2>&1" print "\tdo" print "\t\t:" print "\tdone" - print "\trm -f " rd "jittering" - print "\twait" + print "\t. jitterstop.sh " rd print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" print "\twait" -- cgit v1.2.3 From cc45716e07a41233b7c0b2183b0a3e60b85192e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Feb 2021 16:19:29 -0800 Subject: torture: Record TORTURE_KCONFIG_GDB_ARG in qemu-cmd When re-running old rcutorture builds, if the original run involved gdb, the re-run also needs to do so. This commit therefore records the TORTURE_KCONFIG_GDB_ARG environment variable into the qemu-cmd file so that the re-run can access it. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index eb5346b457d4..5d9ac90c2cfb 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -201,6 +201,7 @@ echo kernel here: `head -n 1 $testid_txt | sed -e 's/^Build directory: //'` >> echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" $TORTURE_QEMU_GDB_ARG > $resdir/qemu-cmd echo "# TORTURE_SHUTDOWN_GRACE=$TORTURE_SHUTDOWN_GRACE" >> $resdir/qemu-cmd echo "# seconds=$seconds" >> $resdir/qemu-cmd +echo "# TORTURE_KCONFIG_GDB_ARG=\"$TORTURE_KCONFIG_GDB_ARG\"" >> $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then -- cgit v1.2.3 From d53f52d6fc220ba2074338ce6a91f837c7a7cba0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Feb 2021 14:00:05 -0800 Subject: torture: Extract kvm-test-1-run-qemu.sh from kvm-test-1-run.sh Currently, kvm-test-1-run.sh both builds and runs an rcutorture kernel, which is inconvenient when it is necessary to re-run an old run or to carry out a run on a remote system. This commit therefore extracts the portion of kvm-test-1-run.sh that invoke qemu to actually run rcutorture and places it in kvm-test-1-run-qemu.sh. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-test-1-run-qemu.sh | 170 +++++++++++++++++++++ .../selftests/rcutorture/bin/kvm-test-1-run.sh | 127 +-------------- 2 files changed, 171 insertions(+), 126 deletions(-) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh new file mode 100755 index 000000000000..6b0d71b325eb --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh @@ -0,0 +1,170 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Carry out a kvm-based run for the specified qemu-cmd file, which might +# have been generated by --build-only kvm.sh run. +# +# Usage: kvm-test-1-run-qemu.sh qemu-cmd-dir +# +# qemu-cmd-dir provides the directory containing qemu-cmd file. +# This is assumed to be of the form prefix/ds/scenario, where +# "ds" is the top-level date-stamped directory and "scenario" +# is the scenario name. Any required adjustments to this file +# must have been made by the caller. The shell-command comments +# at the end of the qemu-cmd file are not optional. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +T=${TMPDIR-/tmp}/kvm-test-1-run-qemu.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +resdir="$1" +if ! test -d "$resdir" +then + echo $0: Nonexistent directory: $resdir + exit 1 +fi +if ! test -f "$resdir/qemu-cmd" +then + echo $0: Nonexistent qemu-cmd file: $resdir/qemu-cmd + exit 1 +fi + +# Obtain settings from the qemu-cmd file. +grep '^#' $resdir/qemu-cmd | sed -e 's/^# //' > $T/qemu-cmd-settings +. $T/qemu-cmd-settings + +# Decorate qemu-cmd with redirection, backgrounding, and PID capture +sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd +echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd + +# In case qemu refuses to run... +echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log + +# Attempt to run qemu +kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` +( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & +commandcompleted=0 +if test -z "$TORTURE_KCONFIG_GDB_ARG" +then + sleep 10 # Give qemu's pid a chance to reach the file + if test -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + echo Monitoring qemu job at pid $qemu_pid + else + qemu_pid="" + echo Monitoring qemu job at yet-as-unknown pid + fi +fi +if test -n "$TORTURE_KCONFIG_GDB_ARG" +then + base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` + if ! test -f $base_resdir/vmlinux + then + base_resdir=/path/to + fi + echo Waiting for you to attach a debug session, for example: > /dev/tty + echo " gdb $base_resdir/vmlinux" > /dev/tty + echo 'After symbols load and the "(gdb)" prompt appears:' > /dev/tty + echo " target remote :1234" > /dev/tty + echo " continue" > /dev/tty + kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` +fi +while : +do + if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + fi + kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` + if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 + then + if test -n "$TORTURE_KCONFIG_GDB_ARG" + then + : + elif test $kruntime -ge $seconds || test -f "$resdir/../STOP.1" + then + break; + fi + sleep 1 + else + commandcompleted=1 + if test $kruntime -lt $seconds + then + echo Completed in $kruntime vs. $seconds >> $resdir/Warnings 2>&1 + grep "^(qemu) qemu:" $resdir/kvm-test-1-run.sh.out >> $resdir/Warnings 2>&1 + killpid="`sed -n "s/^(qemu) qemu: terminating on signal [0-9]* from pid \([0-9]*\).*$/\1/p" $resdir/Warnings`" + if test -n "$killpid" + then + echo "ps -fp $killpid" >> $resdir/Warnings 2>&1 + ps -fp $killpid >> $resdir/Warnings 2>&1 + fi + else + echo ' ---' `date`: "Kernel done" + fi + break + fi +done +if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` +fi +if test $commandcompleted -eq 0 -a -n "$qemu_pid" +then + if ! test -f "$resdir/../STOP.1" + then + echo Grace period for qemu job at pid $qemu_pid + fi + oldline="`tail $resdir/console.log`" + while : + do + if test -f "$resdir/../STOP.1" + then + echo "PID $qemu_pid killed due to run STOP.1 request" >> $resdir/Warnings 2>&1 + kill -KILL $qemu_pid + break + fi + kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` + if kill -0 $qemu_pid > /dev/null 2>&1 + then + : + else + break + fi + must_continue=no + newline="`tail $resdir/console.log`" + if test "$newline" != "$oldline" && echo $newline | grep -q ' [0-9]\+us : ' + then + must_continue=yes + fi + last_ts="`tail $resdir/console.log | grep '^\[ *[0-9]\+\.[0-9]\+]' | tail -1 | sed -e 's/^\[ *//' -e 's/\..*$//'`" + if test -z "$last_ts" + then + last_ts=0 + fi + if test "$newline" != "$oldline" -a "$last_ts" -lt $((seconds + $TORTURE_SHUTDOWN_GRACE)) + then + must_continue=yes + fi + if test $must_continue = no -a $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) + then + echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 + kill -KILL $qemu_pid + break + fi + oldline=$newline + sleep 10 + done +elif test -z "$qemu_pid" +then + echo Unknown PID, cannot kill qemu command +fi + +# Tell the script that this run is done. +rm -f $resdir/build.run + +parse-console.sh $resdir/console.log $title diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 5d9ac90c2cfb..f3d2ded0c8cf 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -210,129 +210,4 @@ then exit 0 fi -# Decorate qemu-cmd with redirection, backgrounding, and PID capture -sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd -echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd - -# In case qemu refuses to run... -echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log - -# Attempt to run qemu -kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` -( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & -commandcompleted=0 -if test -z "$TORTURE_KCONFIG_GDB_ARG" -then - sleep 10 # Give qemu's pid a chance to reach the file - if test -s "$resdir/qemu_pid" - then - qemu_pid=`cat "$resdir/qemu_pid"` - echo Monitoring qemu job at pid $qemu_pid - else - qemu_pid="" - echo Monitoring qemu job at yet-as-unknown pid - fi -fi -if test -n "$TORTURE_KCONFIG_GDB_ARG" -then - echo Waiting for you to attach a debug session, for example: > /dev/tty - echo " gdb $base_resdir/vmlinux" > /dev/tty - echo 'After symbols load and the "(gdb)" prompt appears:' > /dev/tty - echo " target remote :1234" > /dev/tty - echo " continue" > /dev/tty - kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` -fi -while : -do - if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" - then - qemu_pid=`cat "$resdir/qemu_pid"` - fi - kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` - if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 - then - if test -n "$TORTURE_KCONFIG_GDB_ARG" - then - : - elif test $kruntime -ge $seconds || test -f "$resdir/../STOP.1" - then - break; - fi - sleep 1 - else - commandcompleted=1 - if test $kruntime -lt $seconds - then - echo Completed in $kruntime vs. $seconds >> $resdir/Warnings 2>&1 - grep "^(qemu) qemu:" $resdir/kvm-test-1-run.sh.out >> $resdir/Warnings 2>&1 - killpid="`sed -n "s/^(qemu) qemu: terminating on signal [0-9]* from pid \([0-9]*\).*$/\1/p" $resdir/Warnings`" - if test -n "$killpid" - then - echo "ps -fp $killpid" >> $resdir/Warnings 2>&1 - ps -fp $killpid >> $resdir/Warnings 2>&1 - fi - else - echo ' ---' `date`: "Kernel done" - fi - break - fi -done -if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" -then - qemu_pid=`cat "$resdir/qemu_pid"` -fi -if test $commandcompleted -eq 0 -a -n "$qemu_pid" -then - if ! test -f "$resdir/../STOP.1" - then - echo Grace period for qemu job at pid $qemu_pid - fi - oldline="`tail $resdir/console.log`" - while : - do - if test -f "$resdir/../STOP.1" - then - echo "PID $qemu_pid killed due to run STOP.1 request" >> $resdir/Warnings 2>&1 - kill -KILL $qemu_pid - break - fi - kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` - if kill -0 $qemu_pid > /dev/null 2>&1 - then - : - else - break - fi - must_continue=no - newline="`tail $resdir/console.log`" - if test "$newline" != "$oldline" && echo $newline | grep -q ' [0-9]\+us : ' - then - must_continue=yes - fi - last_ts="`tail $resdir/console.log | grep '^\[ *[0-9]\+\.[0-9]\+]' | tail -1 | sed -e 's/^\[ *//' -e 's/\..*$//'`" - if test -z "$last_ts" - then - last_ts=0 - fi - if test "$newline" != "$oldline" -a "$last_ts" -lt $((seconds + $TORTURE_SHUTDOWN_GRACE)) - then - must_continue=yes - fi - if test $must_continue = no -a $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) - then - echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 - kill -KILL $qemu_pid - break - fi - oldline=$newline - sleep 10 - done -elif test -z "$qemu_pid" -then - echo Unknown PID, cannot kill qemu command -fi - -# Tell the script that this run is done. -rm -f $resdir/build.run - -parse-console.sh $resdir/console.log $title +kvm-test-1-run-qemu.sh $resdir -- cgit v1.2.3 From 7831b391fbf86d19ae92e2984a9274b1d2b4eb06 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Feb 2021 15:32:23 -0800 Subject: torture: Record jitter start/stop commands Distributed runs of rcutorture will need to start and stop jittering on the remote hosts, which means that the commands must be communicated to those hosts. The commit therefore causes kvm.sh to place these commands in new TORTURE_JITTER_START and TORTURE_JITTER_STOP environment variables to communicate them to the scripts that will set this up. In addition, this commit causes kvm-test-1-run.sh to append these commands to each generated qemu-cmd file, which allows any remotely executing script to extract the needed commands from this file. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/kvm-test-1-run.sh | 2 ++ tools/testing/selftests/rcutorture/bin/kvm.sh | 24 +++++++++++++--------- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index f3d2ded0c8cf..a69f8ae3eea5 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -202,6 +202,8 @@ echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_appen echo "# TORTURE_SHUTDOWN_GRACE=$TORTURE_SHUTDOWN_GRACE" >> $resdir/qemu-cmd echo "# seconds=$seconds" >> $resdir/qemu-cmd echo "# TORTURE_KCONFIG_GDB_ARG=\"$TORTURE_KCONFIG_GDB_ARG\"" >> $resdir/qemu-cmd +echo "# TORTURE_JITTER_START=\"$TORTURE_JITTER_START\"" >> $resdir/qemu-cmd +echo "# TORTURE_JITTER_STOP=\"$TORTURE_JITTER_STOP\"" >> $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index d6973e4a2ecf..efcbd12d5ddb 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -35,6 +35,8 @@ TORTURE_KCONFIG_ARG="" TORTURE_KCONFIG_GDB_ARG="" TORTURE_BOOT_GDB_ARG="" TORTURE_QEMU_GDB_ARG="" +TORTURE_JITTER_START="" +TORTURE_JITTER_STOP="" TORTURE_KCONFIG_KASAN_ARG="" TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" @@ -443,6 +445,16 @@ function dump(first, pastlast, batchnum) print "echo ----Start batch " batchnum ": `date` | tee -a " rd "log"; print "needqemurun=" jn=1 + njitter = 0; + split(jitter, ja); + if (ja[1] == -1 && ncpus == 0) + njitter = 1; + else if (ja[1] == -1) + njitter = ncpus; + else + njitter = ja[1]; + print "TORTURE_JITTER_START=\". jitterstart.sh " njitter " " rd " " dur " " ja[2] " " ja[3] "\"; export TORTURE_JITTER_START"; + print "TORTURE_JITTER_STOP=\". jitterstop.sh " rd " \"; export TORTURE_JITTER_STOP" for (j = first; j < pastlast; j++) { cpusr[jn] = cpus[j]; if (cfrep[cf[j]] == "") { @@ -484,14 +496,6 @@ function dump(first, pastlast, batchnum) print "\tneedqemurun=1" print "fi" } - njitter = 0; - split(jitter, ja); - if (ja[1] == -1 && ncpus == 0) - njitter = 1; - else if (ja[1] == -1) - njitter = ncpus; - else - njitter = ja[1]; if (TORTURE_BUILDONLY && njitter != 0) { njitter = 0; print "echo Build-only run, so suppressing jitter | tee -a " rd "log" @@ -502,12 +506,12 @@ function dump(first, pastlast, batchnum) print "if test -n \"$needqemurun\"" print "then" print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; - print "\t. jitterstart.sh " njitter " " rd " " dur " " ja[2] " " ja[3] + print "\t$TORTURE_JITTER_START"; print "\twhile ls $runfiles > /dev/null 2>&1" print "\tdo" print "\t\t:" print "\tdone" - print "\t. jitterstop.sh " rd + print "\t$TORTURE_JITTER_STOP"; print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" print "\twait" -- cgit v1.2.3 From cb1fa863a00ba0e8faf69d2ebb960b75129bccd6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Feb 2021 16:55:04 -0800 Subject: torture: Record kvm-test-1-run.sh and kvm-test-1-run-qemu.sh PIDs This commit records the process IDs of the kvm-test-1-run.sh and kvm-test-1-run-qemu.sh scripts to ease monitoring of remotely running instances of these scripts. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh | 2 ++ tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh index 6b0d71b325eb..576a9b761b41 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh @@ -33,6 +33,8 @@ then exit 1 fi +echo ' ---' `date`: Starting kernel, PID $$ + # Obtain settings from the qemu-cmd file. grep '^#' $resdir/qemu-cmd | sed -e 's/^# //' > $T/qemu-cmd-settings . $T/qemu-cmd-settings diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index a69f8ae3eea5..a386ca8dd690 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -41,7 +41,7 @@ then echo "kvm-test-1-run.sh :$resdir: Not a writable directory, cannot store results into it" exit 1 fi -echo ' ---' `date`: Starting build +echo ' ---' `date`: Starting build, PID $$ echo ' ---' Kconfig fragment at: $config_template >> $resdir/log touch $resdir/ConfigFragment.input -- cgit v1.2.3 From 996a042e0a0684b7a666b9d745784623a3531b27 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Feb 2021 20:17:44 -0800 Subject: torture: Remove no-mpstat error message The cpus2use.sh script complains if the mpstat command is not available, and instead uses all available CPUs. Unfortunately, this complaint goes to stdout, where it confuses invokers who expect a single number. This commit removes this error message in order to avoid this confusion. The tendency of late has been to give rcutorture a full system, so this should not cause issues. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/cpus2use.sh | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/cpus2use.sh b/tools/testing/selftests/rcutorture/bin/cpus2use.sh index 1dbfb62567d2..6bb993001680 100755 --- a/tools/testing/selftests/rcutorture/bin/cpus2use.sh +++ b/tools/testing/selftests/rcutorture/bin/cpus2use.sh @@ -21,7 +21,6 @@ then awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'` else # No mpstat command, so use all available CPUs. - echo The mpstat command is not available, so greedily using all CPUs. idlecpus=$ncpus fi awk -v ncpus=$ncpus -v idlecpus=$idlecpus < /dev/null ' -- cgit v1.2.3 From 00a447fabb5252d01035e78ae7f2943e5b4fff64 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 20 Feb 2021 10:13:52 -0800 Subject: torture: Rename SRCU-t and SRCU-u to avoid lowercase characters The convention that scenario names are all uppercase has two exceptions, SRCU-t and SRCU-u. This commit therefore renames them to SRCU-T and SRCU-U, respectively, to bring them in line with this convention. This in turn permits tighter argument checking in the torture-test scripting. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/CFLIST | 4 ++-- tools/testing/selftests/rcutorture/configs/rcu/SRCU-T | 11 +++++++++++ tools/testing/selftests/rcutorture/configs/rcu/SRCU-T.boot | 1 + tools/testing/selftests/rcutorture/configs/rcu/SRCU-U | 9 +++++++++ tools/testing/selftests/rcutorture/configs/rcu/SRCU-U.boot | 2 ++ tools/testing/selftests/rcutorture/configs/rcu/SRCU-t | 11 ----------- tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot | 1 - tools/testing/selftests/rcutorture/configs/rcu/SRCU-u | 9 --------- tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot | 2 -- 9 files changed, 25 insertions(+), 25 deletions(-) create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-T create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-T.boot create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-U create mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-U.boot delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-t delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-u delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST index f2b20db9e296..98b6175e5aa0 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST @@ -7,8 +7,8 @@ TREE07 TREE09 SRCU-N SRCU-P -SRCU-t -SRCU-u +SRCU-T +SRCU-U TINY01 TINY02 TASKS01 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T new file mode 100644 index 000000000000..d6557c38dfe4 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T @@ -0,0 +1,11 @@ +CONFIG_SMP=n +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_TINY_SRCU=y +CONFIG_RCU_TRACE=n +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_DEBUG_ATOMIC_SLEEP=y +#CHECK#CONFIG_PREEMPT_COUNT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T.boot new file mode 100644 index 000000000000..238bfe3bd0cc --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T.boot @@ -0,0 +1 @@ +rcutorture.torture_type=srcu diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U new file mode 100644 index 000000000000..6bc24e99862f --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U @@ -0,0 +1,9 @@ +CONFIG_SMP=n +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_TINY_SRCU=y +CONFIG_RCU_TRACE=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_PREEMPT_COUNT=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U.boot new file mode 100644 index 000000000000..ce48c7b82673 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U.boot @@ -0,0 +1,2 @@ +rcutorture.torture_type=srcud +rcupdate.rcu_self_test=1 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t deleted file mode 100644 index d6557c38dfe4..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t +++ /dev/null @@ -1,11 +0,0 @@ -CONFIG_SMP=n -CONFIG_PREEMPT_NONE=y -CONFIG_PREEMPT_VOLUNTARY=n -CONFIG_PREEMPT=n -#CHECK#CONFIG_TINY_SRCU=y -CONFIG_RCU_TRACE=n -CONFIG_DEBUG_LOCK_ALLOC=y -CONFIG_PROVE_LOCKING=y -CONFIG_DEBUG_OBJECTS_RCU_HEAD=n -CONFIG_DEBUG_ATOMIC_SLEEP=y -#CHECK#CONFIG_PREEMPT_COUNT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot deleted file mode 100644 index 238bfe3bd0cc..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot +++ /dev/null @@ -1 +0,0 @@ -rcutorture.torture_type=srcu diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u deleted file mode 100644 index 6bc24e99862f..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u +++ /dev/null @@ -1,9 +0,0 @@ -CONFIG_SMP=n -CONFIG_PREEMPT_NONE=y -CONFIG_PREEMPT_VOLUNTARY=n -CONFIG_PREEMPT=n -#CHECK#CONFIG_TINY_SRCU=y -CONFIG_RCU_TRACE=n -CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_DEBUG_OBJECTS_RCU_HEAD=n -CONFIG_PREEMPT_COUNT=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot deleted file mode 100644 index ce48c7b82673..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot +++ /dev/null @@ -1,2 +0,0 @@ -rcutorture.torture_type=srcud -rcupdate.rcu_self_test=1 -- cgit v1.2.3 From e633e63aa907feff98c654c1919101f3d53ebd5b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Feb 2021 07:15:41 -0800 Subject: torture: Make upper-case-only no-dot no-slash scenario names official This commit enforces the defacto restriction on scenario names, which is that they contain neither "/", ".", nor lowercase alphabetic characters. This restriction avoids collisions between scenario names and the torture scripting's files and directories. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index efcbd12d5ddb..03364f4072bc 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -104,7 +104,7 @@ do TORTURE_BUILDONLY=1 ;; --configs|--config) - checkarg --configs "(list of config files)" "$#" "$2" '^[^/]\+$' '^--' + checkarg --configs "(list of config files)" "$#" "$2" '^[^/.a-z]\+$' '^--' configs="$configs $2" shift ;; -- cgit v1.2.3 From 7ef0d5a33c81cfb1993f2947c361784b1b02adc8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Feb 2021 14:04:01 -0800 Subject: torture: De-capitalize TORTURE_SUITE Although it might be unlikely that someone would name a scenario "TORTURE_SUITE", they are within their rights to do so. This script therefore renames the "TORTURE_SUITE" file in the top-level date-stamped directory within "res" to "torture_suite" to avoid this name collision. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 47cf4db10896..e01b31b87044 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -30,7 +30,7 @@ do resdir=`echo $i | sed -e 's,/$,,' -e 's,/[^/]*$,,'` head -1 $resdir/log fi - TORTURE_SUITE="`cat $i/../TORTURE_SUITE`" + TORTURE_SUITE="`cat $i/../torture_suite`" configfile=`echo $i | sed -e 's,^.*/,,'` rm -f $i/console.log.*.diags kvm-recheck-${TORTURE_SUITE}.sh $i diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 03364f4072bc..a1cd05c9ddc4 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -405,7 +405,7 @@ echo Results directory: $resdir/$ds echo $scriptname $args touch $resdir/$ds/log echo $scriptname $args >> $resdir/$ds/log -echo ${TORTURE_SUITE} > $resdir/$ds/TORTURE_SUITE +echo ${TORTURE_SUITE} > $resdir/$ds/torture_suite echo Build directory: `pwd` > $resdir/$ds/testid.txt if test -d .git then -- cgit v1.2.3 From d6100d764cc47100ecabdc704bde5ad0448c87cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Feb 2021 14:40:03 -0800 Subject: torture: Create a "batches" file for build reuse This commit creates a "batches" file in the res/$ds directory, where $ds is the datestamp. This file contains the batches and the number of CPUs, for example: 1 TREE03 16 1 SRCU-P 8 2 TREE07 16 2 TREE01 8 3 TREE02 8 3 TREE04 8 3 TREE05 8 4 SRCU-N 4 4 TRACE01 4 4 TRACE02 4 4 RUDE01 2 4 RUDE01.2 2 4 TASKS01 2 4 TASKS03 2 4 SRCU-t 1 4 SRCU-u 1 4 TASKS02 1 4 TINY01 1 5 TINY02 1 5 TREE09 1 The first column is the batch number, the second the scenario number (possibly suffixed by a repetition number, as in "RUDE01.2"), and the third is the number of CPUs required by that scenario. The last line shows the number of CPUs expected by this batch file, which allows the run to be re-batched if a different number of CPUs is available. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 29 ++++++++++++++++----------- 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index a1cd05c9ddc4..0add16367899 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -565,6 +565,18 @@ echo 'ret=$?' >> $T/script echo "cat $T/kvm-recheck.sh.out | tee -a $resdir/$ds/log" >> $T/script echo 'exit $ret' >> $T/script +# Extract the tests and their batches from the script. +egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" | + sed -e 's/:.*$//' -e 's/^echo //' -e 's/-ovf//' | + awk ' + /^----Start/ { + batchno = $3; + next; + } + { + print batchno, $1, $2 + }' > $T/batches + if test "$dryrun" = script then cat $T/script @@ -583,21 +595,14 @@ then exit 0 elif test "$dryrun" = batches then - # Extract the tests and their batches from the script. - egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" | - sed -e 's/:.*$//' -e 's/^echo //' -e 's/-ovf//' | - awk ' - /^----Start/ { - batchno = $3; - next; - } - { - print batchno, $1, $2 - }' + cat $T/batches + exit 0 else - # Not a dryrun, so run the script. + # Not a dryrun. Record the batches and the number of CPUs, then run the script. bash $T/script ret=$? + cp $T/batches $resdir/$ds/batches + echo '#' cpus=$cpus >> $resdir/$ds/batches echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a $resdir/$ds/log exit $ret fi -- cgit v1.2.3 From 7cf86c0b6279d9d12bb697e58c7e8b2184a8f3db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 19 Feb 2021 17:49:58 -0800 Subject: torture: Add kvm-again.sh to rerun a previous torture-test This commit adds a kvm-again.sh script that, given the results directory of a torture-test run, re-runs that test. This means that the kernels need not be rebuilt, but it also is a step towards running torture tests on remote systems. This commit also adds a kvm-test-1-run-batch.sh script that runs one batch out of the torture test. The idea is to copy a results directory tree to remote systems, then use kvm-test-1-run-batch.sh to run batches on these systems. Signed-off-by: Paul E. McKenney --- .../testing/selftests/rcutorture/bin/kvm-again.sh | 171 +++++++++++++++++++++ .../rcutorture/bin/kvm-test-1-run-batch.sh | 67 ++++++++ 2 files changed, 238 insertions(+) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-again.sh create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh new file mode 100755 index 000000000000..413744093994 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Rerun a series of tests under KVM. +# +# Usage: kvm-again.sh /path/to/old/run [ options ] +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +scriptname=$0 +args="$*" + +T=${TMPDIR-/tmp}/kvm-again.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +if ! test -d tools/testing/selftests/rcutorture/bin +then + echo $scriptname must be run from top-level directory of kernel source tree. + exit 1 +fi + +oldrun=$1 +shift +if ! test -d "$oldrun" +then + echo "Usage: $scriptname /path/to/old/run [ options ]" + exit 1 +fi +if ! cp "$oldrun/batches" $T/batches.oldrun +then + # Later on, can reconstitute this from console.log files. + echo Prior run batches file does not exist: $oldrun/batches + exit 1 +fi + +if test -f "$oldrun/torture_suite" +then + torture_suite="`cat $oldrun/torture_suite`" +elif test -f "$oldrun/TORTURE_SUITE" +then + torture_suite="`cat $oldrun/TORTURE_SUITE`" +else + echo "Prior run torture_suite file does not exist: $oldrun/{torture_suite,TORTURE_SUITE}" + exit 1 +fi + +KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM +PATH=${KVM}/bin:$PATH; export PATH +. functions.sh + +dryrun= +default_link="cp -R" +rundir="`pwd`/tools/testing/selftests/rcutorture/res/`date +%Y.%m.%d-%H.%M.%S-again`" + +startdate="`date`" +starttime="`get_starttime`" + +usage () { + echo "Usage: $scriptname $oldrun [ arguments ]:" + echo " --dryrun" + echo " --link hard|soft|copy" + echo " --remote" + echo " --rundir /new/res/path" + exit 1 +} + +while test $# -gt 0 +do + case "$1" in + --dryrun) + dryrun=1 + ;; + --link) + checkarg --link "hard|soft|copy" "$#" "$2" 'hard\|soft\|copy' '^--' + case "$2" in + copy) + arg_link="cp -R" + ;; + hard) + arg_link="cp -Rl" + ;; + soft) + arg_link="cp -Rs" + ;; + esac + shift + ;; + --remote) + arg_remote=1 + default_link="cp -as" + ;; + --rundir) + checkarg --rundir "(absolute pathname)" "$#" "$2" '^/' '^error' + rundir=$2 + if test -e "$rundir" + then + echo "--rundir $2: Already exists." + usage + fi + shift + ;; + *) + echo Unknown argument $1 + usage + ;; + esac + shift +done +if test -z "$arg_link" +then + arg_link="$default_link" +fi + +echo ---- Re-run results directory: $rundir + +# Copy old run directory tree over and adjust. +mkdir -p "`dirname "$rundir"`" +if ! $arg_link "$oldrun" "$rundir" +then + echo "Cannot copy from $oldrun to $rundir." + usage +fi +rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log +echo $oldrun > "$rundir/re-run" +if ! test -d "$rundir/../../bin" +then + $arg_link "$oldrun/../../bin" "$rundir/../.." +fi +for i in $rundir/*/qemu-cmd +do + cp "$i" $T + qemu_cmd_dir="`dirname "$i"`" + kernel_dir="`echo $qemu_cmd_dir | sed -e 's/\.[0-9]\+$//'`" + kvm-transform.sh $kernel_dir/bzImage $qemu_cmd_dir/console.log < $T/qemu-cmd > $i + echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i +done +grep -v '^#' $T/batches.oldrun | awk ' +BEGIN { + oldbatch = 1; +} + +{ + if (oldbatch != $1) { + print "kvm-test-1-run-batch.sh" curbatch; + curbatch = ""; + oldbatch = $1; + } + curbatch = curbatch " " $2; +} + +END { + print "kvm-test-1-run-batch.sh" curbatch +}' > $T/runbatches.sh + +if test -n "$dryrun" +then + echo ---- Dryrun complete, directory: $rundir | tee -a "$rundir/log" +else + ( cd "$rundir"; sh $T/runbatches.sh ) + kcsan-collapse.sh "$rundir" | tee -a "$rundir/log" + echo | tee -a "$rundir/log" + echo ---- Results directory: $rundir | tee -a "$rundir/log" + kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1 + ret=$? + cat $T/kvm-recheck.sh.out | tee -a "$rundir/log" + echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log" + exit $ret +fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh new file mode 100755 index 000000000000..7ea0809e229e --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Carry out a kvm-based run for the specified batch of scenarios, which +# might have been built by --build-only kvm.sh run. +# +# Usage: kvm-test-1-run-batch.sh SCENARIO [ SCENARIO ... ] +# +# Each SCENARIO is the name of a directory in the current directory +# containing a ready-to-run qemu-cmd file. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +T=${TMPDIR-/tmp}/kvm-test-1-run-batch.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +echo ---- Running batch $* +# Check arguments +runfiles= +for i in "$@" +do + if ! echo $i | grep -q '^[^/.a-z]\+\(\.[0-9]\+\)\?$' + then + echo Bad scenario name: \"$i\" 1>&2 + exit 1 + fi + if ! test -d "$i" + then + echo Scenario name not a directory: \"$i\" 1>&2 + exit 2 + fi + if ! test -f "$i/qemu-cmd" + then + echo Scenario lacks a command file: \"$i/qemu-cmd\" 1>&2 + exit 3 + fi + rm -f $i/build.* + touch $i/build.run + runfiles="$runfiles $i/build.run" +done + +# Extract settings from the qemu-cmd file. +grep '^#' $1/qemu-cmd | sed -e 's/^# //' > $T/qemu-cmd-settings +. $T/qemu-cmd-settings + +# Start up jitter, start each scenario, wait, end jitter. +echo ---- System running test: `uname -a` +echo ---- Starting kernels. `date` | tee -a log +$TORTURE_JITTER_START +for i in "$@" +do + echo ---- System running test: `uname -a` > $i/kvm-test-1-run-qemu.sh.out + echo > $i/kvm-test-1-run-qemu.sh.out + kvm-test-1-run-qemu.sh $i >> $i/kvm-test-1-run-qemu.sh.out 2>&1 & +done +for i in $runfiles +do + while ls $i > /dev/null 2>&1 + do + : + done +done +echo ---- All kernel runs complete. `date` | tee -a log +$TORTURE_JITTER_STOP -- cgit v1.2.3 From 00505165cf4484dffc488259d59689845ba77939 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Feb 2021 14:12:58 -0800 Subject: torture: Add --duration argument to kvm-again.sh This commit adds a --duration argument to kvm-again.sh to allow the user to override the --duration specified for the original kvm.sh run. Signed-off-by: Paul E. McKenney --- .../testing/selftests/rcutorture/bin/kvm-again.sh | 25 ++++++++++++++++++- .../selftests/rcutorture/bin/kvm-transform.sh | 29 +++++++++++++++++----- 2 files changed, 47 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index 413744093994..e7e54581d23e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -52,6 +52,7 @@ PATH=${KVM}/bin:$PATH; export PATH . functions.sh dryrun= +dur= default_link="cp -R" rundir="`pwd`/tools/testing/selftests/rcutorture/res/`date +%Y.%m.%d-%H.%M.%S-again`" @@ -61,6 +62,7 @@ starttime="`get_starttime`" usage () { echo "Usage: $scriptname $oldrun [ arguments ]:" echo " --dryrun" + echo " --duration minutes | s | h | d" echo " --link hard|soft|copy" echo " --remote" echo " --rundir /new/res/path" @@ -73,6 +75,23 @@ do --dryrun) dryrun=1 ;; + --duration) + checkarg --duration "(minutes)" $# "$2" '^[0-9][0-9]*\(s\|m\|h\|d\|\)$' '^error' + mult=60 + if echo "$2" | grep -q 's$' + then + mult=1 + elif echo "$2" | grep -q 'h$' + then + mult=3600 + elif echo "$2" | grep -q 'd$' + then + mult=86400 + fi + ts=`echo $2 | sed -e 's/[smhd]$//'` + dur=$(($ts*mult)) + shift + ;; --link) checkarg --link "hard|soft|copy" "$#" "$2" 'hard\|soft\|copy' '^--' case "$2" in @@ -134,7 +153,11 @@ do cp "$i" $T qemu_cmd_dir="`dirname "$i"`" kernel_dir="`echo $qemu_cmd_dir | sed -e 's/\.[0-9]\+$//'`" - kvm-transform.sh $kernel_dir/bzImage $qemu_cmd_dir/console.log < $T/qemu-cmd > $i + kvm-transform.sh $kernel_dir/bzImage $qemu_cmd_dir/console.log $dur < $T/qemu-cmd > $i + if test -n "$dur" + then + echo "# seconds=$dur" >> $i + fi echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i done grep -v '^#' $T/batches.oldrun | awk ' diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh index c45a953ef393..162dddbcde00 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh @@ -3,7 +3,7 @@ # # Transform a qemu-cmd file to allow reuse. # -# Usage: kvm-transform.sh bzImage console.log < qemu-cmd-in > qemu-cmd-out +# Usage: kvm-transform.sh bzImage console.log [ seconds ] < qemu-cmd-in > qemu-cmd-out # # bzImage: Kernel and initrd from the same prior kvm.sh run. # console.log: File into which to place console output. @@ -29,20 +29,37 @@ then echo "Need console log file name." exit 1 fi +seconds=$3 +if test -n "$seconds" && echo $seconds | grep -q '[^0-9]' +then + echo "Invalid duration, should be numeric in seconds: '$seconds'" + exit 1 +fi + +awk -v image="$image" -v consolelog="$consolelog" -v seconds="$seconds" ' +/^#/ { + print $0; + next; +} -awk -v image="$image" -v consolelog="$consolelog" ' { line = ""; for (i = 1; i <= NF; i++) { - if (line == "") + if ("" seconds != "" && $i ~ /\.shutdown_secs=[0-9]*$/) { + sub(/[0-9]*$/, seconds, $i); + if (line == "") + line = $i; + else + line = line " " $i; + } else if (line == "") { line = $i; - else + } else { line = line " " $i; + } if ($i == "-serial") { i++; line = line " file:" consolelog; - } - if ($i == "-kernel") { + } else if ($i == "-kernel") { i++; line = line " " image; } -- cgit v1.2.3 From 018629e909ffcabfc657388094371f20ba90649f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Feb 2021 14:58:41 -0800 Subject: torture: Make kvm-transform.sh update jitter commands When rerunning an old run using kvm-again.sh, the jitter commands will re-use the original "res" directory. This works, but is clearly an accident waiting to happen. And this accident will happen with remote runs, where the original directory lives on some other system. This commit therefore updates the qemu-cmd commands to use the new res directory created for this specific run. Signed-off-by: Paul E. McKenney --- .../testing/selftests/rcutorture/bin/kvm-again.sh | 3 ++- .../selftests/rcutorture/bin/kvm-transform.sh | 23 +++++++++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index e7e54581d23e..3fb57ce5aafc 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -153,7 +153,8 @@ do cp "$i" $T qemu_cmd_dir="`dirname "$i"`" kernel_dir="`echo $qemu_cmd_dir | sed -e 's/\.[0-9]\+$//'`" - kvm-transform.sh $kernel_dir/bzImage $qemu_cmd_dir/console.log $dur < $T/qemu-cmd > $i + jitter_dir="`dirname "$kernel_dir"`" + kvm-transform.sh "$kernel_dir/bzImage" "$qemu_cmd_dir/console.log" "$jitter_dir" $dur < $T/qemu-cmd > $i if test -n "$dur" then echo "# seconds=$dur" >> $i diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh index 162dddbcde00..e9dcbce17bbd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh @@ -3,7 +3,7 @@ # # Transform a qemu-cmd file to allow reuse. # -# Usage: kvm-transform.sh bzImage console.log [ seconds ] < qemu-cmd-in > qemu-cmd-out +# Usage: kvm-transform.sh bzImage console.log jitter_dir [ seconds ] < qemu-cmd-in > qemu-cmd-out # # bzImage: Kernel and initrd from the same prior kvm.sh run. # console.log: File into which to place console output. @@ -29,14 +29,31 @@ then echo "Need console log file name." exit 1 fi -seconds=$3 +jitter_dir="$3" +if test -z "$jitter_dir" || ! test -d "$jitter_dir" +then + echo "Need valid jitter directory: '$jitter_dir'" + exit 1 +fi +seconds="$4" if test -n "$seconds" && echo $seconds | grep -q '[^0-9]' then echo "Invalid duration, should be numeric in seconds: '$seconds'" exit 1 fi -awk -v image="$image" -v consolelog="$consolelog" -v seconds="$seconds" ' +awk -v image="$image" -v consolelog="$consolelog" -v jitter_dir="$jitter_dir" \ + -v seconds="$seconds" ' +/^# TORTURE_JITTER_START=/ { + print "# TORTURE_JITTER_START=\". jitterstart.sh " $4 " " jitter_dir " " $6 " " $7; + next; +} + +/^# TORTURE_JITTER_STOP=/ { + print "# TORTURE_JITTER_STOP=\". jitterstop.sh " " " jitter_dir " " $5; + next; +} + /^#/ { print $0; next; -- cgit v1.2.3 From a5dbe2524f553a1283b3364ff91e96bfb618ceab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Feb 2021 12:07:39 -0800 Subject: torture: Make TORTURE_TRUST_MAKE available in kvm-again.sh environment Because the TORTURE_TRUST_MAKE environment variable is not recorded, kvm-again.sh runs can result in the parse-build.sh script emitting false-positive "BUG: TREE03 no build" messages. These messages are intended to complain about any lack of compiler invocations when the --trust-make flag is not given to kvm.sh. However, when this flag is given to kvm.sh (and thus when TORTURE_TRUST_MAKE=y), lack of compiler invocations is expected behavior when rebuilding from identical source code. This commit therefore makes kvm-test-1-run.sh record the value of the TORTURE_TRUST_MAKE environment variable as an additional comment in the qemu-cmd file, and also makes kvm-again.sh reconstitute that variable from that comment. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-again.sh | 5 +++++ tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 1 + 2 files changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index 3fb57ce5aafc..f1c80b02af58 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -161,6 +161,11 @@ do fi echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i done + +# Extract settings from the last qemu-cmd file transformed above. +grep '^#' $i | sed -e 's/^# //' > $T/qemu-cmd-settings +. $T/qemu-cmd-settings + grep -v '^#' $T/batches.oldrun | awk ' BEGIN { oldbatch = 1; diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index a386ca8dd690..420ed5ce9d32 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -204,6 +204,7 @@ echo "# seconds=$seconds" >> $resdir/qemu-cmd echo "# TORTURE_KCONFIG_GDB_ARG=\"$TORTURE_KCONFIG_GDB_ARG\"" >> $resdir/qemu-cmd echo "# TORTURE_JITTER_START=\"$TORTURE_JITTER_START\"" >> $resdir/qemu-cmd echo "# TORTURE_JITTER_STOP=\"$TORTURE_JITTER_STOP\"" >> $resdir/qemu-cmd +echo "# TORTURE_TRUST_MAKE=\"$TORTURE_TRUST_MAKE\"; export TORTURE_TRUST_MAKE" >> $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then -- cgit v1.2.3 From 03edf700db335b9375c18310d59d0a0ab6c850df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Feb 2021 13:12:41 -0800 Subject: torture: Print proper vmlinux path for kvm-again.sh runs The kvm-again.sh script does not copy over the vmlinux files due to their large size. This means that a gdb run must use the vmlinux file from the original "res" directory. This commit therefore finds that directory and prints it out so that the user can copy and pasted the gdb command just as for the initial run. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-again.sh | 5 ++++- tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index f1c80b02af58..668636ee3daf 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -159,7 +159,10 @@ do then echo "# seconds=$dur" >> $i fi - echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i + if test -n "$arg_remote" + then + echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i + fi done # Extract settings from the last qemu-cmd file transformed above. diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh index 576a9b761b41..5b1aa2a4f3f6 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh @@ -67,7 +67,11 @@ then base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` if ! test -f $base_resdir/vmlinux then - base_resdir=/path/to + base_resdir="`cat re-run`/$resdir" + if ! test -f $base_resdir/vmlinux + then + base_resdir=/path/to + fi fi echo Waiting for you to attach a debug session, for example: > /dev/tty echo " gdb $base_resdir/vmlinux" > /dev/tty -- cgit v1.2.3 From a1ab2e89f36d678512a50cbebf6afc4201f41a31 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Feb 2021 14:33:03 -0800 Subject: torture: Consolidate qemu-cmd duration editing into kvm-transform.sh Currently, kvm-again.sh updates the duration in the "seconds=" comment in the qemu-cmd file, but kvm-transform.sh updates the duration in the actual qemu command arguments. This is an accident waiting to happen. This commit therefore consolidates these updates into kvm-transform.sh. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-again.sh | 4 ---- tools/testing/selftests/rcutorture/bin/kvm-transform.sh | 8 ++++++++ 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index 668636ee3daf..46e47a00a7db 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -155,10 +155,6 @@ do kernel_dir="`echo $qemu_cmd_dir | sed -e 's/\.[0-9]\+$//'`" jitter_dir="`dirname "$kernel_dir"`" kvm-transform.sh "$kernel_dir/bzImage" "$qemu_cmd_dir/console.log" "$jitter_dir" $dur < $T/qemu-cmd > $i - if test -n "$dur" - then - echo "# seconds=$dur" >> $i - fi if test -n "$arg_remote" then echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh index e9dcbce17bbd..d40b4e60a50c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh @@ -44,6 +44,14 @@ fi awk -v image="$image" -v consolelog="$consolelog" -v jitter_dir="$jitter_dir" \ -v seconds="$seconds" ' +/^# seconds=/ { + if (seconds == "") + print $0; + else + print "# seconds=" seconds; + next; +} + /^# TORTURE_JITTER_START=/ { print "# TORTURE_JITTER_START=\". jitterstart.sh " $4 " " jitter_dir " " $6 " " $7; next; -- cgit v1.2.3 From 114e4a4b4884c14ebd35874cbe3e1ca0d38efa5d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 27 Feb 2021 20:55:57 -0800 Subject: torture: Fix kvm.sh --datestamp regex check Some versions of grep are happy to interpret a nonsensically placed "-" within a "[]" pattern as a dash, while others give an error message. This commit therefore places the "-" at the end of the expression where it was supposed to be in the first place. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 0add16367899..6bf00a003d3d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -120,7 +120,7 @@ do shift ;; --datestamp) - checkarg --datestamp "(relative pathname)" "$#" "$2" '^[a-zA-Z0-9._-/]*$' '^--' + checkarg --datestamp "(relative pathname)" "$#" "$2" '^[a-zA-Z0-9._/-]*$' '^--' ds=$2 shift ;; -- cgit v1.2.3 From 4bf07f6562a01a488877e05267808da7147f44a5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Mar 2021 22:39:03 +0100 Subject: timekeeping, clocksource: Fix various typos in comments Fix ~56 single-word typos in timekeeping & clocksource code comments. Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Cc: Daniel Lezcano Cc: linux-kernel@vger.kernel.org --- drivers/clocksource/clksrc-dbx500-prcmu.c | 8 ++++---- drivers/clocksource/dw_apb_timer_of.c | 2 +- drivers/clocksource/hyperv_timer.c | 2 +- drivers/clocksource/timer-atmel-tcb.c | 4 ++-- drivers/clocksource/timer-fsl-ftm.c | 2 +- drivers/clocksource/timer-microchip-pit64b.c | 2 +- drivers/clocksource/timer-of.c | 4 ++-- drivers/clocksource/timer-ti-dm-systimer.c | 2 +- drivers/clocksource/timer-vf-pit.c | 2 +- include/linux/clocksource.h | 2 +- include/linux/timex.h | 2 +- kernel/time/alarmtimer.c | 6 +++--- kernel/time/clocksource.c | 4 ++-- kernel/time/hrtimer.c | 18 +++++++++--------- kernel/time/jiffies.c | 2 +- kernel/time/ntp.c | 2 +- kernel/time/posix-cpu-timers.c | 6 +++--- kernel/time/tick-broadcast-hrtimer.c | 2 +- kernel/time/tick-broadcast.c | 4 ++-- kernel/time/tick-oneshot.c | 2 +- kernel/time/tick-sched.c | 2 +- kernel/time/tick-sched.h | 2 +- kernel/time/time.c | 2 +- kernel/time/timekeeping.c | 10 +++++----- kernel/time/timer.c | 4 ++-- kernel/time/vsyscall.c | 2 +- tools/testing/selftests/timers/clocksource-switch.c | 4 ++-- tools/testing/selftests/timers/leap-a-day.c | 2 +- tools/testing/selftests/timers/leapcrash.c | 4 ++-- tools/testing/selftests/timers/threadtest.c | 2 +- 30 files changed, 56 insertions(+), 56 deletions(-) (limited to 'tools') diff --git a/drivers/clocksource/clksrc-dbx500-prcmu.c b/drivers/clocksource/clksrc-dbx500-prcmu.c index 996900d017c6..2fc93e46cea3 100644 --- a/drivers/clocksource/clksrc-dbx500-prcmu.c +++ b/drivers/clocksource/clksrc-dbx500-prcmu.c @@ -18,7 +18,7 @@ #define RATE_32K 32768 -#define TIMER_MODE_CONTINOUS 0x1 +#define TIMER_MODE_CONTINUOUS 0x1 #define TIMER_DOWNCOUNT_VAL 0xffffffff #define PRCMU_TIMER_REF 0 @@ -55,13 +55,13 @@ static int __init clksrc_dbx500_prcmu_init(struct device_node *node) /* * The A9 sub system expects the timer to be configured as - * a continous looping timer. + * a continuous looping timer. * The PRCMU should configure it but if it for some reason * don't we do it here. */ if (readl(clksrc_dbx500_timer_base + PRCMU_TIMER_MODE) != - TIMER_MODE_CONTINOUS) { - writel(TIMER_MODE_CONTINOUS, + TIMER_MODE_CONTINUOUS) { + writel(TIMER_MODE_CONTINUOUS, clksrc_dbx500_timer_base + PRCMU_TIMER_MODE); writel(TIMER_DOWNCOUNT_VAL, clksrc_dbx500_timer_base + PRCMU_TIMER_REF); diff --git a/drivers/clocksource/dw_apb_timer_of.c b/drivers/clocksource/dw_apb_timer_of.c index 42e7e43b8fcd..2b2c3b586987 100644 --- a/drivers/clocksource/dw_apb_timer_of.c +++ b/drivers/clocksource/dw_apb_timer_of.c @@ -38,7 +38,7 @@ static int __init timer_get_base_and_rate(struct device_node *np, } /* - * Not all implementations use a periphal clock, so don't panic + * Not all implementations use a peripheral clock, so don't panic * if it's not present */ pclk = of_clk_get_by_name(np, "pclk"); diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 269a691bd2c4..a02b0a224807 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -457,7 +457,7 @@ void __init hv_init_clocksource(void) { /* * Try to set up the TSC page clocksource. If it succeeds, we're - * done. Otherwise, set up the MSR clocksoruce. At least one of + * done. Otherwise, set up the MSR clocksource. At least one of * these will always be available except on very old versions of * Hyper-V on x86. In that case we won't have a Hyper-V * clocksource, but Linux will still run with a clocksource based diff --git a/drivers/clocksource/timer-atmel-tcb.c b/drivers/clocksource/timer-atmel-tcb.c index 787dbebbb432..27af17c99590 100644 --- a/drivers/clocksource/timer-atmel-tcb.c +++ b/drivers/clocksource/timer-atmel-tcb.c @@ -455,9 +455,9 @@ static int __init tcb_clksrc_init(struct device_node *node) tcaddr = tc.regs; if (bits == 32) { - /* use apropriate function to read 32 bit counter */ + /* use appropriate function to read 32 bit counter */ clksrc.read = tc_get_cycles32; - /* setup ony channel 0 */ + /* setup only channel 0 */ tcb_setup_single_chan(&tc, best_divisor_idx); tc_sched_clock = tc_sched_clock_read32; tc_delay_timer.read_current_timer = tc_delay_timer_read32; diff --git a/drivers/clocksource/timer-fsl-ftm.c b/drivers/clocksource/timer-fsl-ftm.c index 12a2ed7cfaff..93f336ec875a 100644 --- a/drivers/clocksource/timer-fsl-ftm.c +++ b/drivers/clocksource/timer-fsl-ftm.c @@ -116,7 +116,7 @@ static int ftm_set_next_event(unsigned long delta, * to the MOD register latches the value into a buffer. The MOD * register is updated with the value of its write buffer with * the following scenario: - * a, the counter source clock is diabled. + * a, the counter source clock is disabled. */ ftm_counter_disable(priv->clkevt_base); diff --git a/drivers/clocksource/timer-microchip-pit64b.c b/drivers/clocksource/timer-microchip-pit64b.c index ab623b25a47b..cfa4ec7ef396 100644 --- a/drivers/clocksource/timer-microchip-pit64b.c +++ b/drivers/clocksource/timer-microchip-pit64b.c @@ -237,7 +237,7 @@ static void __init mchp_pit64b_pres_compute(u32 *pres, u32 clk_rate, break; } - /* Use the bigest prescaler if we didn't match one. */ + /* Use the biggest prescaler if we didn't match one. */ if (*pres == MCHP_PIT64B_PRES_MAX) *pres = MCHP_PIT64B_PRES_MAX - 1; } diff --git a/drivers/clocksource/timer-of.c b/drivers/clocksource/timer-of.c index 572da477c6d3..529cc6a51cdb 100644 --- a/drivers/clocksource/timer-of.c +++ b/drivers/clocksource/timer-of.c @@ -211,10 +211,10 @@ out_fail: } /** - * timer_of_cleanup - release timer_of ressources + * timer_of_cleanup - release timer_of resources * @to: timer_of structure * - * Release the ressources that has been used in timer_of_init(). + * Release the resources that has been used in timer_of_init(). * This function should be called in init error cases */ void __init timer_of_cleanup(struct timer_of *to) diff --git a/drivers/clocksource/timer-ti-dm-systimer.c b/drivers/clocksource/timer-ti-dm-systimer.c index 33b3e8aa2cc5..614c8380f3e9 100644 --- a/drivers/clocksource/timer-ti-dm-systimer.c +++ b/drivers/clocksource/timer-ti-dm-systimer.c @@ -589,7 +589,7 @@ static int __init dmtimer_clockevent_init(struct device_node *np) "always-on " : "", t->rate, np->parent); clockevents_config_and_register(dev, t->rate, - 3, /* Timer internal resynch latency */ + 3, /* Timer internal resync latency */ 0xffffffff); if (of_machine_is_compatible("ti,am33xx") || diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c index 1a86a4e7e344..911c92146eca 100644 --- a/drivers/clocksource/timer-vf-pit.c +++ b/drivers/clocksource/timer-vf-pit.c @@ -136,7 +136,7 @@ static int __init pit_clockevent_init(unsigned long rate, int irq) /* * The value for the LDVAL register trigger is calculated as: * LDVAL trigger = (period / clock period) - 1 - * The pit is a 32-bit down count timer, when the conter value + * The pit is a 32-bit down count timer, when the counter value * reaches 0, it will generate an interrupt, thus the minimal * LDVAL trigger value is 1. And then the min_delta is * minimal LDVAL trigger value + 1, and the max_delta is full 32-bit. diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 86d143db6523..a247b089ca78 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -70,7 +70,7 @@ struct module; * @mark_unstable: Optional function to inform the clocksource driver that * the watchdog marked the clocksource unstable * @tick_stable: Optional function called periodically from the watchdog - * code to provide stable syncrhonization points + * code to provide stable synchronization points * @wd_list: List head to enqueue into the watchdog list (internal) * @cs_last: Last clocksource value for clocksource watchdog * @wd_last: Last watchdog value corresponding to @cs_last diff --git a/include/linux/timex.h b/include/linux/timex.h index 9c2e54faf9b7..059b18eb1f1f 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -133,7 +133,7 @@ /* * kernel variables - * Note: maximum error = NTP synch distance = dispersion + delay / 2; + * Note: maximum error = NTP sync distance = dispersion + delay / 2; * estimated error = NTP dispersion. */ extern unsigned long tick_usec; /* USER_HZ period (usec) */ diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 98d7a15e8cf6..e9af8fae0bfb 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -2,13 +2,13 @@ /* * Alarmtimer interface * - * This interface provides a timer which is similarto hrtimers, + * This interface provides a timer which is similar to hrtimers, * but triggers a RTC alarm if the box is suspend. * * This interface is influenced by the Android RTC Alarm timer * interface. * - * Copyright (C) 2010 IBM Corperation + * Copyright (C) 2010 IBM Corporation * * Author: John Stultz */ @@ -811,7 +811,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) /** * alarm_timer_nsleep - alarmtimer nanosleep * @which_clock: clockid - * @flags: determins abstime or relative + * @flags: determines abstime or relative * @tsreq: requested sleep time (abs or rel) * * Handles clock_nanosleep calls against _ALARM clockids diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cce484a2cc7c..1d1a61371b5a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -38,7 +38,7 @@ * calculated mult and shift factors. This guarantees that no 64bit * overflow happens when the input value of the conversion is * multiplied with the calculated mult factor. Larger ranges may - * reduce the conversion accuracy by chosing smaller mult and shift + * reduce the conversion accuracy by choosing smaller mult and shift * factors. */ void @@ -518,7 +518,7 @@ static void clocksource_suspend_select(bool fallback) * the suspend time when resuming system. * * This function is called late in the suspend process from timekeeping_suspend(), - * that means processes are freezed, non-boot cpus and interrupts are disabled + * that means processes are frozen, non-boot cpus and interrupts are disabled * now. It is therefore possible to start the suspend timer without taking the * clocksource mutex. */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 788b9d137de4..30b356c93c78 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -683,7 +683,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) * T1 is removed, so this code is called and would reprogram * the hardware to 5s from now. Any hrtimer_start after that * will not reprogram the hardware due to hang_detected being - * set. So we'd effectivly block all timers until the T2 event + * set. So we'd effectively block all timers until the T2 event * fires. */ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) @@ -1019,7 +1019,7 @@ static void __remove_hrtimer(struct hrtimer *timer, * cpu_base->next_timer. This happens when we remove the first * timer on a remote cpu. No harm as we never dereference * cpu_base->next_timer. So the worst thing what can happen is - * an superflous call to hrtimer_force_reprogram() on the + * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ if (reprogram && timer == cpu_base->next_timer) @@ -1212,7 +1212,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) * The counterpart to hrtimer_cancel_wait_running(). * * If there is a waiter for cpu_base->expiry_lock, then it was waiting for - * the timer callback to finish. Drop expiry_lock and reaquire it. That + * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, @@ -1398,7 +1398,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, int base; /* - * On PREEMPT_RT enabled kernels hrtimers which are not explicitely + * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context for latency reasons and because the callbacks * can invoke functions which might sleep on RT, e.g. spin_lock(). @@ -1430,7 +1430,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used - * @mode: The modes which are relevant for intitialization: + * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * @@ -1487,7 +1487,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * insufficient for that. * * The sequence numbers are required because otherwise we could still observe - * a false negative if the read side got smeared over multiple consequtive + * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. */ @@ -1588,7 +1588,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search - * Tree, which can answer a stabbing querry for + * Tree, which can answer a stabbing query for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that @@ -1822,7 +1822,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { /* - * On PREEMPT_RT enabled kernels hrtimers which are not explicitely + * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context either for latency reasons or because the * hrtimer callback takes regular spinlocks or invokes other @@ -1835,7 +1835,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, * the same CPU. That causes a latency spike due to the wakeup of * a gazillion threads. * - * OTOH, priviledged real-time user space applications rely on the + * OTOH, privileged real-time user space applications rely on the * low latency of hard interrupt wakeups. If the current task is in * a real-time scheduling class, mark the mode for hard interrupt * expiry. diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a5cffe2a1770..a492e4da69ba 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -44,7 +44,7 @@ static u64 jiffies_read(struct clocksource *cs) * the timer interrupt frequency HZ and it suffers * inaccuracies caused by missed or lost timer * interrupts and the inability for the timer - * interrupt hardware to accuratly tick at the + * interrupt hardware to accurately tick at the * requested HZ value. It is also not recommended * for "tick-less" systems. */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5247afd7f345..406dccb79c2b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -544,7 +544,7 @@ static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec, struct timespec64 *to_set, const struct timespec64 *now) { - /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */ + /* Allowed error in tv_nsec, arbitrarily set to 5 jiffies in ns. */ const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; struct timespec64 delay = {.tv_sec = -1, .tv_nsec = set_offset_nsec}; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a71758e34e45..b145e6835e34 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -279,7 +279,7 @@ void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples) * @tsk: Task for which cputime needs to be started * @samples: Storage for time samples * - * The thread group cputime accouting is avoided when there are no posix + * The thread group cputime accounting is avoided when there are no posix * CPU timers armed. Before starting a timer it's required to check whether * the time accounting is active. If not, a full update of the atomic * accounting store needs to be done and the accounting enabled. @@ -390,7 +390,7 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) /* * If posix timer expiry is handled in task work context then * timer::it_lock can be taken without disabling interrupts as all - * other locking happens in task context. This requires a seperate + * other locking happens in task context. This requires a separate * lock class key otherwise regular posix timer expiry would record * the lock class being taken in interrupt context and generate a * false positive warning. @@ -1216,7 +1216,7 @@ static void handle_posix_cpu_timers(struct task_struct *tsk) check_process_timers(tsk, &firing); /* - * The above timer checks have updated the exipry cache and + * The above timer checks have updated the expiry cache and * because nothing can have queued or modified timers after * sighand lock was taken above it is guaranteed to be * consistent. So the next timer interrupt fastpath check diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index b5a65e212df2..797eb93103ad 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -53,7 +53,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * reasons. * * Each caller tries to arm the hrtimer on its own CPU, but if the - * hrtimer callbback function is currently running, then + * hrtimer callback function is currently running, then * hrtimer_start() cannot move it and the timer stays on the CPU on * which it is assigned at the moment. * diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 5a23829372c7..6ec7855ab88d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -157,7 +157,7 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev) } /* - * Check, if the device is disfunctional and a place holder, which + * Check, if the device is dysfunctional and a placeholder, which * needs to be handled by the broadcast device. */ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) @@ -391,7 +391,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) * - the broadcast device exists * - the broadcast device is not a hrtimer based one * - the broadcast device is in periodic mode to - * avoid a hickup during switch to oneshot mode + * avoid a hiccup during switch to oneshot mode */ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index f9745d47425a..475ecceda768 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -45,7 +45,7 @@ int tick_program_event(ktime_t expires, int force) } /** - * tick_resume_onshot - resume oneshot mode + * tick_resume_oneshot - resume oneshot mode */ void tick_resume_oneshot(void) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e10a4af88737..128735e3e77e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -751,7 +751,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) * Aside of that check whether the local timer softirq is * pending. If so its a bad idea to call get_next_timer_interrupt() * because there is an already expired timer, so it will request - * immeditate expiry, which rearms the hardware timer with a + * immediate expiry, which rearms the hardware timer with a * minimal delta which brings us back to this place * immediately. Lather, rinse and repeat... */ diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 4fb06527cf64..d952ae393423 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -29,7 +29,7 @@ enum tick_nohz_mode { * @inidle: Indicator that the CPU is in the tick idle mode * @tick_stopped: Indicator that the idle tick has been stopped * @idle_active: Indicator that the CPU is actively in the tick idle mode; - * it is resetted during irq handling phases. + * it is reset during irq handling phases. * @do_timer_lst: CPU was the last one doing do_timer before going idle * @got_idle_tick: Tick timer function has run with @inidle set * @last_tick: Store the last tick expiry time when the tick diff --git a/kernel/time/time.c b/kernel/time/time.c index 3985b2b32d08..29923b20e0e4 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -571,7 +571,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies); /* * The TICK_NSEC - 1 rounds up the value to the next resolution. Note * that a remainder subtract here would not do the right thing as the - * resolution values don't fall on second boundries. I.e. the line: + * resolution values don't fall on second boundaries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. * Note that due to the small error in the multiplier here, this * rounding is incorrect for sufficiently large values of tv_nsec, but diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6aee5768c86f..77bafd8c8df2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -596,14 +596,14 @@ EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); * careful cache layout of the timekeeper because the sequence count and * struct tk_read_base would then need two cache lines instead of one. * - * Access to the time keeper clock source is disabled accross the innermost + * Access to the time keeper clock source is disabled across the innermost * steps of suspend/resume. The accessors still work, but the timestamps * are frozen until time keeping is resumed which happens very early. * * For regular suspend/resume there is no observable difference vs. sched * clock, but it might affect some of the nasty low level debug printks. * - * OTOH, access to sched clock is not guaranteed accross suspend/resume on + * OTOH, access to sched clock is not guaranteed across suspend/resume on * all systems either so it depends on the hardware in use. * * If that turns out to be a real problem then this could be mitigated by @@ -899,7 +899,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); /** - * ktime_mono_to_any() - convert mononotic time to any other time + * ktime_mono_to_any() - convert monotonic time to any other time * @tmono: time to convert. * @offs: which offset to use */ @@ -1948,7 +1948,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, * xtime_nsec_1 = offset + xtime_nsec_2 * Which gives us: * xtime_nsec_2 = xtime_nsec_1 - offset - * Which simplfies to: + * Which simplifies to: * xtime_nsec -= offset */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { @@ -2336,7 +2336,7 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) /* * Validate if a timespec/timeval used to inject a time - * offset is valid. Offsets can be postive or negative, so + * offset is valid. Offsets can be positive or negative, so * we don't check tv_sec. The value of the timeval/timespec * is the sum of its fields,but *NOTE*: * The field tv_usec/tv_nsec must always be non-negative and diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f475f1a027c8..d111adf4a0cb 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -894,7 +894,7 @@ static inline void forward_timer_base(struct timer_base *base) /* * No need to forward if we are close enough below jiffies. * Also while executing timers, base->clk is 1 offset ahead - * of jiffies to avoid endless requeuing to current jffies. + * of jiffies to avoid endless requeuing to current jiffies. */ if ((long)(jnow - base->clk) < 1) return; @@ -1271,7 +1271,7 @@ static inline void timer_base_unlock_expiry(struct timer_base *base) * The counterpart to del_timer_wait_running(). * * If there is a waiter for base->expiry_lock, then it was waiting for the - * timer callback to finish. Drop expiry_lock and reaquire it. That allows + * timer callback to finish. Drop expiry_lock and reacquire it. That allows * the waiter to acquire the lock and make progress. */ static void timer_sync_wait_running(struct timer_base *base) diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 88e6b8ed6ca5..f0d5062d9cbc 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -108,7 +108,7 @@ void update_vsyscall(struct timekeeper *tk) /* * If the current clocksource is not VDSO capable, then spare the - * update of the high reolution parts. + * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c index bfc974b4572d..ef8eb3604595 100644 --- a/tools/testing/selftests/timers/clocksource-switch.c +++ b/tools/testing/selftests/timers/clocksource-switch.c @@ -3,7 +3,7 @@ * (C) Copyright IBM 2012 * Licensed under the GPLv2 * - * NOTE: This is a meta-test which quickly changes the clocksourc and + * NOTE: This is a meta-test which quickly changes the clocksource and * then uses other tests to detect problems. Thus this test requires * that the inconsistency-check and nanosleep tests be present in the * same directory it is run from. @@ -134,7 +134,7 @@ int main(int argv, char **argc) return -1; } - /* Check everything is sane before we start switching asyncrhonously */ + /* Check everything is sane before we start switching asynchronously */ for (i = 0; i < count; i++) { printf("Validating clocksource %s\n", clocksource_list[i]); if (change_clocksource(clocksource_list[i])) { diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c index 19e46ed5dfb5..23eb398c8140 100644 --- a/tools/testing/selftests/timers/leap-a-day.c +++ b/tools/testing/selftests/timers/leap-a-day.c @@ -5,7 +5,7 @@ * Licensed under the GPLv2 * * This test signals the kernel to insert a leap second - * every day at midnight GMT. This allows for stessing the + * every day at midnight GMT. This allows for stressing the * kernel's leap-second behavior, as well as how well applications * handle the leap-second discontinuity. * diff --git a/tools/testing/selftests/timers/leapcrash.c b/tools/testing/selftests/timers/leapcrash.c index dc80728ed191..f70802c5dd0d 100644 --- a/tools/testing/selftests/timers/leapcrash.c +++ b/tools/testing/selftests/timers/leapcrash.c @@ -4,10 +4,10 @@ * (C) Copyright 2013, 2015 Linaro Limited * Licensed under the GPL * - * This test demonstrates leapsecond deadlock that is possibe + * This test demonstrates leapsecond deadlock that is possible * on kernels from 2.6.26 to 3.3. * - * WARNING: THIS WILL LIKELY HARDHANG SYSTEMS AND MAY LOSE DATA + * WARNING: THIS WILL LIKELY HARD HANG SYSTEMS AND MAY LOSE DATA * RUN AT YOUR OWN RISK! * To build: * $ gcc leapcrash.c -o leapcrash -lrt diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c index cf3e48919874..80aed4bf06fb 100644 --- a/tools/testing/selftests/timers/threadtest.c +++ b/tools/testing/selftests/timers/threadtest.c @@ -76,7 +76,7 @@ void checklist(struct timespec *list, int size) /* The shared thread shares a global list * that each thread fills while holding the lock. - * This stresses clock syncronization across cpus. + * This stresses clock synchronization across cpus. */ void *shared_thread(void *arg) { -- cgit v1.2.3 From 78b226d48106fc91628f941c66545f05273269df Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 19 Mar 2021 13:59:08 -0700 Subject: libbpf: Skip BTF fixup if object file has no BTF Skip BTF fixup step when input object file is missing BTF altogether. Fixes: 8fd27bf69b86 ("libbpf: Add BPF static linker BTF and BTF.ext support") Reported-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Tested-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20210319205909.1748642-3-andrii@kernel.org --- tools/lib/bpf/linker.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index b4fff912dce2..5e0aa2f2c0ca 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -1313,6 +1313,9 @@ static int linker_fixup_btf(struct src_obj *obj) struct src_sec *sec; int i, j, n, m; + if (!obj->btf) + return 0; + n = btf__get_nr_types(obj->btf); for (i = 1; i <= n; i++) { struct btf_var_secinfo *vi; -- cgit v1.2.3 From 2c137388d685e11cf621b56bf06c4f3a1a8ff7be Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 18 Feb 2021 17:12:09 +0800 Subject: firmware_loader: Remove unnecessary conversion to bool Fix the following coccicheck warnings: ./tools/testing/selftests/firmware/fw_namespace.c:98:54-59: WARNING: conversion to bool not needed here. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/1613639529-41139-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/firmware/fw_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/firmware/fw_namespace.c b/tools/testing/selftests/firmware/fw_namespace.c index 5ebc1aec7923..0e393cb5f42d 100644 --- a/tools/testing/selftests/firmware/fw_namespace.c +++ b/tools/testing/selftests/firmware/fw_namespace.c @@ -95,7 +95,7 @@ static bool test_fw_in_ns(const char *fw_name, const char *sys_path, bool block_ } if (block_fw_in_parent_ns) umount("/lib/firmware"); - return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false; + return WEXITSTATUS(status) == EXIT_SUCCESS; } if (unshare(CLONE_NEWNS) != 0) { -- cgit v1.2.3 From 4a423645bc2690376a7a94b4bb7b2f74bc6206ff Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 19 Mar 2021 16:53:24 +0000 Subject: kselftest/arm64: mte: Fix compilation with native compiler The mte selftest Makefile contains a check for GCC, to add the memtag -march flag to the compiler options. This check fails if the compiler is not explicitly specified, so reverts to the standard "cc", in which case --version doesn't mention the "gcc" string we match against: $ cc --version | head -n 1 cc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0 This will not add the -march switch to the command line, so compilation fails: mte_helper.S: Assembler messages: mte_helper.S:25: Error: selected processor does not support `irg x0,x0,xzr' mte_helper.S:38: Error: selected processor does not support `gmi x1,x0,xzr' ... Actually clang accepts the same -march option as well, so we can just drop this check and add this unconditionally to the command line, to avoid any future issues with this check altogether (gcc actually prints basename(argv[0]) when called with --version). Signed-off-by: Andre Przywara Reviewed-by: Nick Desaulniers Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319165334.29213-2-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/Makefile | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile index 0b3af552632a..df15d44aeb8d 100644 --- a/tools/testing/selftests/arm64/mte/Makefile +++ b/tools/testing/selftests/arm64/mte/Makefile @@ -6,9 +6,7 @@ SRCS := $(filter-out mte_common_util.c,$(wildcard *.c)) PROGS := $(patsubst %.c,%,$(SRCS)) #Add mte compiler option -ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep gcc),) CFLAGS += -march=armv8.5-a+memtag -endif #check if the compiler works well mte_cc_support := $(shell if ($(CC) $(CFLAGS) -E -x c /dev/null -o /dev/null 2>&1) then echo "1"; fi) -- cgit v1.2.3 From e5decefd884da1c2c6ea18fca17b80b189afcc43 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 19 Mar 2021 16:53:25 +0000 Subject: kselftest/arm64: mte: Fix pthread linking The GCC manual suggests to use -pthread, when linking with the PThread library, also to add this switch to both the compilation and linking stages. Do as the manual says, to fix compilation with Ubuntu's 20.04 toolchain, which was getting -lpthread too early on the command line: ------------ /usr/bin/ld: /tmp/cc5zbo2A.o: in function `execute_test': tools/testing/selftests/arm64/mte/check_gcr_el1_cswitch.c:86: undefined reference to `pthread_create' /usr/bin/ld: tools/testing/selftests/arm64/mte/check_gcr_el1_cswitch.c:90: undefined reference to `pthread_join' ------------ Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319165334.29213-3-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile index df15d44aeb8d..90aadd86fa0d 100644 --- a/tools/testing/selftests/arm64/mte/Makefile +++ b/tools/testing/selftests/arm64/mte/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright (C) 2020 ARM Limited -CFLAGS += -std=gnu99 -I. -lpthread +CFLAGS += -std=gnu99 -I. -pthread +LDFLAGS += -pthread SRCS := $(filter-out mte_common_util.c,$(wildcard *.c)) PROGS := $(patsubst %.c,%,$(SRCS)) -- cgit v1.2.3 From 31c88729a7ad2b9871e6a73e491ec7223880d633 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 19 Mar 2021 16:53:26 +0000 Subject: kselftest/arm64: mte: ksm_options: Fix fscanf warning Out of the box Ubuntu's 20.04 compiler warns about missing return value checks for fscanf() calls. Make GCC happy by checking whether we actually parsed one integer. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319165334.29213-4-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/check_ksm_options.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/check_ksm_options.c b/tools/testing/selftests/arm64/mte/check_ksm_options.c index 3b23c4d61d38..88c74bc46d4f 100644 --- a/tools/testing/selftests/arm64/mte/check_ksm_options.c +++ b/tools/testing/selftests/arm64/mte/check_ksm_options.c @@ -33,7 +33,10 @@ static unsigned long read_sysfs(char *str) ksft_print_msg("ERR: missing %s\n", str); return 0; } - fscanf(f, "%lu", &val); + if (fscanf(f, "%lu", &val) != 1) { + ksft_print_msg("ERR: parsing %s\n", str); + val = 0; + } fclose(f); return val; } -- cgit v1.2.3 From 4d39c89f0b94bf4a6e1ccf42702e7d80d210a5fd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Mar 2021 17:09:15 +0100 Subject: perf tools: Fix various typos in comments Fix ~124 single-word typos and a few spelling errors in the perf tooling code, accumulated over the years. Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210321113734.GA248990@gmail.com Link: http://lore.kernel.org/lkml/20210323160915.GA61903@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-buildid-cache.txt | 2 +- tools/perf/Documentation/perf-report.txt | 2 +- tools/perf/Documentation/perf-top.txt | 2 +- tools/perf/arch/arm/util/cs-etm.c | 2 +- tools/perf/arch/arm64/util/machine.c | 6 +++--- tools/perf/arch/arm64/util/perf_regs.c | 2 +- tools/perf/arch/powerpc/util/kvm-stat.c | 2 +- tools/perf/arch/powerpc/util/utils_header.h | 2 +- tools/perf/arch/x86/tests/bp-modify.c | 2 +- tools/perf/arch/x86/util/perf_regs.c | 4 ++-- tools/perf/bench/epoll-wait.c | 4 ++-- tools/perf/bench/numa.c | 2 +- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-diff.c | 2 +- tools/perf/builtin-lock.c | 2 +- tools/perf/builtin-sched.c | 2 +- tools/perf/builtin-script.c | 4 ++-- tools/perf/builtin-stat.c | 4 ++-- tools/perf/builtin-top.c | 2 +- tools/perf/examples/bpf/augmented_raw_syscalls.c | 4 ++-- tools/perf/jvmti/jvmti_agent.c | 4 ++-- tools/perf/pmu-events/arch/powerpc/power8/metrics.json | 12 ++++++------ tools/perf/pmu-events/arch/powerpc/power9/metrics.json | 2 +- tools/perf/pmu-events/jevents.c | 2 +- tools/perf/scripts/python/netdev-times.py | 2 +- tools/perf/tests/bp_signal.c | 6 +++--- tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/hists_cumulate.c | 4 ++-- tools/perf/tests/parse-events.c | 2 +- tools/perf/tests/parse-metric.c | 2 +- tools/perf/tests/topology.c | 2 +- tools/perf/trace/beauty/include/linux/socket.h | 2 +- tools/perf/ui/browsers/annotate.c | 2 +- tools/perf/ui/browsers/hists.c | 2 +- tools/perf/util/bpf-loader.c | 2 +- tools/perf/util/call-path.h | 2 +- tools/perf/util/callchain.c | 2 +- tools/perf/util/config.c | 2 +- tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 2 +- tools/perf/util/cs-etm.c | 8 ++++---- tools/perf/util/cs-etm.h | 5 +++-- tools/perf/util/data-convert-bt.c | 2 +- tools/perf/util/demangle-java.c | 4 ++-- tools/perf/util/dso.h | 2 +- tools/perf/util/dwarf-aux.c | 6 +++--- tools/perf/util/dwarf-aux.h | 2 +- tools/perf/util/events_stats.h | 2 +- tools/perf/util/evlist.c | 2 +- tools/perf/util/evsel.c | 4 ++-- tools/perf/util/expr.h | 2 +- tools/perf/util/header.c | 18 +++++++++--------- tools/perf/util/intel-pt.c | 2 +- tools/perf/util/levenshtein.c | 2 +- tools/perf/util/libunwind/arm64.c | 2 +- tools/perf/util/libunwind/x86_32.c | 2 +- tools/perf/util/llvm-utils.c | 2 +- tools/perf/util/machine.c | 8 ++++---- tools/perf/util/map.h | 4 ++-- tools/perf/util/mem-events.h | 2 +- tools/perf/util/metricgroup.c | 2 +- tools/perf/util/parse-events.c | 10 +++++----- tools/perf/util/pmu.c | 4 ++-- tools/perf/util/probe-event.c | 4 ++-- tools/perf/util/probe-finder.c | 6 +++--- tools/perf/util/s390-cpumsf.c | 10 +++++----- tools/perf/util/scripting-engines/trace-event-python.c | 2 +- tools/perf/util/session.c | 4 ++-- tools/perf/util/strbuf.h | 2 +- tools/perf/util/strfilter.h | 4 ++-- tools/perf/util/symbol-elf.c | 2 +- tools/perf/util/synthetic-events.c | 4 ++-- tools/perf/util/unwind-libunwind-local.c | 2 +- 72 files changed, 124 insertions(+), 123 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt index bb167e32a1d7..cd8ce6e8ec12 100644 --- a/tools/perf/Documentation/perf-buildid-cache.txt +++ b/tools/perf/Documentation/perf-buildid-cache.txt @@ -57,7 +57,7 @@ OPTIONS -u:: --update=:: Update specified file of the cache. Note that this doesn't remove - older entires since those may be still needed for annotating old + older entries since those may be still needed for annotating old (or remote) perf.data. Only if there is already a cache which has exactly same build-id, that is replaced by new one. It can be used to update kallsyms and kernel dso to vmlinux in order to support diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 87112e8d904e..822b16f05c15 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -475,7 +475,7 @@ OPTIONS but probably we'll make the default not to show the switch-on/off events on the --group mode and if there is only one event besides the off/on ones, go straight to the histogram browser, just like 'perf report' with no events - explicitely specified does. + explicitly specified does. --itrace:: Options for decoding instruction tracing data. The options are: diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index ee2024691d46..bba5ffb05463 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -317,7 +317,7 @@ Default is to monitor all CPUS. but probably we'll make the default not to show the switch-on/off events on the --group mode and if there is only one event besides the off/on ones, go straight to the histogram browser, just like 'perf top' with no events - explicitely specified does. + explicitly specified does. --stitch-lbr:: Show callgraph with stitched LBRs, which may have more complete diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 0a44bc462cec..d942f118d32c 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -378,7 +378,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, opts->auxtrace_mmap_pages = roundup_pow_of_two(sz); } - /* Snapshost size can't be bigger than the auxtrace area */ + /* Snapshot size can't be bigger than the auxtrace area */ if (opts->auxtrace_snapshot_size > opts->auxtrace_mmap_pages * (size_t)page_size) { pr_err("Snapshot size %zu must not be greater than AUX area tracing mmap size %zu\n", diff --git a/tools/perf/arch/arm64/util/machine.c b/tools/perf/arch/arm64/util/machine.c index 40c5e0b5bda8..7e7714290a87 100644 --- a/tools/perf/arch/arm64/util/machine.c +++ b/tools/perf/arch/arm64/util/machine.c @@ -6,11 +6,11 @@ #include "debug.h" #include "symbol.h" -/* On arm64, kernel text segment start at high memory address, +/* On arm64, kernel text segment starts at high memory address, * for example 0xffff 0000 8xxx xxxx. Modules start at a low memory - * address, like 0xffff 0000 00ax xxxx. When only samll amount of + * address, like 0xffff 0000 00ax xxxx. When only small amount of * memory is used by modules, gap between end of module's text segment - * and start of kernel text segment may be reach 2G. + * and start of kernel text segment may reach 2G. * Therefore do not fill this gap and do not assign it to the kernel dso map. */ diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c index 2518cde18b34..476b037eea1c 100644 --- a/tools/perf/arch/arm64/util/perf_regs.c +++ b/tools/perf/arch/arm64/util/perf_regs.c @@ -108,7 +108,7 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op) /* [sp], [sp, NUM] or [sp,NUM] */ new_len = 7; /* + ( % s p ) NULL */ - /* If the arugment is [sp], need to fill offset '0' */ + /* If the argument is [sp], need to fill offset '0' */ if (rm[2].rm_so == -1) new_len += 1; else diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index eed9e5a42935..16510686c138 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -176,7 +176,7 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) } /* - * Incase of powerpc architecture, pmu registers are programmable + * In case of powerpc architecture, pmu registers are programmable * by guest kernel. So monitoring guest via host may not provide * valid samples with default 'cycles' event. It is better to use * 'trace_imc/trace_cycles' event for guest profiling, since it diff --git a/tools/perf/arch/powerpc/util/utils_header.h b/tools/perf/arch/powerpc/util/utils_header.h index 5788eb1f1fe3..2baeb1c1ae85 100644 --- a/tools/perf/arch/powerpc/util/utils_header.h +++ b/tools/perf/arch/powerpc/util/utils_header.h @@ -10,6 +10,6 @@ #define SPRN_PVR 0x11F /* Processor Version Register */ #define PVR_VER(pvr) (((pvr) >> 16) & 0xFFFF) /* Version field */ -#define PVR_REV(pvr) (((pvr) >> 0) & 0xFFFF) /* Revison field */ +#define PVR_REV(pvr) (((pvr) >> 0) & 0xFFFF) /* Revision field */ #endif /* __PERF_UTIL_HEADER_H */ diff --git a/tools/perf/arch/x86/tests/bp-modify.c b/tools/perf/arch/x86/tests/bp-modify.c index adcacf1b6609..dffcf9b52153 100644 --- a/tools/perf/arch/x86/tests/bp-modify.c +++ b/tools/perf/arch/x86/tests/bp-modify.c @@ -73,7 +73,7 @@ static int bp_modify1(void) /* * The parent does following steps: * - creates a new breakpoint (id 0) for bp_2 function - * - changes that breakponit to bp_1 function + * - changes that breakpoint to bp_1 function * - waits for the breakpoint to hit and checks * it has proper rip of bp_1 function * - detaches the child diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c index fca81b39b09f..207c56805c55 100644 --- a/tools/perf/arch/x86/util/perf_regs.c +++ b/tools/perf/arch/x86/util/perf_regs.c @@ -165,7 +165,7 @@ static int sdt_init_op_regex(void) /* * Max x86 register name length is 5(ex: %r15d). So, 6th char * should always contain NULL. This helps to find register name - * length using strlen, insted of maintaing one more variable. + * length using strlen, instead of maintaining one more variable. */ #define SDT_REG_NAME_SIZE 6 @@ -207,7 +207,7 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op) * and displacement 0 (Both sign and displacement 0 are * optional so it may be empty). Use one more character * to hold last NULL so that strlen can be used to find - * prefix length, instead of maintaing one more variable. + * prefix length, instead of maintaining one more variable. */ char prefix[3] = {0}; diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c index 0a0ff1247c83..79d13dbc0a47 100644 --- a/tools/perf/bench/epoll-wait.c +++ b/tools/perf/bench/epoll-wait.c @@ -17,7 +17,7 @@ * While the second model, enabled via --multiq option, uses multiple * queueing (which refers to one epoll instance per worker). For example, * short lived tcp connections in a high throughput httpd server will - * ditribute the accept()'ing connections across CPUs. In this case each + * distribute the accept()'ing connections across CPUs. In this case each * worker does a limited amount of processing. * * [queue A] ---> [worker] @@ -198,7 +198,7 @@ static void *workerfn(void *arg) do { /* - * Block undefinitely waiting for the IN event. + * Block indefinitely waiting for the IN event. * In order to stress the epoll_wait(2) syscall, * call it event per event, instead of a larger * batch (max)limit. diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 20b87e29c96f..f2640179ada9 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -42,7 +42,7 @@ #endif /* - * Regular printout to the terminal, supressed if -q is specified: + * Regular printout to the terminal, suppressed if -q is specified: */ #define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index a23ba6bb99b6..0f3a196e5d6e 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -239,7 +239,7 @@ static int evsel__add_sample(struct evsel *evsel, struct perf_sample *sample, } /* - * XXX filtered samples can still have branch entires pointing into our + * XXX filtered samples can still have branch entries pointing into our * symbol and are missed. */ process_branch_stack(sample->branch_stack, al, sample); diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 878e04b1fab7..f52b3a799e76 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1796,7 +1796,7 @@ static int ui_init(void) data__for_each_file(i, d) { /* - * Baseline or compute realted columns: + * Baseline or compute related columns: * * PERF_HPP_DIFF__BASELINE * PERF_HPP_DIFF__DELTA diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index a2f1e53f37a7..01326e370009 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -49,7 +49,7 @@ struct lock_stat { /* * FIXME: evsel__intval() returns u64, - * so address of lockdep_map should be dealed as 64bit. + * so address of lockdep_map should be treated as 64bit. * Is there more better solution? */ void *addr; /* address of lockdep_map, used as ID */ diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 69c769b04a61..954ce2f594e9 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1712,7 +1712,7 @@ static int perf_sched__process_fork_event(struct perf_tool *tool, { struct perf_sched *sched = container_of(tool, struct perf_sched, tool); - /* run the fork event through the perf machineruy */ + /* run the fork event through the perf machinery */ perf_event__process_fork(tool, event, sample, machine); /* and then run additional processing needed for this command */ diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 119a5f7e2772..1280cbfad4db 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2486,7 +2486,7 @@ static int perf_script__fopen_per_event_dump(struct perf_script *script) /* * Already setup? I.e. we may be called twice in cases like * Intel PT, one for the intel_pt// and dummy events, then - * for the evsels syntheized from the auxtrace info. + * for the evsels synthesized from the auxtrace info. * * Ses perf_script__process_auxtrace_info. */ @@ -3083,7 +3083,7 @@ static int list_available_scripts(const struct option *opt __maybe_unused, * * Fixme: All existing "xxx-record" are all in good formats "-e event ", * which is covered well now. And new parsing code should be added to - * cover the future complexing formats like event groups etc. + * cover the future complex formats like event groups etc. */ static int check_ev_match(char *dir_name, char *scriptname, struct perf_session *session) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 2e2e4a8345ea..d140ffbb5b34 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1705,7 +1705,7 @@ static int add_default_attributes(void) bzero(&errinfo, sizeof(errinfo)); if (transaction_run) { /* Handle -T as -M transaction. Once platform specific metrics - * support has been added to the json files, all archictures + * support has been added to the json files, all architectures * will use this approach. To determine transaction support * on an architecture test for such a metric name. */ @@ -2459,7 +2459,7 @@ int cmd_stat(int argc, const char **argv) /* * We synthesize the kernel mmap record just so that older tools * don't emit warnings about not being able to resolve symbols - * due to /proc/sys/kernel/kptr_restrict settings and instear provide + * due to /proc/sys/kernel/kptr_restrict settings and instead provide * a saner message about no samples being in the perf.data file. * * This also serves to suppress a warning about f_header.data.size == 0 diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 3673c04d16b6..173ace43f845 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1607,7 +1607,7 @@ int cmd_top(int argc, const char **argv) if (status) { /* * Some arches do not provide a get_cpuid(), so just use pr_debug, otherwise - * warn the user explicitely. + * warn the user explicitly. */ eprintf(status == ENOSYS ? 1 : 0, verbose, "Couldn't read the cpuid for this machine: %s\n", diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c index b80437971d80..a262dcd020f4 100644 --- a/tools/perf/examples/bpf/augmented_raw_syscalls.c +++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c @@ -262,7 +262,7 @@ int sys_enter(struct syscall_enter_args *args) /* * Jump to syscall specific augmenter, even if the default one, * "!raw_syscalls:unaugmented" that will just return 1 to return the - * unagmented tracepoint payload. + * unaugmented tracepoint payload. */ bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr); @@ -282,7 +282,7 @@ int sys_exit(struct syscall_exit_args *args) /* * Jump to syscall specific return augmenter, even if the default one, * "!raw_syscalls:unaugmented" that will just return 1 to return the - * unagmented tracepoint payload. + * unaugmented tracepoint payload. */ bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr); /* diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c index 88108598d6e9..526dcaf9f079 100644 --- a/tools/perf/jvmti/jvmti_agent.c +++ b/tools/perf/jvmti/jvmti_agent.c @@ -390,7 +390,7 @@ jvmti_write_code(void *agent, char const *sym, rec.p.total_size += size; /* - * If JVM is multi-threaded, nultiple concurrent calls to agent + * If JVM is multi-threaded, multiple concurrent calls to agent * may be possible, so protect file writes */ flockfile(fp); @@ -457,7 +457,7 @@ jvmti_write_debug_info(void *agent, uint64_t code, rec.p.total_size = size; /* - * If JVM is multi-threaded, nultiple concurrent calls to agent + * If JVM is multi-threaded, multiple concurrent calls to agent * may be possible, so protect file writes */ flockfile(fp); diff --git a/tools/perf/pmu-events/arch/powerpc/power8/metrics.json b/tools/perf/pmu-events/arch/powerpc/power8/metrics.json index fc4aa6c2ddc9..4e25525b7da6 100644 --- a/tools/perf/pmu-events/arch/powerpc/power8/metrics.json +++ b/tools/perf/pmu-events/arch/powerpc/power8/metrics.json @@ -885,37 +885,37 @@ "MetricName": "flush_rate_percent" }, { - "BriefDescription": "GCT slot utilization (11 to 14) as a % of cycles this thread had atleast 1 slot valid", + "BriefDescription": "GCT slot utilization (11 to 14) as a % of cycles this thread had at least 1 slot valid", "MetricExpr": "PM_GCT_UTIL_11_14_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", "MetricGroup": "general", "MetricName": "gct_util_11to14_slots_percent" }, { - "BriefDescription": "GCT slot utilization (15 to 17) as a % of cycles this thread had atleast 1 slot valid", + "BriefDescription": "GCT slot utilization (15 to 17) as a % of cycles this thread had at least 1 slot valid", "MetricExpr": "PM_GCT_UTIL_15_17_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", "MetricGroup": "general", "MetricName": "gct_util_15to17_slots_percent" }, { - "BriefDescription": "GCT slot utilization 18+ as a % of cycles this thread had atleast 1 slot valid", + "BriefDescription": "GCT slot utilization 18+ as a % of cycles this thread had at least 1 slot valid", "MetricExpr": "PM_GCT_UTIL_18_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", "MetricGroup": "general", "MetricName": "gct_util_18plus_slots_percent" }, { - "BriefDescription": "GCT slot utilization (1 to 2) as a % of cycles this thread had atleast 1 slot valid", + "BriefDescription": "GCT slot utilization (1 to 2) as a % of cycles this thread had at least 1 slot valid", "MetricExpr": "PM_GCT_UTIL_1_2_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", "MetricGroup": "general", "MetricName": "gct_util_1to2_slots_percent" }, { - "BriefDescription": "GCT slot utilization (3 to 6) as a % of cycles this thread had atleast 1 slot valid", + "BriefDescription": "GCT slot utilization (3 to 6) as a % of cycles this thread had at least 1 slot valid", "MetricExpr": "PM_GCT_UTIL_3_6_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", "MetricGroup": "general", "MetricName": "gct_util_3to6_slots_percent" }, { - "BriefDescription": "GCT slot utilization (7 to 10) as a % of cycles this thread had atleast 1 slot valid", + "BriefDescription": "GCT slot utilization (7 to 10) as a % of cycles this thread had at least 1 slot valid", "MetricExpr": "PM_GCT_UTIL_7_10_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", "MetricGroup": "general", "MetricName": "gct_util_7to10_slots_percent" diff --git a/tools/perf/pmu-events/arch/powerpc/power9/metrics.json b/tools/perf/pmu-events/arch/powerpc/power9/metrics.json index 140402d2855f..db86ba36224d 100644 --- a/tools/perf/pmu-events/arch/powerpc/power9/metrics.json +++ b/tools/perf/pmu-events/arch/powerpc/power9/metrics.json @@ -1691,7 +1691,7 @@ "MetricName": "custom_secs" }, { - "BriefDescription": "Percentage Cycles atleast one instruction dispatched", + "BriefDescription": "Percentage Cycles at least one instruction dispatched", "MetricExpr": "PM_1PLUS_PPC_DISP / PM_CYC * 100", "MetricName": "cycles_atleast_one_inst_dispatched_percent" }, diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index e1f3f5c8c550..33aa3c885eaf 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -1149,7 +1149,7 @@ static int process_one_file(const char *fpath, const struct stat *sb, * and directory tree could result in build failure due to table * names not being found. * - * Atleast for now, be strict with processing JSON file names. + * At least for now, be strict with processing JSON file names. * i.e. if JSON file name cannot be mapped to C-style table name, * fail. */ diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py index ea0c8b90a783..a0cfc7fe5908 100644 --- a/tools/perf/scripts/python/netdev-times.py +++ b/tools/perf/scripts/python/netdev-times.py @@ -356,7 +356,7 @@ def handle_irq_softirq_exit(event_info): return rec_data = {'sirq_ent_t':sirq_ent_t, 'sirq_ext_t':time, 'irq_list':irq_list, 'event_list':event_list} - # merge information realted to a NET_RX softirq + # merge information related to a NET_RX softirq receive_hunk_list.append(rec_data) def handle_napi_poll(event_info): diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c index cc9fbcedb364..ef37353636d8 100644 --- a/tools/perf/tests/bp_signal.c +++ b/tools/perf/tests/bp_signal.c @@ -225,11 +225,11 @@ int test__bp_signal(struct test *test __maybe_unused, int subtest __maybe_unused * * The test case check following error conditions: * - we get stuck in signal handler because of debug - * exception being triggered receursively due to + * exception being triggered recursively due to * the wrong RF EFLAG management * * - we never trigger the sig_handler breakpoint due - * to the rong RF EFLAG management + * to the wrong RF EFLAG management * */ @@ -242,7 +242,7 @@ int test__bp_signal(struct test *test __maybe_unused, int subtest __maybe_unused ioctl(fd3, PERF_EVENT_IOC_ENABLE, 0); /* - * Kick off the test by trigering 'fd1' + * Kick off the test by triggering 'fd1' * breakpoint. */ test_function(); diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 2fdc7b2f996e..9866cddebf23 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -658,7 +658,7 @@ static int do_test_code_reading(bool try_kcore) /* * Both cpus and threads are now owned by evlist * and will be freed by following perf_evlist__set_maps - * call. Getting refference to keep them alive. + * call. Getting reference to keep them alive. */ perf_cpu_map__get(cpus); perf_thread_map__get(threads); diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index 3f2e1a581247..890cb1f5bf53 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -47,7 +47,7 @@ static struct sample fake_samples[] = { }; /* - * Will be casted to struct ip_callchain which has all 64 bit entries + * Will be cast to struct ip_callchain which has all 64 bit entries * of nr and ips[]. */ static u64 fake_callchains[][10] = { @@ -297,7 +297,7 @@ out: return err; } -/* callcain + NO children */ +/* callchain + NO children */ static int test2(struct evsel *evsel, struct machine *machine) { int err; diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index a7f6661e6112..026c54743311 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -20,7 +20,7 @@ #if defined(__s390x__) /* Return true if kvm module is available and loaded. Test this - * and retun success when trace point kvm_s390_create_vm + * and return success when trace point kvm_s390_create_vm * exists. Otherwise this test always fails. */ static bool kvm_s390_create_vm_valid(void) diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c index 55bf52e588be..4968c4106254 100644 --- a/tools/perf/tests/parse-metric.c +++ b/tools/perf/tests/parse-metric.c @@ -186,7 +186,7 @@ static int __compute_metric(const char *name, struct value *vals, *ratio2 = compute_single(&metric_events, evlist, &st, name2); out: - /* ... clenup. */ + /* ... cleanup. */ metricgroup__rblist_exit(&metric_events); runtime_stat__exit(&st); evlist__free_stats(evlist); diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 74748ed75b2c..050489807a47 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -80,7 +80,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) * CPU 1 is on core_id 1 and physical_package_id 3 * * Core_id and physical_package_id are platform and architecture - * dependend and might have higher numbers than the CPU id. + * dependent and might have higher numbers than the CPU id. * This actually depends on the configuration. * * In this case process_cpu_topology() prints error message: diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 385894b4a8bb..b8fc5c53ba6f 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -85,7 +85,7 @@ struct mmsghdr { /* * POSIX 1003.1g - ancillary data object information - * Ancillary data consits of a sequence of pairs of + * Ancillary data consists of a sequence of pairs of * (cmsghdr, cmsg_data[]) */ diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 9f75d767b6da..ad0a70f0edaf 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -415,7 +415,7 @@ static int sym_title(struct symbol *sym, struct map *map, char *title, } /* - * This can be called from external jumps, i.e. jumps from one functon + * This can be called from external jumps, i.e. jumps from one function * to another, like from the kernel's entry_SYSCALL_64 function to the * swapgs_restore_regs_and_return_to_usermode() function. * diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 3b9818ee9546..bcfd0a45953b 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -117,7 +117,7 @@ static void hist_browser__update_rows(struct hist_browser *hb) browser->rows -= browser->extra_title_lines; /* * Verify if we were at the last line and that line isn't - * visibe because we now show the header line(s). + * visible because we now show the header line(s). */ index_row = browser->index - browser->top_idx; if (index_row >= browser->rows) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 9087f1bffd3d..fbb3c4057c30 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -671,7 +671,7 @@ int bpf__probe(struct bpf_object *obj) * After probing, let's consider prologue, which * adds program fetcher to BPF programs. * - * hook_load_preprocessorr() hooks pre-processor + * hook_load_preprocessor() hooks pre-processor * to bpf_program, let it generate prologue * dynamically during loading. */ diff --git a/tools/perf/util/call-path.h b/tools/perf/util/call-path.h index 6b3229106f16..5875cfc8106e 100644 --- a/tools/perf/util/call-path.h +++ b/tools/perf/util/call-path.h @@ -23,7 +23,7 @@ * @children: tree of call paths of functions called * * In combination with the call_return structure, the call_path structure - * defines a context-sensitve call-graph. + * defines a context-sensitive call-graph. */ struct call_path { struct call_path *parent; diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 1b60985690bb..8e2777133bd9 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -877,7 +877,7 @@ append_chain_children(struct callchain_node *root, if (!node) return -1; - /* lookup in childrens */ + /* lookup in children */ while (*p) { enum match_result ret; diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 6984c77068a3..2daeaa9a4a24 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -699,7 +699,7 @@ static int collect_config(const char *var, const char *value, /* perf_config_set can contain both user and system config items. * So we should know where each value is from. * The classification would be needed when a particular config file - * is overwrited by setting feature i.e. set_config(). + * is overwritten by setting feature i.e. set_config(). */ if (strcmp(config_file_name, perf_etc_perfconfig()) == 0) { section->from_system_config = true; diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c index 4052c9ce6e2f..059bcec3f651 100644 --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c @@ -317,7 +317,7 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq, * This is the first timestamp we've seen since the beginning of traces * or a discontinuity. Since timestamps packets are generated *after* * range packets have been generated, we need to estimate the time at - * which instructions started by substracting the number of instructions + * which instructions started by subtracting the number of instructions * executed to the timestamp. */ packet_queue->timestamp = elem->timestamp - packet_queue->instr_count; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index 9ac80fc23c58..7e63e7dedc33 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -202,7 +202,7 @@ void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq, u8 trace_chan_id) { /* - * Wnen a timestamp packet is encountered the backend code + * When a timestamp packet is encountered the backend code * is stopped so that the front end has time to process packets * that were accumulated in the traceID queue. Since there can * be more than one channel per cs_etm_queue, we need to specify @@ -1697,7 +1697,7 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id, * | 1 1 0 1 1 1 1 1 | imm8 | * +-----------------+--------+ * - * According to the specifiction, it only defines SVC for T32 + * According to the specification, it only defines SVC for T32 * with 16 bits instruction and has no definition for 32bits; * so below only read 2 bytes as instruction size for T32. */ @@ -1929,7 +1929,7 @@ static int cs_etm__set_sample_flags(struct cs_etm_queue *etmq, /* * If the previous packet is an exception return packet - * and the return address just follows SVC instuction, + * and the return address just follows SVC instruction, * it needs to calibrate the previous packet sample flags * as PERF_IP_FLAG_SYSCALLRET. */ @@ -2003,7 +2003,7 @@ static int cs_etm__set_sample_flags(struct cs_etm_queue *etmq, * contain exception type related info so we cannot decide * the exception type purely based on exception return packet. * If we record the exception number from exception packet and - * reuse it for excpetion return packet, this is not reliable + * reuse it for exception return packet, this is not reliable * due the trace can be discontinuity or the interrupt can * be nested, thus the recorded exception number cannot be * used for exception return packet for these two cases. diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index 85ed11e9d2a7..36428918411e 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -12,7 +12,8 @@ struct perf_session; -/* Versionning header in case things need tro change in the future. That way +/* + * Versioning header in case things need to change in the future. That way * decoding of old snapshot is still possible. */ enum { @@ -77,7 +78,7 @@ enum { /* * ETMv3 exception encoding number: - * See Embedded Trace Macrocell spcification (ARM IHI 0014Q) + * See Embedded Trace Macrocell specification (ARM IHI 0014Q) * table 7-12 Encoding of Exception[3:0] for non-ARMv7-M processors. */ enum { diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 8b67bd97d122..a9c375e63e73 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -949,7 +949,7 @@ static char *change_name(char *name, char *orig_name, int dup) /* * Add '_' prefix to potential keywork. According to * Mathieu Desnoyers (https://lore.kernel.org/lkml/1074266107.40857.1422045946295.JavaMail.zimbra@efficios.com), - * futher CTF spec updating may require us to use '$'. + * further CTF spec updating may require us to use '$'. */ if (dup < 0) len = strlen(name) + sizeof("_"); diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c index 39c05200ed65..ddf33d58bcd3 100644 --- a/tools/perf/util/demangle-java.c +++ b/tools/perf/util/demangle-java.c @@ -147,7 +147,7 @@ error: * Demangle Java function signature (openJDK, not GCJ) * input: * str: string to parse. String is not modified - * flags: comobination of JAVA_DEMANGLE_* flags to modify demangling + * flags: combination of JAVA_DEMANGLE_* flags to modify demangling * return: * if input can be demangled, then a newly allocated string is returned. * if input cannot be demangled, then NULL is returned @@ -164,7 +164,7 @@ java_demangle_sym(const char *str, int flags) if (!str) return NULL; - /* find start of retunr type */ + /* find start of return type */ p = strrchr(str, ')'); if (!p) return NULL; diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index cd2fe64a3c5d..52e7101c5609 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -216,7 +216,7 @@ struct dso { /* dso__for_each_symbol - iterate over the symbols of given type * - * @dso: the 'struct dso *' in which symbols itereated + * @dso: the 'struct dso *' in which symbols are iterated * @pos: the 'struct symbol *' to use as a loop cursor * @n: the 'struct rb_node *' to use as a temporary storage */ diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 7b2d471a6419..b2f4920e19a6 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -91,7 +91,7 @@ static Dwarf_Line *cu_getsrc_die(Dwarf_Die *cu_die, Dwarf_Addr addr) return NULL; } while (laddr == addr); l++; - /* Going foward to find the statement line */ + /* Going forward to find the statement line */ do { line = dwarf_onesrcline(lines, l++); if (!line || dwarf_lineaddr(line, &laddr) != 0 || @@ -177,7 +177,7 @@ int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr, * die_get_linkage_name - Get the linkage name of the object * @dw_die: A DIE of the object * - * Get the linkage name attiribute of given @dw_die. + * Get the linkage name attribute of given @dw_die. * For C++ binary, the linkage name will be the mangled symbol. */ const char *die_get_linkage_name(Dwarf_Die *dw_die) @@ -739,7 +739,7 @@ static int __die_walk_instances_cb(Dwarf_Die *inst, void *data) * @data: user data * * Walk on the instances of give @in_die. @in_die must be an inlined function - * declartion. This returns the return value of @callback if it returns + * declaration. This returns the return value of @callback if it returns * non-zero value, or -ENOENT if there is no instance. */ int die_walk_instances(Dwarf_Die *or_die, int (*callback)(Dwarf_Die *, void *), diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index 506006e0cf66..cb99646843a9 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -22,7 +22,7 @@ const char *cu_get_comp_dir(Dwarf_Die *cu_die); int cu_find_lineinfo(Dwarf_Die *cudie, unsigned long addr, const char **fname, int *lineno); -/* Walk on funcitons at given address */ +/* Walk on functions at given address */ int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr, int (*callback)(Dwarf_Die *, void *), void *data); diff --git a/tools/perf/util/events_stats.h b/tools/perf/util/events_stats.h index 859cb34fcff2..631a4af2ed86 100644 --- a/tools/perf/util/events_stats.h +++ b/tools/perf/util/events_stats.h @@ -21,7 +21,7 @@ * all struct perf_record_lost_samples.lost fields reported. * * The total_period is needed because by default auto-freq is used, so - * multipling nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get + * multiplying nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get * the total number of low level events, it is necessary to to sum all struct * perf_record_sample.period and stash the result in total_period. */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 435bbfd00551..f1c79ecf8107 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1210,7 +1210,7 @@ bool evlist__valid_read_format(struct evlist *evlist) } } - /* PERF_SAMPLE_READ imples PERF_FORMAT_ID. */ + /* PERF_SAMPLE_READ implies PERF_FORMAT_ID. */ if ((sample_type & PERF_SAMPLE_READ) && !(read_format & PERF_FORMAT_ID)) { return false; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 7ecbc8e2fbfa..2d2614eeaa20 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -621,7 +621,7 @@ const char *evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX][EVSEL__MAX_AL #define COP(x) (1 << x) /* - * cache operartion stat + * cache operation stat * L1I : Read and prefetch only * ITLB and BPU : Read-only */ @@ -2275,7 +2275,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, /* * Undo swap of u64, then swap on individual u32s, * get the size of the raw area and undo all of the - * swap. The pevent interface handles endianity by + * swap. The pevent interface handles endianness by * itself. */ if (swapped) { diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h index dcf8d19b83c8..85df3e4771e4 100644 --- a/tools/perf/util/expr.h +++ b/tools/perf/util/expr.h @@ -3,7 +3,7 @@ #define PARSE_CTX_H 1 // There are fixes that need to land upstream before we can use libbpf's headers, -// for now use our copy uncoditionally, since the data structures at this point +// for now use our copy unconditionally, since the data structures at this point // are exactly the same, no problem. //#ifdef HAVE_LIBBPF_SUPPORT //#include diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 20effdff76ce..aa1e42518d37 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -127,7 +127,7 @@ static int __do_write_buf(struct feat_fd *ff, const void *buf, size_t size) return 0; } -/* Return: 0 if succeded, -ERR if failed. */ +/* Return: 0 if succeeded, -ERR if failed. */ int do_write(struct feat_fd *ff, const void *buf, size_t size) { if (!ff->buf) @@ -135,7 +135,7 @@ int do_write(struct feat_fd *ff, const void *buf, size_t size) return __do_write_buf(ff, buf, size); } -/* Return: 0 if succeded, -ERR if failed. */ +/* Return: 0 if succeeded, -ERR if failed. */ static int do_write_bitmap(struct feat_fd *ff, unsigned long *set, u64 size) { u64 *p = (u64 *) set; @@ -154,7 +154,7 @@ static int do_write_bitmap(struct feat_fd *ff, unsigned long *set, u64 size) return 0; } -/* Return: 0 if succeded, -ERR if failed. */ +/* Return: 0 if succeeded, -ERR if failed. */ int write_padded(struct feat_fd *ff, const void *bf, size_t count, size_t count_aligned) { @@ -170,7 +170,7 @@ int write_padded(struct feat_fd *ff, const void *bf, #define string_size(str) \ (PERF_ALIGN((strlen(str) + 1), NAME_ALIGN) + sizeof(u32)) -/* Return: 0 if succeded, -ERR if failed. */ +/* Return: 0 if succeeded, -ERR if failed. */ static int do_write_string(struct feat_fd *ff, const char *str) { u32 len, olen; @@ -266,7 +266,7 @@ static char *do_read_string(struct feat_fd *ff) return NULL; } -/* Return: 0 if succeded, -ERR if failed. */ +/* Return: 0 if succeeded, -ERR if failed. */ static int do_read_bitmap(struct feat_fd *ff, unsigned long **pset, u64 *psize) { unsigned long *set; @@ -2874,7 +2874,7 @@ static int process_bpf_prog_info(struct feat_fd *ff, void *data __maybe_unused) int err = -1; if (ff->ph->needs_swap) { - pr_warning("interpreting bpf_prog_info from systems with endianity is not yet supported\n"); + pr_warning("interpreting bpf_prog_info from systems with endianness is not yet supported\n"); return 0; } @@ -2942,7 +2942,7 @@ static int process_bpf_btf(struct feat_fd *ff, void *data __maybe_unused) int err = -1; if (ff->ph->needs_swap) { - pr_warning("interpreting btf from systems with endianity is not yet supported\n"); + pr_warning("interpreting btf from systems with endianness is not yet supported\n"); return 0; } @@ -3481,11 +3481,11 @@ static const size_t attr_pipe_abi_sizes[] = { }; /* - * In the legacy pipe format, there is an implicit assumption that endiannesss + * In the legacy pipe format, there is an implicit assumption that endianness * between host recording the samples, and host parsing the samples is the * same. This is not always the case given that the pipe output may always be * redirected into a file and analyzed on a different machine with possibly a - * different endianness and perf_event ABI revsions in the perf tool itself. + * different endianness and perf_event ABI revisions in the perf tool itself. */ static int try_all_pipe_abis(uint64_t hdr_sz, struct perf_header *ph) { diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index f6e28ac231b7..8658d42ce57a 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -3569,7 +3569,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event, /* * Since this thread will not be kept in any rbtree not in a * list, initialize its list node so that at thread__put() the - * current thread lifetime assuption is kept and we don't segfault + * current thread lifetime assumption is kept and we don't segfault * at list_del_init(). */ INIT_LIST_HEAD(&pt->unknown_thread->node); diff --git a/tools/perf/util/levenshtein.c b/tools/perf/util/levenshtein.c index a217ecf0359d..6a6712635aa4 100644 --- a/tools/perf/util/levenshtein.c +++ b/tools/perf/util/levenshtein.c @@ -30,7 +30,7 @@ * * It does so by calculating the costs of the path ending in characters * i (in string1) and j (in string2), respectively, given that the last - * operation is a substition, a swap, a deletion, or an insertion. + * operation is a substitution, a swap, a deletion, or an insertion. * * This implementation allows the costs to be weighted: * diff --git a/tools/perf/util/libunwind/arm64.c b/tools/perf/util/libunwind/arm64.c index 6b4e5a0892f8..c397be0c2e32 100644 --- a/tools/perf/util/libunwind/arm64.c +++ b/tools/perf/util/libunwind/arm64.c @@ -4,7 +4,7 @@ * generic one. * * The function 'LIBUNWIND__ARCH_REG_ID' name is set according to arch - * name and the defination of this function is included directly from + * name and the definition of this function is included directly from * 'arch/arm64/util/unwind-libunwind.c', to make sure that this function * is defined no matter what arch the host is. * diff --git a/tools/perf/util/libunwind/x86_32.c b/tools/perf/util/libunwind/x86_32.c index 21c216c40a3b..b2b92d030aef 100644 --- a/tools/perf/util/libunwind/x86_32.c +++ b/tools/perf/util/libunwind/x86_32.c @@ -4,7 +4,7 @@ * generic one. * * The function 'LIBUNWIND__ARCH_REG_ID' name is set according to arch - * name and the defination of this function is included directly from + * name and the definition of this function is included directly from * 'arch/x86/util/unwind-libunwind.c', to make sure that this function * is defined no matter what arch the host is. * diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index dbdffb6673fe..3ceaf7ef3301 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -471,7 +471,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf, /* * This is an optional work. Even it fail we can continue our - * work. Needn't to check error return. + * work. Needn't check error return. */ llvm__get_kbuild_opts(&kbuild_dir, &kbuild_include_opts); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 435771eed62b..3ff4936a15a4 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -905,7 +905,7 @@ static struct map *machine__addnew_module_map(struct machine *machine, u64 start maps__insert(&machine->kmaps, map); - /* Put the map here because maps__insert alread got it */ + /* Put the map here because maps__insert already got it */ map__put(map); out: /* put the dso here, corresponding to machine__findnew_module_dso */ @@ -1952,7 +1952,7 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event * maps because that is what the kernel just did. * * But when synthesizing, this should not be done. If we do, we end up - * with overlapping maps as we process the sythesized MMAP2 events that + * with overlapping maps as we process the synthesized MMAP2 events that * get delivered shortly thereafter. * * Use the FORK event misc flags in an internal way to signal this @@ -2518,7 +2518,7 @@ static bool has_stitched_lbr(struct thread *thread, /* * Check if there are identical LBRs between two samples. - * Identicall LBRs must have same from, to and flags values. Also, + * Identical LBRs must have same from, to and flags values. Also, * they have to be saved in the same LBR registers (same physical * index). * @@ -2588,7 +2588,7 @@ err: } /* - * Recolve LBR callstack chain sample + * Resolve LBR callstack chain sample * Return: * 1 on success get LBR callchain information * 0 no available LBR callchain information, should try fp diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 9f32825c98d8..d32f5b28c1fb 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -75,7 +75,7 @@ struct thread; /* map__for_each_symbol - iterate over the symbols in the given map * - * @map: the 'struct map *' in which symbols itereated + * @map: the 'struct map *' in which symbols are iterated * @pos: the 'struct symbol *' to use as a loop cursor * @n: the 'struct rb_node *' to use as a temporary storage * Note: caller must ensure map->dso is not NULL (map is loaded). @@ -86,7 +86,7 @@ struct thread; /* map__for_each_symbol_with_name - iterate over the symbols in the given map * that have the given name * - * @map: the 'struct map *' in which symbols itereated + * @map: the 'struct map *' in which symbols are iterated * @sym_name: the symbol name * @pos: the 'struct symbol *' to use as a loop cursor */ diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 755cef7e0625..63dd383b6ce2 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -81,7 +81,7 @@ struct c2c_stats { u32 rmt_dram; /* count of loads miss to remote DRAM */ u32 blk_data; /* count of loads blocked by data */ u32 blk_addr; /* count of loads blocked by address conflict */ - u32 nomap; /* count of load/stores with no phys adrs */ + u32 nomap; /* count of load/stores with no phys addrs */ u32 noparse; /* count of unparsable data sources */ }; diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 26c990e32378..6acb44ad439b 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -181,7 +181,7 @@ static bool evsel_same_pmu(struct evsel *ev1, struct evsel *ev2) * @pctx: the parse context for the metric expression. * @metric_no_merge: don't attempt to share events for the metric with other * metrics. - * @has_constraint: is there a contraint on the group of events? In which case + * @has_constraint: is there a constraint on the group of events? In which case * the events won't be grouped. * @metric_events: out argument, null terminated array of evsel's associated * with the metric. diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 42c84adeb2fb..9ecb45bea948 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -843,9 +843,9 @@ split_bpf_config_terms(struct list_head *evt_head_config, struct parse_events_term *term, *temp; /* - * Currectly, all possible user config term + * Currently, all possible user config term * belong to bpf object. parse_events__is_hardcoded_term() - * happends to be a good flag. + * happens to be a good flag. * * See parse_events_config_bpf() and * config_term_tracepoint(). @@ -895,7 +895,7 @@ int parse_events_load_bpf(struct parse_events_state *parse_state, /* * Caller doesn't know anything about obj_head_config, - * so combine them together again before returnning. + * so combine them together again before returning. */ if (head_config) list_splice_tail(&obj_head_config, head_config); @@ -1182,10 +1182,10 @@ do { \ } /* - * Check term availbility after basic checking so + * Check term availability after basic checking so * PARSE_EVENTS__TERM_TYPE_USER can be found and filtered. * - * If check availbility at the entry of this function, + * If check availability at the entry of this function, * user will see "'' is not usable in 'perf stat'" * if an invalid config term is provided for legacy events * (for example, instructions/badterm/...), which is confusing. diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 44ef28302fc7..88da5cf6aee8 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1069,7 +1069,7 @@ int perf_pmu__format_type(struct list_head *formats, const char *name) /* * Sets value based on the format definition (format parameter) - * and unformated value (value parameter). + * and unformatted value (value parameter). */ static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, bool zero) @@ -1408,7 +1408,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms, } /* - * if no unit or scale foundin aliases, then + * if no unit or scale found in aliases, then * set defaults as for evsel * unit cannot left to NULL */ diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index a9cff3a50ddf..a78c8d59a555 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -3228,7 +3228,7 @@ errout: return err; } -/* Concatinate two arrays */ +/* Concatenate two arrays */ static void *memcat(void *a, size_t sz_a, void *b, size_t sz_b) { void *ret; @@ -3258,7 +3258,7 @@ concat_probe_trace_events(struct probe_trace_event **tevs, int *ntevs, if (*ntevs + ntevs2 > probe_conf.max_probes) ret = -E2BIG; else { - /* Concatinate the array of probe_trace_event */ + /* Concatenate the array of probe_trace_event */ new_tevs = memcat(*tevs, (*ntevs) * sizeof(**tevs), *tevs2, ntevs2 * sizeof(**tevs2)); if (!new_tevs) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 1b118c9c86a6..866f2d514d72 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -164,7 +164,7 @@ static struct probe_trace_arg_ref *alloc_trace_arg_ref(long offs) /* * Convert a location into trace_arg. * If tvar == NULL, this just checks variable can be converted. - * If fentry == true and vr_die is a parameter, do huristic search + * If fentry == true and vr_die is a parameter, do heuristic search * for the location fuzzed by function entry mcount. */ static int convert_variable_location(Dwarf_Die *vr_die, Dwarf_Addr addr, @@ -498,7 +498,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname, " nor array.\n", varname); return -EINVAL; } - /* While prcessing unnamed field, we don't care about this */ + /* While processing unnamed field, we don't care about this */ if (field->ref && dwarf_diename(vr_die)) { pr_err("Semantic error: %s must be referred by '.'\n", field->name); @@ -1832,7 +1832,7 @@ static int line_range_walk_cb(const char *fname, int lineno, (lf->lno_s > lineno || lf->lno_e < lineno)) return 0; - /* Make sure this line can be reversable */ + /* Make sure this line can be reversible */ if (cu_find_lineinfo(&lf->cu_die, addr, &__fname, &__lineno) > 0 && (lineno != __lineno || strcmp(fname, __fname))) return 0; diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c index 078a71773565..8130b56aa04b 100644 --- a/tools/perf/util/s390-cpumsf.c +++ b/tools/perf/util/s390-cpumsf.c @@ -45,7 +45,7 @@ * the data portion is mmap()'ed. * * To sort the queues in chronological order, all queue access is controlled - * by the auxtrace_heap. This is basicly a stack, each stack element has two + * by the auxtrace_heap. This is basically a stack, each stack element has two * entries, the queue number and a time stamp. However the stack is sorted by * the time stamps. The highest time stamp is at the bottom the lowest * (nearest) time stamp is at the top. That sort order is maintained at all @@ -65,11 +65,11 @@ * stamp of the last processed entry of the auxtrace_buffer replaces the * current auxtrace_heap top. * - * 3. Auxtrace_queues might run of out data and are feeded by the + * 3. Auxtrace_queues might run of out data and are fed by the * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event(). * * Event Generation - * Each sampling-data entry in the auxilary trace data generates a perf sample. + * Each sampling-data entry in the auxiliary trace data generates a perf sample. * This sample is filled * with data from the auxtrace such as PID/TID, instruction address, CPU state, * etc. This sample is processed with perf_session__deliver_synth_event() to @@ -575,7 +575,7 @@ static unsigned long long get_trailer_time(const unsigned char *buf) * pointer to the queue, the second parameter is the time stamp. This * is the time stamp: * - of the event that triggered this processing. - * - or the time stamp when the last proccesing of this queue stopped. + * - or the time stamp when the last processing of this queue stopped. * In this case it stopped at a 4KB page boundary and record the * position on where to continue processing on the next invocation * (see buffer->use_data and buffer->use_size). @@ -640,7 +640,7 @@ static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts) goto out; } - pos += dsdes; /* Skip diagnositic entry */ + pos += dsdes; /* Skip diagnostic entry */ /* Check for trailer entry */ if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) { diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index c83c2c6564e0..4e4aa4c97ac5 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1531,7 +1531,7 @@ static void set_table_handlers(struct tables *tables) * Attempt to use the call path root from the call return * processor, if the call return processor is in use. Otherwise, * we allocate a new call path root. This prevents exporting - * duplicate call path ids when both are in use simultaniously. + * duplicate call path ids when both are in use simultaneously. */ if (tables->dbe.crp) tables->dbe.cpr = tables->dbe.crp->cpr; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 859832a82496..9a8808507bd9 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1069,7 +1069,7 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample) * in "to" register. * For example, there is a call stack * "A"->"B"->"C"->"D". - * The LBR registers will recorde like + * The LBR registers will be recorded like * "C"->"D", "B"->"C", "A"->"B". * So only the first "to" register and all "from" * registers are needed to construct the whole stack. @@ -1584,7 +1584,7 @@ static s64 perf_session__process_user_event(struct perf_session *session, return tool->event_update(tool, event, &session->evlist); case PERF_RECORD_HEADER_EVENT_TYPE: /* - * Depreceated, but we need to handle it for sake + * Deprecated, but we need to handle it for sake * of old data files create in pipe mode. */ return 0; diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h index ea94d8628980..be94d7046fa0 100644 --- a/tools/perf/util/strbuf.h +++ b/tools/perf/util/strbuf.h @@ -12,7 +12,7 @@ * build complex strings/buffers whose final size isn't easily known. * * It is NOT legal to copy the ->buf pointer away. - * `strbuf_detach' is the operation that detachs a buffer from its shell + * `strbuf_detach' is the operation that detaches a buffer from its shell * while keeping the shell valid wrt its invariants. * * 2. the ->buf member is a byte array that has at least ->len + 1 bytes diff --git a/tools/perf/util/strfilter.h b/tools/perf/util/strfilter.h index e0c25a40f796..c05aca9ca582 100644 --- a/tools/perf/util/strfilter.h +++ b/tools/perf/util/strfilter.h @@ -8,8 +8,8 @@ /* A node of string filter */ struct strfilter_node { - struct strfilter_node *l; /* Tree left branche (for &,|) */ - struct strfilter_node *r; /* Tree right branche (for !,&,|) */ + struct strfilter_node *l; /* Tree left branch (for &,|) */ + struct strfilter_node *r; /* Tree right branch (for !,&,|) */ const char *p; /* Operator or rule */ }; diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 6dff843fd883..4c56aa837434 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -1058,7 +1058,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map, curr_dso->symtab_type = dso->symtab_type; maps__insert(kmaps, curr_map); /* - * Add it before we drop the referece to curr_map, i.e. while + * Add it before we drop the reference to curr_map, i.e. while * we still are sure to have a reference to this DSO via * *curr_map->dso. */ diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index b698046ec2db..49c9353ff9fc 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1210,7 +1210,7 @@ static size_t mask_size(struct perf_cpu_map *map, int *max) *max = 0; for (i = 0; i < map->nr; i++) { - /* bit possition of the cpu is + 1 */ + /* bit position of the cpu is + 1 */ int bit = map->map[i] + 1; if (bit > *max) @@ -1236,7 +1236,7 @@ void *cpu_map_data__alloc(struct perf_cpu_map *map, size_t *size, u16 *type, int * mask = size of 'struct perf_record_record_cpu_map' + * maximum cpu bit converted to size of longs * - * and finaly + the size of 'struct perf_record_cpu_map_data'. + * and finally + the size of 'struct perf_record_cpu_map_data'. */ size_cpus = cpus_size(map); size_mask = mask_size(map, max); diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c index 9aededc0bc06..71a353349181 100644 --- a/tools/perf/util/unwind-libunwind-local.c +++ b/tools/perf/util/unwind-libunwind-local.c @@ -82,7 +82,7 @@ UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug, #define DW_EH_PE_funcrel 0x40 /* start-of-procedure-relative */ #define DW_EH_PE_aligned 0x50 /* aligned pointer */ -/* Flags intentionaly not handled, since they're not needed: +/* Flags intentionally not handled, since they're not needed: * #define DW_EH_PE_indirect 0x80 * #define DW_EH_PE_uleb128 0x01 * #define DW_EH_PE_udata2 0x02 -- cgit v1.2.3 From 7fac83aaf2eecc9e7e7b72da694c49bb4ce7fdfc Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 16 Mar 2021 14:18:35 -0700 Subject: perf stat: Introduce 'bperf' to share hardware PMCs with BPF The perf tool uses performance monitoring counters (PMCs) to monitor system performance. The PMCs are limited hardware resources. For example, Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu. Modern data center systems use these PMCs in many different ways: system level monitoring, (maybe nested) container level monitoring, per process monitoring, profiling (in sample mode), etc. In some cases, there are more active perf_events than available hardware PMCs. To allow all perf_events to have a chance to run, it is necessary to do expensive time multiplexing of events. On the other hand, many monitoring tools count the common metrics (cycles, instructions). It is a waste to have multiple tools create multiple perf_events of "cycles" and occupy multiple PMCs. bperf tries to reduce such wastes by allowing multiple perf_events of "cycles" or "instructions" (at different scopes) to share PMUs. Instead of having each perf-stat session to read its own perf_events, bperf uses BPF programs to read the perf_events and aggregate readings to BPF maps. Then, the perf-stat session(s) reads the values from these BPF maps. Please refer to the comment before the definition of bperf_ops for the description of bperf architecture. bperf is off by default. To enable it, pass --bpf-counters option to perf-stat. bperf uses a BPF hashmap to share information about BPF programs and maps used by bperf. This map is pinned to bpffs. The default path is /sys/fs/bpf/perf_attr_map. The user could change the path with option --bpf-attr-map. Committer testing: # dmesg|grep "Performance Events" -A5 [ 0.225277] Performance Events: Fam17h+ core perfctr, AMD PMU driver. [ 0.225280] ... version: 0 [ 0.225280] ... bit width: 48 [ 0.225281] ... generic registers: 6 [ 0.225281] ... value mask: 0000ffffffffffff [ 0.225281] ... max period: 00007fffffffffff # # for a in $(seq 6) ; do perf stat -a -e cycles,instructions sleep 100000 & done [1] 2436231 [2] 2436232 [3] 2436233 [4] 2436234 [5] 2436235 [6] 2436236 # perf stat -a -e cycles,instructions sleep 0.1 Performance counter stats for 'system wide': 310,326,987 cycles (41.87%) 236,143,290 instructions # 0.76 insn per cycle (41.87%) 0.100800885 seconds time elapsed # We can see that the counters were enabled for this workload 41.87% of the time. Now with --bpf-counters: # for a in $(seq 32) ; do perf stat --bpf-counters -a -e cycles,instructions sleep 100000 & done [1] 2436514 [2] 2436515 [3] 2436516 [4] 2436517 [5] 2436518 [6] 2436519 [7] 2436520 [8] 2436521 [9] 2436522 [10] 2436523 [11] 2436524 [12] 2436525 [13] 2436526 [14] 2436527 [15] 2436528 [16] 2436529 [17] 2436530 [18] 2436531 [19] 2436532 [20] 2436533 [21] 2436534 [22] 2436535 [23] 2436536 [24] 2436537 [25] 2436538 [26] 2436539 [27] 2436540 [28] 2436541 [29] 2436542 [30] 2436543 [31] 2436544 [32] 2436545 # # ls -la /sys/fs/bpf/perf_attr_map -rw-------. 1 root root 0 Mar 23 14:53 /sys/fs/bpf/perf_attr_map # bpftool map | grep bperf | wc -l 64 # # bpftool map | tail 1265: percpu_array name accum_readings flags 0x0 key 4B value 24B max_entries 1 memlock 4096B 1266: hash name filter flags 0x0 key 4B value 4B max_entries 1 memlock 4096B 1267: array name bperf_fo.bss flags 0x400 key 4B value 8B max_entries 1 memlock 4096B btf_id 996 pids perf(2436545) 1268: percpu_array name accum_readings flags 0x0 key 4B value 24B max_entries 1 memlock 4096B 1269: hash name filter flags 0x0 key 4B value 4B max_entries 1 memlock 4096B 1270: array name bperf_fo.bss flags 0x400 key 4B value 8B max_entries 1 memlock 4096B btf_id 997 pids perf(2436541) 1285: array name pid_iter.rodata flags 0x480 key 4B value 4B max_entries 1 memlock 4096B btf_id 1017 frozen pids bpftool(2437504) 1286: array flags 0x0 key 4B value 32B max_entries 1 memlock 4096B # # bpftool map dump id 1268 | tail value (CPU 21): 8f f3 bc ca 00 00 00 00 80 fd 2a d1 4d 00 00 00 80 fd 2a d1 4d 00 00 00 value (CPU 22): 7e d5 64 4d 00 00 00 00 a4 8a 2e ee 4d 00 00 00 a4 8a 2e ee 4d 00 00 00 value (CPU 23): a7 78 3e 06 01 00 00 00 b2 34 94 f6 4d 00 00 00 b2 34 94 f6 4d 00 00 00 Found 1 element # bpftool map dump id 1268 | tail value (CPU 21): c6 8b d9 ca 00 00 00 00 20 c6 fc 83 4e 00 00 00 20 c6 fc 83 4e 00 00 00 value (CPU 22): 9c b4 d2 4d 00 00 00 00 3e 0c df 89 4e 00 00 00 3e 0c df 89 4e 00 00 00 value (CPU 23): 18 43 66 06 01 00 00 00 5b 69 ed 83 4e 00 00 00 5b 69 ed 83 4e 00 00 00 Found 1 element # bpftool map dump id 1268 | tail value (CPU 21): f2 6e db ca 00 00 00 00 92 67 4c ba 4e 00 00 00 92 67 4c ba 4e 00 00 00 value (CPU 22): dc 8e e1 4d 00 00 00 00 d9 32 7a c5 4e 00 00 00 d9 32 7a c5 4e 00 00 00 value (CPU 23): bd 2b 73 06 01 00 00 00 7c 73 87 bf 4e 00 00 00 7c 73 87 bf 4e 00 00 00 Found 1 element # # perf stat --bpf-counters -a -e cycles,instructions sleep 0.1 Performance counter stats for 'system wide': 119,410,122 cycles 152,105,479 instructions # 1.27 insn per cycle 0.101395093 seconds time elapsed # See? We had the counters enabled all the time. Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: kernel-team@fb.com Link: http://lore.kernel.org/lkml/20210316211837.910506-2-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 11 + tools/perf/Makefile.perf | 1 + tools/perf/builtin-stat.c | 10 + tools/perf/util/bpf_counter.c | 519 +++++++++++++++++++++++++- tools/perf/util/bpf_skel/bperf.h | 14 + tools/perf/util/bpf_skel/bperf_follower.bpf.c | 69 ++++ tools/perf/util/bpf_skel/bperf_leader.bpf.c | 46 +++ tools/perf/util/bpf_skel/bperf_u.h | 14 + tools/perf/util/evsel.h | 20 +- tools/perf/util/target.h | 4 +- 10 files changed, 701 insertions(+), 7 deletions(-) create mode 100644 tools/perf/util/bpf_skel/bperf.h create mode 100644 tools/perf/util/bpf_skel/bperf_follower.bpf.c create mode 100644 tools/perf/util/bpf_skel/bperf_leader.bpf.c create mode 100644 tools/perf/util/bpf_skel/bperf_u.h (limited to 'tools') diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 3055aad38d46..744211fa8c18 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -93,6 +93,17 @@ report:: 1.102235068 seconds time elapsed +--bpf-counters:: + Use BPF programs to aggregate readings from perf_events. This + allows multiple perf-stat sessions that are counting the same metric (cycles, + instructions, etc.) to share hardware counters. + +--bpf-attr-map:: + With option "--bpf-counters", different perf-stat sessions share + information about shared BPF programs and maps via a pinned hashmap. + Use "--bpf-attr-map" to specify the path of this pinned hashmap. + The default path is /sys/fs/bpf/perf_attr_map. + ifdef::HAVE_LIBPFM[] --pfm-events events:: Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index a7d768fdc8a1..090fb9d62665 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -1007,6 +1007,7 @@ python-clean: SKEL_OUT := $(abspath $(OUTPUT)util/bpf_skel) SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp) SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h +SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h ifdef BUILD_BPF_SKEL BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d140ffbb5b34..d34e22dce2c8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -792,6 +792,12 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) } evlist__for_each_cpu (evsel_list, i, cpu) { + /* + * bperf calls evsel__open_per_cpu() in bperf__load(), so + * no need to call it again here. + */ + if (target.use_bpf) + break; affinity__set(&affinity, cpu); evlist__for_each_entry(evsel_list, counter) { @@ -1146,6 +1152,10 @@ static struct option stat_options[] = { #ifdef HAVE_BPF_SKEL OPT_STRING('b', "bpf-prog", &target.bpf_str, "bpf-prog-id", "stat events on existing bpf program id"), + OPT_BOOLEAN(0, "bpf-counters", &target.use_bpf, + "use bpf program to count events"), + OPT_STRING(0, "bpf-attr-map", &target.attr_map, "attr-map-path", + "path to perf_event_attr map"), #endif OPT_BOOLEAN('a', "all-cpus", &target.system_wide, "system-wide collection from all CPUs"), diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index 04f89120b323..81d1df3c4ec0 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -12,14 +13,45 @@ #include #include #include +#include #include "bpf_counter.h" #include "counts.h" #include "debug.h" #include "evsel.h" +#include "evlist.h" #include "target.h" +#include "cpumap.h" +#include "thread_map.h" #include "bpf_skel/bpf_prog_profiler.skel.h" +#include "bpf_skel/bperf_u.h" +#include "bpf_skel/bperf_leader.skel.h" +#include "bpf_skel/bperf_follower.skel.h" + +/* + * bperf uses a hashmap, the attr_map, to track all the leader programs. + * The hashmap is pinned in bpffs. flock() on this file is used to ensure + * no concurrent access to the attr_map. The key of attr_map is struct + * perf_event_attr, and the value is struct perf_event_attr_map_entry. + * + * struct perf_event_attr_map_entry contains two __u32 IDs, bpf_link of the + * leader prog, and the diff_map. Each perf-stat session holds a reference + * to the bpf_link to make sure the leader prog is attached to sched_switch + * tracepoint. + * + * Since the hashmap only contains IDs of the bpf_link and diff_map, it + * does not hold any references to the leader program. Once all perf-stat + * sessions of these events exit, the leader prog, its maps, and the + * perf_events will be freed. + */ +struct perf_event_attr_map_entry { + __u32 link_id; + __u32 diff_map_id; +}; + +#define DEFAULT_ATTR_MAP_PATH "fs/bpf/perf_attr_map" +#define ATTR_MAP_SIZE 16 static inline void *u64_to_ptr(__u64 ptr) { @@ -274,17 +306,494 @@ struct bpf_counter_ops bpf_program_profiler_ops = { .install_pe = bpf_program_profiler__install_pe, }; +static __u32 bpf_link_get_id(int fd) +{ + struct bpf_link_info link_info = {0}; + __u32 link_info_len = sizeof(link_info); + + bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); + return link_info.id; +} + +static __u32 bpf_link_get_prog_id(int fd) +{ + struct bpf_link_info link_info = {0}; + __u32 link_info_len = sizeof(link_info); + + bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); + return link_info.prog_id; +} + +static __u32 bpf_map_get_id(int fd) +{ + struct bpf_map_info map_info = {0}; + __u32 map_info_len = sizeof(map_info); + + bpf_obj_get_info_by_fd(fd, &map_info, &map_info_len); + return map_info.id; +} + +static int bperf_lock_attr_map(struct target *target) +{ + char path[PATH_MAX]; + int map_fd, err; + + if (target->attr_map) { + scnprintf(path, PATH_MAX, "%s", target->attr_map); + } else { + scnprintf(path, PATH_MAX, "%s/%s", sysfs__mountpoint(), + DEFAULT_ATTR_MAP_PATH); + } + + if (access(path, F_OK)) { + map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, + sizeof(struct perf_event_attr), + sizeof(struct perf_event_attr_map_entry), + ATTR_MAP_SIZE, 0); + if (map_fd < 0) + return -1; + + err = bpf_obj_pin(map_fd, path); + if (err) { + /* someone pinned the map in parallel? */ + close(map_fd); + map_fd = bpf_obj_get(path); + if (map_fd < 0) + return -1; + } + } else { + map_fd = bpf_obj_get(path); + if (map_fd < 0) + return -1; + } + + err = flock(map_fd, LOCK_EX); + if (err) { + close(map_fd); + return -1; + } + return map_fd; +} + +/* trigger the leader program on a cpu */ +static int bperf_trigger_reading(int prog_fd, int cpu) +{ + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, + .ctx_in = NULL, + .ctx_size_in = 0, + .flags = BPF_F_TEST_RUN_ON_CPU, + .cpu = cpu, + .retval = 0, + ); + + return bpf_prog_test_run_opts(prog_fd, &opts); +} + +static int bperf_check_target(struct evsel *evsel, + struct target *target, + enum bperf_filter_type *filter_type, + __u32 *filter_entry_cnt) +{ + if (evsel->leader->core.nr_members > 1) { + pr_err("bpf managed perf events do not yet support groups.\n"); + return -1; + } + + /* determine filter type based on target */ + if (target->system_wide) { + *filter_type = BPERF_FILTER_GLOBAL; + *filter_entry_cnt = 1; + } else if (target->cpu_list) { + *filter_type = BPERF_FILTER_CPU; + *filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel)); + } else if (target->tid) { + *filter_type = BPERF_FILTER_PID; + *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads); + } else if (target->pid || evsel->evlist->workload.pid != -1) { + *filter_type = BPERF_FILTER_TGID; + *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads); + } else { + pr_err("bpf managed perf events do not yet support these targets.\n"); + return -1; + } + + return 0; +} + +static struct perf_cpu_map *all_cpu_map; + +static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd, + struct perf_event_attr_map_entry *entry) +{ + struct bperf_leader_bpf *skel = bperf_leader_bpf__open(); + int link_fd, diff_map_fd, err; + struct bpf_link *link = NULL; + + if (!skel) { + pr_err("Failed to open leader skeleton\n"); + return -1; + } + + bpf_map__resize(skel->maps.events, libbpf_num_possible_cpus()); + err = bperf_leader_bpf__load(skel); + if (err) { + pr_err("Failed to load leader skeleton\n"); + goto out; + } + + err = -1; + link = bpf_program__attach(skel->progs.on_switch); + if (!link) { + pr_err("Failed to attach leader program\n"); + goto out; + } + + link_fd = bpf_link__fd(link); + diff_map_fd = bpf_map__fd(skel->maps.diff_readings); + entry->link_id = bpf_link_get_id(link_fd); + entry->diff_map_id = bpf_map_get_id(diff_map_fd); + err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY); + assert(err == 0); + + evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id); + assert(evsel->bperf_leader_link_fd >= 0); + + /* + * save leader_skel for install_pe, which is called within + * following evsel__open_per_cpu call + */ + evsel->leader_skel = skel; + evsel__open_per_cpu(evsel, all_cpu_map, -1); + +out: + bperf_leader_bpf__destroy(skel); + bpf_link__destroy(link); + return err; +} + +static int bperf__load(struct evsel *evsel, struct target *target) +{ + struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff}; + int attr_map_fd, diff_map_fd = -1, err; + enum bperf_filter_type filter_type; + __u32 filter_entry_cnt, i; + + if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt)) + return -1; + + if (!all_cpu_map) { + all_cpu_map = perf_cpu_map__new(NULL); + if (!all_cpu_map) + return -1; + } + + evsel->bperf_leader_prog_fd = -1; + evsel->bperf_leader_link_fd = -1; + + /* + * Step 1: hold a fd on the leader program and the bpf_link, if + * the program is not already gone, reload the program. + * Use flock() to ensure exclusive access to the perf_event_attr + * map. + */ + attr_map_fd = bperf_lock_attr_map(target); + if (attr_map_fd < 0) { + pr_err("Failed to lock perf_event_attr map\n"); + return -1; + } + + err = bpf_map_lookup_elem(attr_map_fd, &evsel->core.attr, &entry); + if (err) { + err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, &entry, BPF_ANY); + if (err) + goto out; + } + + evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id); + if (evsel->bperf_leader_link_fd < 0 && + bperf_reload_leader_program(evsel, attr_map_fd, &entry)) + goto out; + + /* + * The bpf_link holds reference to the leader program, and the + * leader program holds reference to the maps. Therefore, if + * link_id is valid, diff_map_id should also be valid. + */ + evsel->bperf_leader_prog_fd = bpf_prog_get_fd_by_id( + bpf_link_get_prog_id(evsel->bperf_leader_link_fd)); + assert(evsel->bperf_leader_prog_fd >= 0); + + diff_map_fd = bpf_map_get_fd_by_id(entry.diff_map_id); + assert(diff_map_fd >= 0); + + /* + * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check + * whether the kernel support it + */ + err = bperf_trigger_reading(evsel->bperf_leader_prog_fd, 0); + if (err) { + pr_err("The kernel does not support test_run for raw_tp BPF programs.\n" + "Therefore, --use-bpf might show inaccurate readings\n"); + goto out; + } + + /* Step 2: load the follower skeleton */ + evsel->follower_skel = bperf_follower_bpf__open(); + if (!evsel->follower_skel) { + pr_err("Failed to open follower skeleton\n"); + goto out; + } + + /* attach fexit program to the leader program */ + bpf_program__set_attach_target(evsel->follower_skel->progs.fexit_XXX, + evsel->bperf_leader_prog_fd, "on_switch"); + + /* connect to leader diff_reading map */ + bpf_map__reuse_fd(evsel->follower_skel->maps.diff_readings, diff_map_fd); + + /* set up reading map */ + bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings, + filter_entry_cnt); + /* set up follower filter based on target */ + bpf_map__set_max_entries(evsel->follower_skel->maps.filter, + filter_entry_cnt); + err = bperf_follower_bpf__load(evsel->follower_skel); + if (err) { + pr_err("Failed to load follower skeleton\n"); + bperf_follower_bpf__destroy(evsel->follower_skel); + evsel->follower_skel = NULL; + goto out; + } + + for (i = 0; i < filter_entry_cnt; i++) { + int filter_map_fd; + __u32 key; + + if (filter_type == BPERF_FILTER_PID || + filter_type == BPERF_FILTER_TGID) + key = evsel->core.threads->map[i].pid; + else if (filter_type == BPERF_FILTER_CPU) + key = evsel->core.cpus->map[i]; + else + break; + + filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter); + bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY); + } + + evsel->follower_skel->bss->type = filter_type; + + err = bperf_follower_bpf__attach(evsel->follower_skel); + +out: + if (err && evsel->bperf_leader_link_fd >= 0) + close(evsel->bperf_leader_link_fd); + if (err && evsel->bperf_leader_prog_fd >= 0) + close(evsel->bperf_leader_prog_fd); + if (diff_map_fd >= 0) + close(diff_map_fd); + + flock(attr_map_fd, LOCK_UN); + close(attr_map_fd); + + return err; +} + +static int bperf__install_pe(struct evsel *evsel, int cpu, int fd) +{ + struct bperf_leader_bpf *skel = evsel->leader_skel; + + return bpf_map_update_elem(bpf_map__fd(skel->maps.events), + &cpu, &fd, BPF_ANY); +} + +/* + * trigger the leader prog on each cpu, so the accum_reading map could get + * the latest readings. + */ +static int bperf_sync_counters(struct evsel *evsel) +{ + int num_cpu, i, cpu; + + num_cpu = all_cpu_map->nr; + for (i = 0; i < num_cpu; i++) { + cpu = all_cpu_map->map[i]; + bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu); + } + return 0; +} + +static int bperf__enable(struct evsel *evsel) +{ + evsel->follower_skel->bss->enabled = 1; + return 0; +} + +static int bperf__read(struct evsel *evsel) +{ + struct bperf_follower_bpf *skel = evsel->follower_skel; + __u32 num_cpu_bpf = cpu__max_cpu(); + struct bpf_perf_event_value values[num_cpu_bpf]; + int reading_map_fd, err = 0; + __u32 i, j, num_cpu; + + bperf_sync_counters(evsel); + reading_map_fd = bpf_map__fd(skel->maps.accum_readings); + + for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) { + __u32 cpu; + + err = bpf_map_lookup_elem(reading_map_fd, &i, values); + if (err) + goto out; + switch (evsel->follower_skel->bss->type) { + case BPERF_FILTER_GLOBAL: + assert(i == 0); + + num_cpu = all_cpu_map->nr; + for (j = 0; j < num_cpu; j++) { + cpu = all_cpu_map->map[j]; + perf_counts(evsel->counts, cpu, 0)->val = values[cpu].counter; + perf_counts(evsel->counts, cpu, 0)->ena = values[cpu].enabled; + perf_counts(evsel->counts, cpu, 0)->run = values[cpu].running; + } + break; + case BPERF_FILTER_CPU: + cpu = evsel->core.cpus->map[i]; + perf_counts(evsel->counts, i, 0)->val = values[cpu].counter; + perf_counts(evsel->counts, i, 0)->ena = values[cpu].enabled; + perf_counts(evsel->counts, i, 0)->run = values[cpu].running; + break; + case BPERF_FILTER_PID: + case BPERF_FILTER_TGID: + perf_counts(evsel->counts, 0, i)->val = 0; + perf_counts(evsel->counts, 0, i)->ena = 0; + perf_counts(evsel->counts, 0, i)->run = 0; + + for (cpu = 0; cpu < num_cpu_bpf; cpu++) { + perf_counts(evsel->counts, 0, i)->val += values[cpu].counter; + perf_counts(evsel->counts, 0, i)->ena += values[cpu].enabled; + perf_counts(evsel->counts, 0, i)->run += values[cpu].running; + } + break; + default: + break; + } + } +out: + return err; +} + +static int bperf__destroy(struct evsel *evsel) +{ + bperf_follower_bpf__destroy(evsel->follower_skel); + close(evsel->bperf_leader_prog_fd); + close(evsel->bperf_leader_link_fd); + return 0; +} + +/* + * bperf: share hardware PMCs with BPF + * + * perf uses performance monitoring counters (PMC) to monitor system + * performance. The PMCs are limited hardware resources. For example, + * Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu. + * + * Modern data center systems use these PMCs in many different ways: + * system level monitoring, (maybe nested) container level monitoring, per + * process monitoring, profiling (in sample mode), etc. In some cases, + * there are more active perf_events than available hardware PMCs. To allow + * all perf_events to have a chance to run, it is necessary to do expensive + * time multiplexing of events. + * + * On the other hand, many monitoring tools count the common metrics + * (cycles, instructions). It is a waste to have multiple tools create + * multiple perf_events of "cycles" and occupy multiple PMCs. + * + * bperf tries to reduce such wastes by allowing multiple perf_events of + * "cycles" or "instructions" (at different scopes) to share PMUs. Instead + * of having each perf-stat session to read its own perf_events, bperf uses + * BPF programs to read the perf_events and aggregate readings to BPF maps. + * Then, the perf-stat session(s) reads the values from these BPF maps. + * + * || + * shared progs and maps <- || -> per session progs and maps + * || + * --------------- || + * | perf_events | || + * --------------- fexit || ----------------- + * | --------||----> | follower prog | + * --------------- / || --- ----------------- + * cs -> | leader prog |/ ||/ | | + * --> --------------- /|| -------------- ------------------ + * / | | / || | filter map | | accum_readings | + * / ------------ ------------ || -------------- ------------------ + * | | prev map | | diff map | || | + * | ------------ ------------ || | + * \ || | + * = \ ==================================================== | ============ + * \ / user space + * \ / + * \ / + * BPF_PROG_TEST_RUN BPF_MAP_LOOKUP_ELEM + * \ / + * \ / + * \------ perf-stat ----------------------/ + * + * The figure above shows the architecture of bperf. Note that the figure + * is divided into 3 regions: shared progs and maps (top left), per session + * progs and maps (top right), and user space (bottom). + * + * The leader prog is triggered on each context switch (cs). The leader + * prog reads perf_events and stores the difference (current_reading - + * previous_reading) to the diff map. For the same metric, e.g. "cycles", + * multiple perf-stat sessions share the same leader prog. + * + * Each perf-stat session creates a follower prog as fexit program to the + * leader prog. It is possible to attach up to BPF_MAX_TRAMP_PROGS (38) + * follower progs to the same leader prog. The follower prog checks current + * task and processor ID to decide whether to add the value from the diff + * map to its accumulated reading map (accum_readings). + * + * Finally, perf-stat user space reads the value from accum_reading map. + * + * Besides context switch, it is also necessary to trigger the leader prog + * before perf-stat reads the value. Otherwise, the accum_reading map may + * not have the latest reading from the perf_events. This is achieved by + * triggering the event via sys_bpf(BPF_PROG_TEST_RUN) to each CPU. + * + * Comment before the definition of struct perf_event_attr_map_entry + * describes how different sessions of perf-stat share information about + * the leader prog. + */ + +struct bpf_counter_ops bperf_ops = { + .load = bperf__load, + .enable = bperf__enable, + .read = bperf__read, + .install_pe = bperf__install_pe, + .destroy = bperf__destroy, +}; + +static inline bool bpf_counter_skip(struct evsel *evsel) +{ + return list_empty(&evsel->bpf_counter_list) && + evsel->follower_skel == NULL; +} + int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd) { - if (list_empty(&evsel->bpf_counter_list)) + if (bpf_counter_skip(evsel)) return 0; return evsel->bpf_counter_ops->install_pe(evsel, cpu, fd); } int bpf_counter__load(struct evsel *evsel, struct target *target) { - if (target__has_bpf(target)) + if (target->bpf_str) evsel->bpf_counter_ops = &bpf_program_profiler_ops; + else if (target->use_bpf) + evsel->bpf_counter_ops = &bperf_ops; if (evsel->bpf_counter_ops) return evsel->bpf_counter_ops->load(evsel, target); @@ -293,21 +802,21 @@ int bpf_counter__load(struct evsel *evsel, struct target *target) int bpf_counter__enable(struct evsel *evsel) { - if (list_empty(&evsel->bpf_counter_list)) + if (bpf_counter_skip(evsel)) return 0; return evsel->bpf_counter_ops->enable(evsel); } int bpf_counter__read(struct evsel *evsel) { - if (list_empty(&evsel->bpf_counter_list)) + if (bpf_counter_skip(evsel)) return -EAGAIN; return evsel->bpf_counter_ops->read(evsel); } void bpf_counter__destroy(struct evsel *evsel) { - if (list_empty(&evsel->bpf_counter_list)) + if (bpf_counter_skip(evsel)) return; evsel->bpf_counter_ops->destroy(evsel); evsel->bpf_counter_ops = NULL; diff --git a/tools/perf/util/bpf_skel/bperf.h b/tools/perf/util/bpf_skel/bperf.h new file mode 100644 index 000000000000..186a5551ddb9 --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook + +#ifndef __BPERF_STAT_H +#define __BPERF_STAT_H + +typedef struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} reading_map; + +#endif /* __BPERF_STAT_H */ diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c new file mode 100644 index 000000000000..b8fa3cb2da23 --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook +#include +#include +#include +#include +#include "bperf.h" +#include "bperf_u.h" + +reading_map diff_readings SEC(".maps"); +reading_map accum_readings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} filter SEC(".maps"); + +enum bperf_filter_type type = 0; +int enabled = 0; + +SEC("fexit/XXX") +int BPF_PROG(fexit_XXX) +{ + struct bpf_perf_event_value *diff_val, *accum_val; + __u32 filter_key, zero = 0; + __u32 *accum_key; + + if (!enabled) + return 0; + + switch (type) { + case BPERF_FILTER_GLOBAL: + accum_key = &zero; + goto do_add; + case BPERF_FILTER_CPU: + filter_key = bpf_get_smp_processor_id(); + break; + case BPERF_FILTER_PID: + filter_key = bpf_get_current_pid_tgid() & 0xffffffff; + break; + case BPERF_FILTER_TGID: + filter_key = bpf_get_current_pid_tgid() >> 32; + break; + default: + return 0; + } + + accum_key = bpf_map_lookup_elem(&filter, &filter_key); + if (!accum_key) + return 0; + +do_add: + diff_val = bpf_map_lookup_elem(&diff_readings, &zero); + if (!diff_val) + return 0; + + accum_val = bpf_map_lookup_elem(&accum_readings, accum_key); + if (!accum_val) + return 0; + + accum_val->counter += diff_val->counter; + accum_val->enabled += diff_val->enabled; + accum_val->running += diff_val->running; + + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_leader.bpf.c b/tools/perf/util/bpf_skel/bperf_leader.bpf.c new file mode 100644 index 000000000000..4f70d1459e86 --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_leader.bpf.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook +#include +#include +#include +#include +#include "bperf.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(int)); + __uint(map_flags, BPF_F_PRESERVE_ELEMS); +} events SEC(".maps"); + +reading_map prev_readings SEC(".maps"); +reading_map diff_readings SEC(".maps"); + +SEC("raw_tp/sched_switch") +int BPF_PROG(on_switch) +{ + struct bpf_perf_event_value val, *prev_val, *diff_val; + __u32 key = bpf_get_smp_processor_id(); + __u32 zero = 0; + long err; + + prev_val = bpf_map_lookup_elem(&prev_readings, &zero); + if (!prev_val) + return 0; + + diff_val = bpf_map_lookup_elem(&diff_readings, &zero); + if (!diff_val) + return 0; + + err = bpf_perf_event_read_value(&events, key, &val, sizeof(val)); + if (err) + return 0; + + diff_val->counter = val.counter - prev_val->counter; + diff_val->enabled = val.enabled - prev_val->enabled; + diff_val->running = val.running - prev_val->running; + *prev_val = val; + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h new file mode 100644 index 000000000000..1ce0c2c905c1 --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_u.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook + +#ifndef __BPERF_STAT_U_H +#define __BPERF_STAT_U_H + +enum bperf_filter_type { + BPERF_FILTER_GLOBAL = 1, + BPERF_FILTER_CPU, + BPERF_FILTER_PID, + BPERF_FILTER_TGID, +}; + +#endif /* __BPERF_STAT_U_H */ diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 6026487353dd..dd4f56f9cfdf 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -20,6 +20,8 @@ union perf_event; struct bpf_counter_ops; struct target; struct hashmap; +struct bperf_leader_bpf; +struct bperf_follower_bpf; typedef int (evsel__sb_cb_t)(union perf_event *event, void *data); @@ -130,8 +132,24 @@ struct evsel { * See also evsel__has_callchain(). */ __u64 synth_sample_type; - struct list_head bpf_counter_list; + + /* + * bpf_counter_ops serves two use cases: + * 1. perf-stat -b counting events used byBPF programs + * 2. perf-stat --use-bpf use BPF programs to aggregate counts + */ struct bpf_counter_ops *bpf_counter_ops; + + /* for perf-stat -b */ + struct list_head bpf_counter_list; + + /* for perf-stat --use-bpf */ + int bperf_leader_prog_fd; + int bperf_leader_link_fd; + union { + struct bperf_leader_bpf *leader_skel; + struct bperf_follower_bpf *follower_skel; + }; }; struct perf_missing_features { diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h index f132c6c2eef8..1bce3eb28ef2 100644 --- a/tools/perf/util/target.h +++ b/tools/perf/util/target.h @@ -16,6 +16,8 @@ struct target { bool uses_mmap; bool default_per_cpu; bool per_thread; + bool use_bpf; + const char *attr_map; }; enum target_errno { @@ -66,7 +68,7 @@ static inline bool target__has_cpu(struct target *target) static inline bool target__has_bpf(struct target *target) { - return target->bpf_str; + return target->bpf_str || target->use_bpf; } static inline bool target__none(struct target *target) -- cgit v1.2.3 From 435b46ef1d9fd904089199da16a21ade0701537f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 16 Mar 2021 14:18:36 -0700 Subject: perf stat: Measure 't0' and 'ref_time' after enable_counters() Take measurements of 't0' and 'ref_time' after enable_counters(), so that they only measure the time consumed when the counters are enabled. Signed-off-by: Song Liu Acked-by: Andi Kleen Acked-by: Jiri Olsa Acked-by: Namhyung Kim Cc: kernel-team@fb.com Link: http://lore.kernel.org/lkml/20210316211837.910506-3-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d34e22dce2c8..4bb48c6b6698 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -931,15 +931,15 @@ try_again_reset: /* * Enable counters and exec the command: */ - t0 = rdclock(); - clock_gettime(CLOCK_MONOTONIC, &ref_time); - if (forks) { evlist__start_workload(evsel_list); err = enable_counters(); if (err) return -1; + t0 = rdclock(); + clock_gettime(CLOCK_MONOTONIC, &ref_time); + if (interval || timeout || evlist__ctlfd_initialized(evsel_list)) status = dispatch_events(forks, timeout, interval, ×); if (child_pid != -1) { @@ -960,6 +960,10 @@ try_again_reset: err = enable_counters(); if (err) return -1; + + t0 = rdclock(); + clock_gettime(CLOCK_MONOTONIC, &ref_time); + status = dispatch_events(forks, timeout, interval, ×); } -- cgit v1.2.3 From 2c0cb9f56020d2ea006589434d5eb4e702110124 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 23 Mar 2021 17:42:47 -0300 Subject: perf test: Add a shell test for 'perf stat --bpf-counters' new option Add a test to compare the output of perf-stat with and without option --bpf-counters. If the difference is more than 10%, the test is considered as failed. Committer testing: # perf test bpf-counters 86: perf stat --bpf-counters test : Ok # perf test -v bpf-counters 86: perf stat --bpf-counters test : --- start --- test child forked, pid 2433339 test child finished with 0 ---- end ---- perf stat --bpf-counters test: Ok # Signed-off-by: Song Liu Requested-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Acked-by: Namhyung Kim Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/EC00E37D-8587-4662-8E30-7AD5F874FA84@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat_bpf_counters.sh | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 tools/perf/tests/shell/stat_bpf_counters.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/stat_bpf_counters.sh b/tools/perf/tests/shell/stat_bpf_counters.sh new file mode 100755 index 000000000000..22eb31e48ca7 --- /dev/null +++ b/tools/perf/tests/shell/stat_bpf_counters.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# perf stat --bpf-counters test +# SPDX-License-Identifier: GPL-2.0 + +set -e + +# check whether $2 is within +/- 10% of $1 +compare_number() +{ + first_num=$1 + second_num=$2 + + # upper bound is first_num * 110% + upper=$(( $first_num + $first_num / 10 )) + # lower bound is first_num * 90% + lower=$(( $first_num - $first_num / 10 )) + + if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then + echo "The difference between $first_num and $second_num are greater than 10%." + exit 1 + fi +} + +# skip if --bpf-counters is not supported +perf stat --bpf-counters true > /dev/null 2>&1 || exit 2 + +base_cycles=$(perf stat --no-big-num -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}') +bpf_cycles=$(perf stat --no-big-num --bpf-counters -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}') + +compare_number $base_cycles $bpf_cycles +exit 0 -- cgit v1.2.3 From 3a72c94ebfb1f171eba0715998010678a09ec796 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Tue, 23 Feb 2021 17:02:27 +1000 Subject: selftests/powerpc: Fix L1D flushing tests for Power10 The rfi_flush and entry_flush selftests work by using the PM_LD_MISS_L1 perf event to count L1D misses. The value of this event has changed over time: - Power7 uses 0x400f0 - Power8 and Power9 use both 0x400f0 and 0x3e054 - Power10 uses only 0x3e054 Rather than relying on raw values, configure perf to count L1D read misses in the most explicit way available. This fixes the selftests to work on systems without 0x400f0 as PM_LD_MISS_L1, and should change no behaviour for systems that the tests already worked on. The only potential downside is that referring to a specific perf event requires PMU support implemented in the kernel for that platform. Signed-off-by: Russell Currey Acked-by: Daniel Axtens Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210223070227.2916871-1-ruscur@russell.cc --- tools/testing/selftests/powerpc/security/entry_flush.c | 2 +- tools/testing/selftests/powerpc/security/flush_utils.h | 4 ++++ tools/testing/selftests/powerpc/security/rfi_flush.c | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/security/entry_flush.c b/tools/testing/selftests/powerpc/security/entry_flush.c index 78cf914fa321..68ce377b205e 100644 --- a/tools/testing/selftests/powerpc/security/entry_flush.c +++ b/tools/testing/selftests/powerpc/security/entry_flush.c @@ -53,7 +53,7 @@ int entry_flush_test(void) entry_flush = entry_flush_orig; - fd = perf_event_open_counter(PERF_TYPE_RAW, /* L1d miss */ 0x400f0, -1); + fd = perf_event_open_counter(PERF_TYPE_HW_CACHE, PERF_L1D_READ_MISS_CONFIG, -1); FAIL_IF(fd < 0); p = (char *)memalign(zero_size, CACHELINE_SIZE); diff --git a/tools/testing/selftests/powerpc/security/flush_utils.h b/tools/testing/selftests/powerpc/security/flush_utils.h index 07a5eb301466..7a3d60292916 100644 --- a/tools/testing/selftests/powerpc/security/flush_utils.h +++ b/tools/testing/selftests/powerpc/security/flush_utils.h @@ -9,6 +9,10 @@ #define CACHELINE_SIZE 128 +#define PERF_L1D_READ_MISS_CONFIG ((PERF_COUNT_HW_CACHE_L1D) | \ + (PERF_COUNT_HW_CACHE_OP_READ << 8) | \ + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)) + void syscall_loop(char *p, unsigned long iterations, unsigned long zero_size); diff --git a/tools/testing/selftests/powerpc/security/rfi_flush.c b/tools/testing/selftests/powerpc/security/rfi_flush.c index 7565fd786640..f73484a6470f 100644 --- a/tools/testing/selftests/powerpc/security/rfi_flush.c +++ b/tools/testing/selftests/powerpc/security/rfi_flush.c @@ -54,7 +54,7 @@ int rfi_flush_test(void) rfi_flush = rfi_flush_orig; - fd = perf_event_open_counter(PERF_TYPE_RAW, /* L1d miss */ 0x400f0, -1); + fd = perf_event_open_counter(PERF_TYPE_HW_CACHE, PERF_L1D_READ_MISS_CONFIG, -1); FAIL_IF(fd < 0); p = (char *)memalign(zero_size, CACHELINE_SIZE); -- cgit v1.2.3 From f33dece70e11ce82a09cb1ea2d7c32347b82c67e Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Sun, 14 Mar 2021 19:16:21 +0800 Subject: selftests/sgx: Use getauxval() to simplify test code Use the library function getauxval() instead of a custom function to get the base address of the vDSO. [ bp: Massage commit message. ] Signed-off-by: Tianjia Zhang Signed-off-by: Borislav Petkov Reviewed-by: Jarkko Sakkinen Acked-by: Shuah Khan Link: https://lkml.kernel.org/r/20210314111621.68428-1-tianjia.zhang@linux.alibaba.com --- tools/testing/selftests/sgx/main.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c index b117bb86a73f..d304a4044eb9 100644 --- a/tools/testing/selftests/sgx/main.c +++ b/tools/testing/selftests/sgx/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "defines.h" #include "main.h" #include "../kselftest.h" @@ -28,24 +29,6 @@ struct vdso_symtab { Elf64_Word *elf_hashtab; }; -static void *vdso_get_base_addr(char *envp[]) -{ - Elf64_auxv_t *auxv; - int i; - - for (i = 0; envp[i]; i++) - ; - - auxv = (Elf64_auxv_t *)&envp[i + 1]; - - for (i = 0; auxv[i].a_type != AT_NULL; i++) { - if (auxv[i].a_type == AT_SYSINFO_EHDR) - return (void *)auxv[i].a_un.a_val; - } - - return NULL; -} - static Elf64_Dyn *vdso_get_dyntab(void *addr) { Elf64_Ehdr *ehdr = addr; @@ -162,7 +145,7 @@ static int user_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r return 0; } -int main(int argc, char *argv[], char *envp[]) +int main(int argc, char *argv[]) { struct sgx_enclave_run run; struct vdso_symtab symtab; @@ -203,7 +186,8 @@ int main(int argc, char *argv[], char *envp[]) memset(&run, 0, sizeof(run)); run.tcs = encl.encl_base; - addr = vdso_get_base_addr(envp); + /* Get vDSO base address */ + addr = (void *)getauxval(AT_SYSINFO_EHDR); if (!addr) goto err; -- cgit v1.2.3 From 0bdad97801af5913101179a5de3f54b0eb88deea Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Fri, 19 Mar 2021 15:01:55 +0800 Subject: perf stat: Align CSV output for summary mode The 'perf stat' subcommand supports the request for a summary of the interval counter readings. But the summary lines break the CSV output so it's hard for scripts to parse the result. Before: # perf stat -x, -I1000 --interval-count 1 --summary 1.001323097,8013.48,msec,cpu-clock,8013483384,100.00,8.013,CPUs utilized 1.001323097,270,,context-switches,8013513297,100.00,0.034,K/sec 1.001323097,13,,cpu-migrations,8013530032,100.00,0.002,K/sec 1.001323097,184,,page-faults,8013546992,100.00,0.023,K/sec 1.001323097,20574191,,cycles,8013551506,100.00,0.003,GHz 1.001323097,10562267,,instructions,8013564958,100.00,0.51,insn per cycle 1.001323097,2019244,,branches,8013575673,100.00,0.252,M/sec 1.001323097,106152,,branch-misses,8013585776,100.00,5.26,of all branches 8013.48,msec,cpu-clock,8013483384,100.00,7.984,CPUs utilized 270,,context-switches,8013513297,100.00,0.034,K/sec 13,,cpu-migrations,8013530032,100.00,0.002,K/sec 184,,page-faults,8013546992,100.00,0.023,K/sec 20574191,,cycles,8013551506,100.00,0.003,GHz 10562267,,instructions,8013564958,100.00,0.51,insn per cycle 2019244,,branches,8013575673,100.00,0.252,M/sec 106152,,branch-misses,8013585776,100.00,5.26,of all branches The summary line loses the timestamp column, which breaks the CSV output. We add a column at the original 'timestamp' position and it just says 'summary' for the summary line. After: # perf stat -x, -I1000 --interval-count 1 --summary 1.001196053,8012.72,msec,cpu-clock,8012722903,100.00,8.013,CPUs utilized 1.001196053,218,,context-switches,8012753271,100.00,0.027,K/sec 1.001196053,9,,cpu-migrations,8012769767,100.00,0.001,K/sec 1.001196053,0,,page-faults,8012786257,100.00,0.000,K/sec 1.001196053,15004518,,cycles,8012790637,100.00,0.002,GHz 1.001196053,7954691,,instructions,8012804027,100.00,0.53,insn per cycle 1.001196053,1590259,,branches,8012814766,100.00,0.198,M/sec 1.001196053,82601,,branch-misses,8012824365,100.00,5.19,of all branches summary,8012.72,msec,cpu-clock,8012722903,100.00,7.986,CPUs utilized summary,218,,context-switches,8012753271,100.00,0.027,K/sec summary,9,,cpu-migrations,8012769767,100.00,0.001,K/sec summary,0,,page-faults,8012786257,100.00,0.000,K/sec summary,15004518,,cycles,8012790637,100.00,0.002,GHz summary,7954691,,instructions,8012804027,100.00,0.53,insn per cycle summary,1590259,,branches,8012814766,100.00,0.198,M/sec summary,82601,,branch-misses,8012824365,100.00,5.19,of all branches Now it's easy for script to analyse the summary lines. Of course, we also consider not to break possible existing scripts which can continue to use the broken CSV format by using a new '--no-csv-summary.' option. # perf stat -x, -I1000 --interval-count 1 --summary --no-csv-summary 1.001213261,8012.67,msec,cpu-clock,8012672327,100.00,8.013,CPUs utilized 1.001213261,197,,context-switches,8012703742,100.00,24.586,/sec 1.001213261,9,,cpu-migrations,8012720902,100.00,1.123,/sec 1.001213261,644,,page-faults,8012738266,100.00,80.373,/sec 1.001213261,18350698,,cycles,8012744109,100.00,0.002,GHz 1.001213261,12745021,,instructions,8012759001,100.00,0.69,insn per cycle 1.001213261,2458033,,branches,8012770864,100.00,306.768,K/sec 1.001213261,102107,,branch-misses,8012781751,100.00,4.15,of all branches 8012.67,msec,cpu-clock,8012672327,100.00,7.985,CPUs utilized 197,,context-switches,8012703742,100.00,24.586,/sec 9,,cpu-migrations,8012720902,100.00,1.123,/sec 644,,page-faults,8012738266,100.00,80.373,/sec 18350698,,cycles,8012744109,100.00,0.002,GHz 12745021,,instructions,8012759001,100.00,0.69,insn per cycle 2458033,,branches,8012770864,100.00,306.768,K/sec 102107,,branch-misses,8012781751,100.00,4.15,of all branches This option can be enabled in perf config by setting the variable 'stat.no-csv-summary'. # perf config stat.no-csv-summary=true # perf config -l stat.no-csv-summary=true # perf stat -x, -I1000 --interval-count 1 --summary 1.001330198,8013.28,msec,cpu-clock,8013279201,100.00,8.013,CPUs utilized 1.001330198,205,,context-switches,8013308394,100.00,25.583,/sec 1.001330198,10,,cpu-migrations,8013324681,100.00,1.248,/sec 1.001330198,0,,page-faults,8013340926,100.00,0.000,/sec 1.001330198,8027742,,cycles,8013344503,100.00,0.001,GHz 1.001330198,2871717,,instructions,8013356501,100.00,0.36,insn per cycle 1.001330198,553564,,branches,8013366204,100.00,69.081,K/sec 1.001330198,54021,,branch-misses,8013375952,100.00,9.76,of all branches 8013.28,msec,cpu-clock,8013279201,100.00,7.985,CPUs utilized 205,,context-switches,8013308394,100.00,25.583,/sec 10,,cpu-migrations,8013324681,100.00,1.248,/sec 0,,page-faults,8013340926,100.00,0.000,/sec 8027742,,cycles,8013344503,100.00,0.001,GHz 2871717,,instructions,8013356501,100.00,0.36,insn per cycle 553564,,branches,8013366204,100.00,69.081,K/sec 54021,,branch-misses,8013375952,100.00,9.76,of all branches Signed-off-by: Jin Yao Acked-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jin Yao Cc: Kan Liang Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210319070156.20394-1-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 9 +++++++++ tools/perf/builtin-stat.c | 7 +++++++ tools/perf/util/config.c | 3 +++ tools/perf/util/stat-display.c | 6 ++++++ tools/perf/util/stat.h | 2 ++ 5 files changed, 27 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 744211fa8c18..6ec5960b08c3 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -482,6 +482,15 @@ convenient for post processing. --summary:: Print summary for interval mode (-I). +--no-csv-summary:: +Don't print 'summary' at the first column for CVS summary output. +This option must be used with -x and --summary. + +This option can be enabled in perf config by setting the variable +'stat.no-csv-summary'. + +$ perf config stat.no-csv-summary=true + EXAMPLES -------- diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 4bb48c6b6698..2a2c15cac80a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1093,6 +1093,11 @@ void perf_stat__set_big_num(int set) stat_config.big_num = (set != 0); } +void perf_stat__set_no_csv_summary(int set) +{ + stat_config.no_csv_summary = (set != 0); +} + static int stat__set_big_num(const struct option *opt __maybe_unused, const char *s __maybe_unused, int unset) { @@ -1249,6 +1254,8 @@ static struct option stat_options[] = { "threads of same physical core"), OPT_BOOLEAN(0, "summary", &stat_config.summary, "print summary for interval mode"), + OPT_BOOLEAN(0, "no-csv-summary", &stat_config.no_csv_summary, + "don't print 'summary' for CSV summary output"), OPT_BOOLEAN(0, "quiet", &stat_config.quiet, "don't print output (useful with record)"), #ifdef HAVE_LIBPFM diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 2daeaa9a4a24..6bcb5ef221f8 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -457,6 +457,9 @@ static int perf_stat_config(const char *var, const char *value) if (!strcmp(var, "stat.big-num")) perf_stat__set_big_num(perf_config_bool(var, value)); + if (!strcmp(var, "stat.no-csv-summary")) + perf_stat__set_no_csv_summary(perf_config_bool(var, value)); + /* Add other config variables here. */ return 0; } diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 7f09cdaf5b60..d3137bc17065 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -439,6 +439,12 @@ static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int if (counter->cgrp) os.nfields++; } + + if (!config->no_csv_summary && config->csv_output && + config->summary && !config->interval) { + fprintf(config->output, "%16s%s", "summary", config->csv_sep); + } + if (run == 0 || ena == 0 || counter->counts->scaled == -1) { if (config->metric_only) { pm(config, &os, NULL, "", "", 0); diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 41107b8deac5..48e6a06233fa 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -128,6 +128,7 @@ struct perf_stat_config { bool all_user; bool percore_show_thread; bool summary; + bool no_csv_summary; bool metric_no_group; bool metric_no_merge; bool stop_read_counter; @@ -160,6 +161,7 @@ struct perf_stat_config { }; void perf_stat__set_big_num(int set); +void perf_stat__set_no_csv_summary(int set); void update_stats(struct stats *stats, u64 val); double avg_stats(struct stats *stats); -- cgit v1.2.3 From 0f7ff383937b24a3db72234a37e8b724acda8ad3 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Fri, 19 Mar 2021 15:01:56 +0800 Subject: perf test: Add CSV summary test The patch "perf stat: Align CSV output for summary mode" aligned CSV output and added "summary" to the first column of summary lines. Now we check if the "summary" string is added to the CSV output. If we set '--no-csv-summary' option, the "summary" string would not be added, also check with this case. Committer testing: $ perf test csv 84: perf stat csv summary test : Ok $ Signed-off-by: Jin Yao Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jin Yao Cc: Kan Liang Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210319070156.20394-2-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat+csv_summary.sh | 31 ++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 tools/perf/tests/shell/stat+csv_summary.sh (limited to 'tools') diff --git a/tools/perf/tests/shell/stat+csv_summary.sh b/tools/perf/tests/shell/stat+csv_summary.sh new file mode 100755 index 000000000000..5571ff75eb42 --- /dev/null +++ b/tools/perf/tests/shell/stat+csv_summary.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# perf stat csv summary test +# SPDX-License-Identifier: GPL-2.0 + +set -e + +# +# 1.001364330 9224197 cycles 8012885033 100.00 +# summary 9224197 cycles 8012885033 100.00 +# +perf stat -e cycles -x' ' -I1000 --interval-count 1 --summary 2>&1 | \ +grep -e summary | \ +while read summary num event run pct +do + if [ $summary != "summary" ]; then + exit 1 + fi +done + +# +# 1.001360298 9148534 cycles 8012853854 100.00 +#9148534 cycles 8012853854 100.00 +# +perf stat -e cycles -x' ' -I1000 --interval-count 1 --summary --no-csv-summary 2>&1 | \ +grep -e summary | \ +while read num event run pct +do + exit 1 +done + +exit 0 -- cgit v1.2.3 From 46cb11b17c7a917e0eb5c7aa87a7bc6cda455a5e Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 19 Mar 2021 16:53:27 +0000 Subject: kselftest/arm64: mte: user_mem: Fix write() warning Out of the box Ubuntu's 20.04 compiler warns about missing return value checks for write() (sys)calls. Make GCC happy by checking whether we actually managed to write "val". Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319165334.29213-5-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/check_user_mem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/check_user_mem.c b/tools/testing/selftests/arm64/mte/check_user_mem.c index 4bfa80f2a8c3..1de7a0abd0ae 100644 --- a/tools/testing/selftests/arm64/mte/check_user_mem.c +++ b/tools/testing/selftests/arm64/mte/check_user_mem.c @@ -33,7 +33,8 @@ static int check_usermem_access_fault(int mem_type, int mode, int mapping) if (fd == -1) return KSFT_FAIL; for (i = 0; i < len; i++) - write(fd, &val, sizeof(val)); + if (write(fd, &val, sizeof(val)) != sizeof(val)) + return KSFT_FAIL; lseek(fd, 0, 0); ptr = mte_allocate_memory(len, mem_type, mapping, true); if (check_allocated_memory(ptr, len, mem_type, true) != KSFT_PASS) { -- cgit v1.2.3 From d302a702530b4025fbb14f20e637badce28bc741 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 19 Mar 2021 16:53:28 +0000 Subject: kselftest/arm64: mte: common: Fix write() warnings Out of the box Ubuntu's 20.04 compiler warns about missing return value checks for write() (sys)calls. Make GCC happy by checking whether we actually managed to write out our buffer. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319165334.29213-6-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- .../testing/selftests/arm64/mte/mte_common_util.c | 23 +++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 39f8908988ea..4e887dad762d 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -181,10 +181,17 @@ void *mte_allocate_file_memory(size_t size, int mem_type, int mapping, bool tags } /* Initialize the file for mappable size */ lseek(fd, 0, SEEK_SET); - for (index = INIT_BUFFER_SIZE; index < size; index += INIT_BUFFER_SIZE) - write(fd, buffer, INIT_BUFFER_SIZE); + for (index = INIT_BUFFER_SIZE; index < size; index += INIT_BUFFER_SIZE) { + if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) { + perror("initialising buffer"); + return NULL; + } + } index -= INIT_BUFFER_SIZE; - write(fd, buffer, size - index); + if (write(fd, buffer, size - index) != size - index) { + perror("initialising buffer"); + return NULL; + } return __mte_allocate_memory_range(size, mem_type, mapping, 0, 0, tags, fd); } @@ -202,9 +209,15 @@ void *mte_allocate_file_memory_tag_range(size_t size, int mem_type, int mapping, /* Initialize the file for mappable size */ lseek(fd, 0, SEEK_SET); for (index = INIT_BUFFER_SIZE; index < map_size; index += INIT_BUFFER_SIZE) - write(fd, buffer, INIT_BUFFER_SIZE); + if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) { + perror("initialising buffer"); + return NULL; + } index -= INIT_BUFFER_SIZE; - write(fd, buffer, map_size - index); + if (write(fd, buffer, map_size - index) != map_size - index) { + perror("initialising buffer"); + return NULL; + } return __mte_allocate_memory_range(size, mem_type, mapping, range_before, range_after, true, fd); } -- cgit v1.2.3 From 592432862cc4019075a7196d9961562c49507d6f Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 19 Mar 2021 16:53:29 +0000 Subject: kselftest/arm64: mte: Fix MTE feature detection To check whether the CPU and kernel support the MTE features we want to test, we use an (emulated) CPU ID register read. However we only check against a very particular feature version (0b0010), even though the ARM ARM promises ID register features to be backwards compatible. While this could be fixed by using ">=" instead of "==", we should actually use the explicit HWCAP2_MTE hardware capability, exposed by the kernel via the ELF auxiliary vectors. That moves this responsibility to the kernel, and fixes running the tests on machines with FEAT_MTE3 capability. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319165334.29213-7-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/mte_common_util.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 4e887dad762d..aa8a8a6b8b6d 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -291,22 +291,13 @@ int mte_switch_mode(int mte_option, unsigned long incl_mask) return 0; } -#define ID_AA64PFR1_MTE_SHIFT 8 -#define ID_AA64PFR1_MTE 2 - int mte_default_setup(void) { - unsigned long hwcaps = getauxval(AT_HWCAP); + unsigned long hwcaps2 = getauxval(AT_HWCAP2); unsigned long en = 0; int ret; - if (!(hwcaps & HWCAP_CPUID)) { - ksft_print_msg("FAIL: CPUID registers unavailable\n"); - return KSFT_FAIL; - } - /* Read ID_AA64PFR1_EL1 register */ - asm volatile("mrs %0, id_aa64pfr1_el1" : "=r"(hwcaps) : : "memory"); - if (((hwcaps >> ID_AA64PFR1_MTE_SHIFT) & MT_TAG_MASK) != ID_AA64PFR1_MTE) { + if (!(hwcaps2 & HWCAP2_MTE)) { ksft_print_msg("FAIL: MTE features unavailable\n"); return KSFT_SKIP; } -- cgit v1.2.3 From 5238c2cd5a2e0dc846d1b900553211e832952b8d Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 19 Mar 2021 16:53:30 +0000 Subject: kselftest/arm64: mte: Use cross-compiler if specified At the moment we either need to provide CC explicitly, or use a native machine to get the ARM64 MTE selftest compiled. It seems useful to use the same (cross-)compiler as we use for the kernel, so copy the recipe we use in the pauth selftest. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319165334.29213-8-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/Makefile | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile index 90aadd86fa0d..fa282e5f2069 100644 --- a/tools/testing/selftests/arm64/mte/Makefile +++ b/tools/testing/selftests/arm64/mte/Makefile @@ -1,6 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright (C) 2020 ARM Limited +# preserve CC value from top level Makefile +ifeq ($(CC),cc) +CC := $(CROSS_COMPILE)gcc +endif + CFLAGS += -std=gnu99 -I. -pthread LDFLAGS += -pthread SRCS := $(filter-out mte_common_util.c,$(wildcard *.c)) -- cgit v1.2.3 From 8bbb58a3c6b9f12b2217b8ae08e70a6d2f556f73 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 19 Mar 2021 16:53:31 +0000 Subject: kselftest/arm64: mte: Output warning about failing compiler At the moment we check the compiler's ability to compile MTE enabled code, but guard all the Makefile rules by it. As a consequence a broken or not capable compiler just doesn't do anything, and make happily returns without any error message, but with no programs created. Since the MTE feature is only supported by recent aarch64 compilers (not all stable distro compilers support it), having an explicit message seems like a good idea. To not break building multiple targets, we let make proceed without errors. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319165334.29213-9-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile index fa282e5f2069..e0d43cea3cd1 100644 --- a/tools/testing/selftests/arm64/mte/Makefile +++ b/tools/testing/selftests/arm64/mte/Makefile @@ -23,6 +23,9 @@ TEST_GEN_PROGS := $(PROGS) # Get Kernel headers installed and use them. KSFT_KHDR_INSTALL := 1 +else + $(warning compiler "$(CC)" does not support the ARMv8.5 MTE extension.) + $(warning test program "mte" will not be created.) endif # Include KSFT lib.mk. -- cgit v1.2.3 From 9466ecac84a462fc91d192f0f9be0afd8e4f19f9 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 19 Mar 2021 16:53:32 +0000 Subject: kselftest/arm64: mte: Makefile: Fix clang compilation When clang finds a header file on the command line, it wants to precompile that, which would end up in a separate output file. Specifying -o on that same command line collides with that effort, so the compiler complains: clang: error: cannot specify -o when generating multiple output files Since we are not really after a precompiled header, just drop the header file from the command line, by removing it from the list of source files in the Makefile. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319165334.29213-10-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile index e0d43cea3cd1..409e3e53d00a 100644 --- a/tools/testing/selftests/arm64/mte/Makefile +++ b/tools/testing/selftests/arm64/mte/Makefile @@ -32,5 +32,5 @@ endif include ../../lib.mk ifeq ($(mte_cc_support),1) -$(TEST_GEN_PROGS): mte_common_util.c mte_common_util.h mte_helper.S +$(TEST_GEN_PROGS): mte_common_util.c mte_helper.S endif -- cgit v1.2.3 From b4e1fa2290691fda4392ac479115ee3b04a7534c Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 19 Mar 2021 16:53:33 +0000 Subject: kselftest/arm64: mte: Fix clang warning if (!prctl(...) == 0) is not only cumbersome to read, it also upsets clang and triggers a warning: ------------ mte_common_util.c:287:6: warning: logical not is only applied to the left hand side of this comparison [-Wlogical-not-parentheses] .... Fix that by just comparing against "not 0" instead. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319165334.29213-11-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/mte_common_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index aa8a8a6b8b6d..040abdca079d 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -284,7 +284,7 @@ int mte_switch_mode(int mte_option, unsigned long incl_mask) en |= (incl_mask << PR_MTE_TAG_SHIFT); /* Enable address tagging ABI, mte error reporting mode and tag inclusion mask. */ - if (!prctl(PR_SET_TAGGED_ADDR_CTRL, en, 0, 0, 0) == 0) { + if (prctl(PR_SET_TAGGED_ADDR_CTRL, en, 0, 0, 0) != 0) { ksft_print_msg("FAIL:prctl PR_SET_TAGGED_ADDR_CTRL for mte mode\n"); return -EINVAL; } -- cgit v1.2.3 From 75347add03e0fa60ecf2f79e41ec2152b8504593 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 19 Mar 2021 16:53:34 +0000 Subject: kselftest/arm64: mte: Report filename on failing temp file creation The MTE selftests create temporary files in /dev/shm, for later mmap-ing them. When there is no tmpfs mounted on /dev/shm, or /dev/shm does not exist in the first place (on minimal filesystems), the error message is not giving good hints: # FAIL: Unable to open temporary file # FAIL: memory allocation not ok 17 Check initial tags with private mapping, ... Add a perror() call, that gives both the filename and the actual error reason, so that users get a chance of correcting that. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319165334.29213-12-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/mte_common_util.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 040abdca079d..f50ac31920d1 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -337,6 +337,7 @@ int create_temp_file(void) /* Create a file in the tmpfs filesystem */ fd = mkstemp(&filename[0]); if (fd == -1) { + perror(filename); ksft_print_msg("FAIL: Unable to open temporary file\n"); return 0; } -- cgit v1.2.3 From 314bcbf09f147cfb069bc22207215b6b0b7da510 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 9 Mar 2021 19:37:31 +0000 Subject: kselftest: arm64: Add BTI tests Add some tests that verify that BTI functions correctly for static binaries built with and without BTI support, verifying that SIGILL is generated when expected and is not generated in other situations. Since BTI support is still being rolled out in distributions these tests are built entirely free standing, no libc support is used at all so none of the standard helper functions for kselftest can be used and we open code everything. This also means we aren't testing the kernel support for the dynamic linker, though the test program can be readily adapted for that once it becomes something that we can reliably build and run. These tests were originally written by Dave Martin, I've adapted them for kselftest, mainly around the build system and the output format. Signed-off-by: Mark Brown Cc: Dave Martin Link: https://lore.kernel.org/r/20210309193731.57247-1-broonie@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/Makefile | 2 +- tools/testing/selftests/arm64/bti/.gitignore | 2 + tools/testing/selftests/arm64/bti/Makefile | 61 ++++++ tools/testing/selftests/arm64/bti/assembler.h | 80 ++++++++ tools/testing/selftests/arm64/bti/btitest.h | 23 +++ tools/testing/selftests/arm64/bti/compiler.h | 21 ++ tools/testing/selftests/arm64/bti/gen/.gitignore | 2 + tools/testing/selftests/arm64/bti/signal.c | 37 ++++ tools/testing/selftests/arm64/bti/signal.h | 21 ++ tools/testing/selftests/arm64/bti/start.S | 14 ++ tools/testing/selftests/arm64/bti/syscall.S | 23 +++ tools/testing/selftests/arm64/bti/system.c | 22 +++ tools/testing/selftests/arm64/bti/system.h | 28 +++ tools/testing/selftests/arm64/bti/test.c | 234 +++++++++++++++++++++++ tools/testing/selftests/arm64/bti/teststubs.S | 39 ++++ tools/testing/selftests/arm64/bti/trampoline.S | 29 +++ 16 files changed, 637 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/bti/.gitignore create mode 100644 tools/testing/selftests/arm64/bti/Makefile create mode 100644 tools/testing/selftests/arm64/bti/assembler.h create mode 100644 tools/testing/selftests/arm64/bti/btitest.h create mode 100644 tools/testing/selftests/arm64/bti/compiler.h create mode 100644 tools/testing/selftests/arm64/bti/gen/.gitignore create mode 100644 tools/testing/selftests/arm64/bti/signal.c create mode 100644 tools/testing/selftests/arm64/bti/signal.h create mode 100644 tools/testing/selftests/arm64/bti/start.S create mode 100644 tools/testing/selftests/arm64/bti/syscall.S create mode 100644 tools/testing/selftests/arm64/bti/system.c create mode 100644 tools/testing/selftests/arm64/bti/system.h create mode 100644 tools/testing/selftests/arm64/bti/test.c create mode 100644 tools/testing/selftests/arm64/bti/teststubs.S create mode 100644 tools/testing/selftests/arm64/bti/trampoline.S (limited to 'tools') diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index 2c9d012797a7..ced910fb4019 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -4,7 +4,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),aarch64 arm64)) -ARM64_SUBTARGETS ?= tags signal pauth fp mte +ARM64_SUBTARGETS ?= tags signal pauth fp mte bti else ARM64_SUBTARGETS := endif diff --git a/tools/testing/selftests/arm64/bti/.gitignore b/tools/testing/selftests/arm64/bti/.gitignore new file mode 100644 index 000000000000..73869fabada4 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/.gitignore @@ -0,0 +1,2 @@ +btitest +nobtitest diff --git a/tools/testing/selftests/arm64/bti/Makefile b/tools/testing/selftests/arm64/bti/Makefile new file mode 100644 index 000000000000..73e013c082a6 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/Makefile @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_GEN_PROGS := btitest nobtitest + +PROGS := $(patsubst %,gen/%,$(TEST_GEN_PROGS)) + +# These tests are built as freestanding binaries since otherwise BTI +# support in ld.so is required which is not currently widespread; when +# it is available it will still be useful to test this separately as the +# cases for statically linked and dynamically lined binaries are +# slightly different. + +CFLAGS_NOBTI = -DBTI=0 +CFLAGS_BTI = -mbranch-protection=standard -DBTI=1 + +CFLAGS_COMMON = -ffreestanding -Wall -Wextra $(CFLAGS) + +BTI_CC_COMMAND = $(CC) $(CFLAGS_BTI) $(CFLAGS_COMMON) -c -o $@ $< +NOBTI_CC_COMMAND = $(CC) $(CFLAGS_NOBTI) $(CFLAGS_COMMON) -c -o $@ $< + +%-bti.o: %.c + $(BTI_CC_COMMAND) + +%-bti.o: %.S + $(BTI_CC_COMMAND) + +%-nobti.o: %.c + $(NOBTI_CC_COMMAND) + +%-nobti.o: %.S + $(NOBTI_CC_COMMAND) + +BTI_OBJS = \ + test-bti.o \ + signal-bti.o \ + start-bti.o \ + syscall-bti.o \ + system-bti.o \ + teststubs-bti.o \ + trampoline-bti.o +gen/btitest: $(BTI_OBJS) + $(CC) $(CFLAGS_BTI) $(CFLAGS_COMMON) -nostdlib -o $@ $^ + +NOBTI_OBJS = \ + test-nobti.o \ + signal-nobti.o \ + start-nobti.o \ + syscall-nobti.o \ + system-nobti.o \ + teststubs-nobti.o \ + trampoline-nobti.o +gen/nobtitest: $(NOBTI_OBJS) + $(CC) $(CFLAGS_BTI) $(CFLAGS_COMMON) -nostdlib -o $@ $^ + +# Including KSFT lib.mk here will also mangle the TEST_GEN_PROGS list +# to account for any OUTPUT target-dirs optionally provided by +# the toplevel makefile +include ../../lib.mk + +$(TEST_GEN_PROGS): $(PROGS) + cp $(PROGS) $(OUTPUT)/ diff --git a/tools/testing/selftests/arm64/bti/assembler.h b/tools/testing/selftests/arm64/bti/assembler.h new file mode 100644 index 000000000000..04e7b72880ef --- /dev/null +++ b/tools/testing/selftests/arm64/bti/assembler.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin + */ + +#ifndef ASSEMBLER_H +#define ASSEMBLER_H + +#define NT_GNU_PROPERTY_TYPE_0 5 +#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 + +/* Bits for GNU_PROPERTY_AARCH64_FEATURE_1_BTI */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) +#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) + + +.macro startfn name:req + .globl \name +\name: + .macro endfn + .size \name, . - \name + .type \name, @function + .purgem endfn + .endm +.endm + +.macro emit_aarch64_feature_1_and + .pushsection .note.gnu.property, "a" + .align 3 + .long 2f - 1f + .long 6f - 3f + .long NT_GNU_PROPERTY_TYPE_0 +1: .string "GNU" +2: + .align 3 +3: .long GNU_PROPERTY_AARCH64_FEATURE_1_AND + .long 5f - 4f +4: +#if BTI + .long GNU_PROPERTY_AARCH64_FEATURE_1_PAC | \ + GNU_PROPERTY_AARCH64_FEATURE_1_BTI +#else + .long 0 +#endif +5: + .align 3 +6: + .popsection +.endm + +.macro paciasp + hint 0x19 +.endm + +.macro autiasp + hint 0x1d +.endm + +.macro __bti_ + hint 0x20 +.endm + +.macro __bti_c + hint 0x22 +.endm + +.macro __bti_j + hint 0x24 +.endm + +.macro __bti_jc + hint 0x26 +.endm + +.macro bti what= + __bti_\what +.endm + +#endif /* ! ASSEMBLER_H */ diff --git a/tools/testing/selftests/arm64/bti/btitest.h b/tools/testing/selftests/arm64/bti/btitest.h new file mode 100644 index 000000000000..2aff9b10336e --- /dev/null +++ b/tools/testing/selftests/arm64/bti/btitest.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin + */ + +#ifndef BTITEST_H +#define BTITEST_H + +/* Trampolines for calling the test stubs: */ +void call_using_br_x0(void (*)(void)); +void call_using_br_x16(void (*)(void)); +void call_using_blr(void (*)(void)); + +/* Test stubs: */ +void nohint_func(void); +void bti_none_func(void); +void bti_c_func(void); +void bti_j_func(void); +void bti_jc_func(void); +void paciasp_func(void); + +#endif /* !BTITEST_H */ diff --git a/tools/testing/selftests/arm64/bti/compiler.h b/tools/testing/selftests/arm64/bti/compiler.h new file mode 100644 index 000000000000..ebb6204f447a --- /dev/null +++ b/tools/testing/selftests/arm64/bti/compiler.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin + */ + +#ifndef COMPILER_H +#define COMPILER_H + +#define __always_unused __attribute__((__unused__)) +#define __noreturn __attribute__((__noreturn__)) +#define __unreachable() __builtin_unreachable() + +/* curse(e) has value e, but the compiler cannot assume so */ +#define curse(e) ({ \ + __typeof__(e) __curse_e = (e); \ + asm ("" : "+r" (__curse_e)); \ + __curse_e; \ +}) + +#endif /* ! COMPILER_H */ diff --git a/tools/testing/selftests/arm64/bti/gen/.gitignore b/tools/testing/selftests/arm64/bti/gen/.gitignore new file mode 100644 index 000000000000..73869fabada4 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/gen/.gitignore @@ -0,0 +1,2 @@ +btitest +nobtitest diff --git a/tools/testing/selftests/arm64/bti/signal.c b/tools/testing/selftests/arm64/bti/signal.c new file mode 100644 index 000000000000..f3fd29b91141 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/signal.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin + */ + +#include "system.h" +#include "signal.h" + +int sigemptyset(sigset_t *s) +{ + unsigned int i; + + for (i = 0; i < _NSIG_WORDS; ++i) + s->sig[i] = 0; + + return 0; +} + +int sigaddset(sigset_t *s, int n) +{ + if (n < 1 || n > _NSIG) + return -EINVAL; + + s->sig[(n - 1) / _NSIG_BPW] |= 1UL << (n - 1) % _NSIG_BPW; + return 0; +} + +int sigaction(int n, struct sigaction *sa, const struct sigaction *old) +{ + return syscall(__NR_rt_sigaction, n, sa, old, sizeof(sa->sa_mask)); +} + +int sigprocmask(int how, const sigset_t *mask, sigset_t *old) +{ + return syscall(__NR_rt_sigprocmask, how, mask, old, sizeof(*mask)); +} diff --git a/tools/testing/selftests/arm64/bti/signal.h b/tools/testing/selftests/arm64/bti/signal.h new file mode 100644 index 000000000000..103457dc880e --- /dev/null +++ b/tools/testing/selftests/arm64/bti/signal.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin + */ + +#ifndef SIGNAL_H +#define SIGNAL_H + +#include + +#include "system.h" + +typedef __sighandler_t sighandler_t; + +int sigemptyset(sigset_t *s); +int sigaddset(sigset_t *s, int n); +int sigaction(int n, struct sigaction *sa, const struct sigaction *old); +int sigprocmask(int how, const sigset_t *mask, sigset_t *old); + +#endif /* ! SIGNAL_H */ diff --git a/tools/testing/selftests/arm64/bti/start.S b/tools/testing/selftests/arm64/bti/start.S new file mode 100644 index 000000000000..831f952e0572 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/start.S @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin + */ + +#include "assembler.h" + +startfn _start + mov x0, sp + b start +endfn + +emit_aarch64_feature_1_and diff --git a/tools/testing/selftests/arm64/bti/syscall.S b/tools/testing/selftests/arm64/bti/syscall.S new file mode 100644 index 000000000000..8dde8b6f3db1 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/syscall.S @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin + */ + +#include "assembler.h" + +startfn syscall + bti c + mov w8, w0 + mov x0, x1 + mov x1, x2 + mov x2, x3 + mov x3, x4 + mov x4, x5 + mov x5, x6 + mov x6, x7 + svc #0 + ret +endfn + +emit_aarch64_feature_1_and diff --git a/tools/testing/selftests/arm64/bti/system.c b/tools/testing/selftests/arm64/bti/system.c new file mode 100644 index 000000000000..6385d8d4973b --- /dev/null +++ b/tools/testing/selftests/arm64/bti/system.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin + */ + +#include "system.h" + +#include + +#include "compiler.h" + +void __noreturn exit(int n) +{ + syscall(__NR_exit, n); + __unreachable(); +} + +ssize_t write(int fd, const void *buf, size_t size) +{ + return syscall(__NR_write, fd, buf, size); +} diff --git a/tools/testing/selftests/arm64/bti/system.h b/tools/testing/selftests/arm64/bti/system.h new file mode 100644 index 000000000000..aca118589705 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/system.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin + */ + +#ifndef SYSTEM_H +#define SYSTEM_H + +#include +#include + +typedef __kernel_size_t size_t; +typedef __kernel_ssize_t ssize_t; + +#include +#include +#include +#include + +#include "compiler.h" + +long syscall(int nr, ...); + +void __noreturn exit(int n); +ssize_t write(int fd, const void *buf, size_t size); + +#endif /* ! SYSTEM_H */ diff --git a/tools/testing/selftests/arm64/bti/test.c b/tools/testing/selftests/arm64/bti/test.c new file mode 100644 index 000000000000..656b04976ccc --- /dev/null +++ b/tools/testing/selftests/arm64/bti/test.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019,2021 Arm Limited + * Original author: Dave Martin + */ + +#include "system.h" + +#include +#include +#include +#include +#include + +typedef struct ucontext ucontext_t; + +#include "btitest.h" +#include "compiler.h" +#include "signal.h" + +#define EXPECTED_TESTS 18 + +static volatile unsigned int test_num = 1; +static unsigned int test_passed; +static unsigned int test_failed; +static unsigned int test_skipped; + +static void fdputs(int fd, const char *str) +{ + size_t len = 0; + const char *p = str; + + while (*p++) + ++len; + + write(fd, str, len); +} + +static void putstr(const char *str) +{ + fdputs(1, str); +} + +static void putnum(unsigned int num) +{ + char c; + + if (num / 10) + putnum(num / 10); + + c = '0' + (num % 10); + write(1, &c, 1); +} + +#define puttestname(test_name, trampoline_name) do { \ + putstr(test_name); \ + putstr("/"); \ + putstr(trampoline_name); \ +} while (0) + +void print_summary(void) +{ + putstr("# Totals: pass:"); + putnum(test_passed); + putstr(" fail:"); + putnum(test_failed); + putstr(" xfail:0 xpass:0 skip:"); + putnum(test_skipped); + putstr(" error:0\n"); +} + +static const char *volatile current_test_name; +static const char *volatile current_trampoline_name; +static volatile int sigill_expected, sigill_received; + +static void handler(int n, siginfo_t *si __always_unused, + void *uc_ __always_unused) +{ + ucontext_t *uc = uc_; + + putstr("# \t[SIGILL in "); + puttestname(current_test_name, current_trampoline_name); + putstr(", BTYPE="); + write(1, &"00011011"[((uc->uc_mcontext.pstate & PSR_BTYPE_MASK) + >> PSR_BTYPE_SHIFT) * 2], 2); + if (!sigill_expected) { + putstr("]\n"); + putstr("not ok "); + putnum(test_num); + putstr(" "); + puttestname(current_test_name, current_trampoline_name); + putstr("(unexpected SIGILL)\n"); + print_summary(); + exit(128 + n); + } + + putstr(" (expected)]\n"); + sigill_received = 1; + /* zap BTYPE so that resuming the faulting code will work */ + uc->uc_mcontext.pstate &= ~PSR_BTYPE_MASK; +} + +static int skip_all; + +static void __do_test(void (*trampoline)(void (*)(void)), + void (*fn)(void), + const char *trampoline_name, + const char *name, + int expect_sigill) +{ + if (skip_all) { + test_skipped++; + putstr("ok "); + putnum(test_num); + putstr(" "); + puttestname(name, trampoline_name); + putstr(" # SKIP\n"); + + return; + } + + /* Branch Target exceptions should only happen in BTI binaries: */ + if (!BTI) + expect_sigill = 0; + + sigill_expected = expect_sigill; + sigill_received = 0; + current_test_name = name; + current_trampoline_name = trampoline_name; + + trampoline(fn); + + if (expect_sigill && !sigill_received) { + putstr("not ok "); + test_failed++; + } else { + putstr("ok "); + test_passed++; + } + putnum(test_num++); + putstr(" "); + puttestname(name, trampoline_name); + putstr("\n"); +} + +#define do_test(expect_sigill_br_x0, \ + expect_sigill_br_x16, \ + expect_sigill_blr, \ + name) \ +do { \ + __do_test(call_using_br_x0, name, "call_using_br_x0", #name, \ + expect_sigill_br_x0); \ + __do_test(call_using_br_x16, name, "call_using_br_x16", #name, \ + expect_sigill_br_x16); \ + __do_test(call_using_blr, name, "call_using_blr", #name, \ + expect_sigill_blr); \ +} while (0) + +void start(int *argcp) +{ + struct sigaction sa; + void *const *p; + const struct auxv_entry { + unsigned long type; + unsigned long val; + } *auxv; + unsigned long hwcap = 0, hwcap2 = 0; + + putstr("TAP version 13\n"); + putstr("1.."); + putnum(EXPECTED_TESTS); + putstr("\n"); + + /* Gross hack for finding AT_HWCAP2 from the initial process stack: */ + p = (void *const *)argcp + 1 + *argcp + 1; /* start of environment */ + /* step over environment */ + while (*p++) + ; + for (auxv = (const struct auxv_entry *)p; auxv->type != AT_NULL; ++auxv) { + switch (auxv->type) { + case AT_HWCAP: + hwcap = auxv->val; + break; + case AT_HWCAP2: + hwcap2 = auxv->val; + break; + default: + break; + } + } + + if (hwcap & HWCAP_PACA) + putstr("# HWCAP_PACA present\n"); + else + putstr("# HWCAP_PACA not present\n"); + + if (hwcap2 & HWCAP2_BTI) { + putstr("# HWCAP2_BTI present\n"); + if (!(hwcap & HWCAP_PACA)) + putstr("# Bad hardware? Expect problems.\n"); + } else { + putstr("# HWCAP2_BTI not present\n"); + skip_all = 1; + } + + putstr("# Test binary"); + if (!BTI) + putstr(" not"); + putstr(" built for BTI\n"); + + sa.sa_handler = (sighandler_t)(void *)handler; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + sigaction(SIGILL, &sa, NULL); + sigaddset(&sa.sa_mask, SIGILL); + sigprocmask(SIG_UNBLOCK, &sa.sa_mask, NULL); + + do_test(1, 1, 1, nohint_func); + do_test(1, 1, 1, bti_none_func); + do_test(1, 0, 0, bti_c_func); + do_test(0, 0, 1, bti_j_func); + do_test(0, 0, 0, bti_jc_func); + do_test(1, 0, 0, paciasp_func); + + print_summary(); + + if (test_num - 1 != EXPECTED_TESTS) + putstr("# WARNING - EXPECTED TEST COUNT WRONG\n"); + + if (test_failed) + exit(1); + else + exit(0); +} diff --git a/tools/testing/selftests/arm64/bti/teststubs.S b/tools/testing/selftests/arm64/bti/teststubs.S new file mode 100644 index 000000000000..b62c8c35f67e --- /dev/null +++ b/tools/testing/selftests/arm64/bti/teststubs.S @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin + */ + +#include "assembler.h" + +startfn bti_none_func + bti + ret +endfn + +startfn bti_c_func + bti c + ret +endfn + +startfn bti_j_func + bti j + ret +endfn + +startfn bti_jc_func + bti jc + ret +endfn + +startfn paciasp_func + paciasp + autiasp + ret +endfn + +startfn nohint_func + ret +endfn + +emit_aarch64_feature_1_and diff --git a/tools/testing/selftests/arm64/bti/trampoline.S b/tools/testing/selftests/arm64/bti/trampoline.S new file mode 100644 index 000000000000..09beb3f361f1 --- /dev/null +++ b/tools/testing/selftests/arm64/bti/trampoline.S @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Arm Limited + * Original author: Dave Martin + */ + +#include "assembler.h" + +startfn call_using_br_x0 + bti c + br x0 +endfn + +startfn call_using_br_x16 + bti c + mov x16, x0 + br x16 +endfn + +startfn call_using_blr + paciasp + stp x29, x30, [sp, #-16]! + blr x0 + ldp x29, x30, [sp], #16 + autiasp + ret +endfn + +emit_aarch64_feature_1_and -- cgit v1.2.3 From 79d4071ea4c49260286cf75dc669a0ed354d1c4e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:45 +0100 Subject: selftests: netfilter: flowtable bridge and vlan support This patch adds two new tests to cover bridge and vlan support: - Add a bridge device to the Router1 (nsr1) container and attach the veth0 device to the bridge. Set the IP address to the bridge device to exercise the bridge forwarding path. - Add vlan encapsulation between to the bridge device in the Router1 and one of the sender containers (ns1). Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- tools/testing/selftests/netfilter/nft_flowtable.sh | 82 ++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 431296c0f91c..427d94816f2d 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -371,6 +371,88 @@ else ip netns exec nsr1 nft list ruleset fi +# Another test: +# Add bridge interface br0 to Router1, with NAT enabled. +ip -net nsr1 link add name br0 type bridge +ip -net nsr1 addr flush dev veth0 +ip -net nsr1 link set up dev veth0 +ip -net nsr1 link set veth0 master br0 +ip -net nsr1 addr add 10.0.1.1/24 dev br0 +ip -net nsr1 addr add dead:1::1/64 dev br0 +ip -net nsr1 link set up dev br0 + +ip netns exec nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null + +# br0 with NAT enabled. +ip netns exec nsr1 nft -f - <&2 + ip netns exec nsr1 nft list ruleset + ret=1 +fi + +# Another test: +# Add bridge interface br0 to Router1, with NAT and VLAN. +ip -net nsr1 link set veth0 nomaster +ip -net nsr1 link set down dev veth0 +ip -net nsr1 link add link veth0 name veth0.10 type vlan id 10 +ip -net nsr1 link set up dev veth0 +ip -net nsr1 link set up dev veth0.10 +ip -net nsr1 link set veth0.10 master br0 + +ip -net ns1 addr flush dev eth0 +ip -net ns1 link add link eth0 name eth0.10 type vlan id 10 +ip -net ns1 link set eth0 up +ip -net ns1 link set eth0.10 up +ip -net ns1 addr add 10.0.1.99/24 dev eth0.10 +ip -net ns1 route add default via 10.0.1.1 +ip -net ns1 addr add dead:1::99/64 dev eth0.10 + +if test_tcp_forwarding_nat ns1 ns2; then + echo "PASS: flow offloaded for ns1/ns2 with bridge NAT and VLAN" +else + echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 + ip netns exec nsr1 nft list ruleset + ret=1 +fi + +# restore test topology (remove bridge and VLAN) +ip -net nsr1 link set veth0 nomaster +ip -net nsr1 link set veth0 down +ip -net nsr1 link set veth0.10 down +ip -net nsr1 link delete veth0.10 type vlan +ip -net nsr1 link delete br0 type bridge +ip -net ns1 addr flush dev eth0.10 +ip -net ns1 link set eth0.10 down +ip -net ns1 link set eth0 down +ip -net ns1 link delete eth0.10 type vlan + +# restore address in ns1 and nsr1 +ip -net ns1 link set eth0 up +ip -net ns1 addr add 10.0.1.99/24 dev eth0 +ip -net ns1 route add default via 10.0.1.1 +ip -net ns1 addr add dead:1::99/64 dev eth0 +ip -net ns1 route add default via dead:1::1 +ip -net nsr1 addr add 10.0.1.1/24 dev veth0 +ip -net nsr1 addr add dead:1::1/64 dev veth0 +ip -net nsr1 link set up dev veth0 + KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1) KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1) SPI1=$RANDOM -- cgit v1.2.3 From 861584724c44e63bfb684090c70ade660dae6c69 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 24 Mar 2021 22:14:23 +0200 Subject: selftests: mlxsw: Test unresolved neigh trap with resilient nexthop groups The number of nexthop buckets in a resilient nexthop group never changes, so when the gateway address of a nexthop cannot be resolved, the nexthop buckets are programmed to trap packets to the CPU in order to trigger resolution. For example: # ip nexthop add id 1 via 198.51.100.1 dev swp3 # ip nexthop add id 10 group 1 type resilient buckets 32 # ip nexthop bucket get id 10 index 0 id 10 index 0 idle_time 1.44 nhid 1 trap Where 198.51.100.1 is a made-up IP. Test that in this case packets are indeed trapped to the CPU via the unresolved neigh trap. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- .../net/mlxsw/devlink_trap_l3_exceptions.sh | 31 ++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh index 1fedfc9da434..42d44e27802c 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh @@ -446,6 +446,35 @@ __invalid_nexthop_test() log_test "Unresolved neigh: nexthop does not exist: $desc" } +__invalid_nexthop_bucket_test() +{ + local desc=$1; shift + local dip=$1; shift + local via_add=$1; shift + local trap_name="unresolved_neigh" + + RET=0 + + # Check that route to nexthop that does not exist triggers + # unresolved_neigh + ip nexthop add id 1 via $via_add dev $rp2 + ip nexthop add id 10 group 1 type resilient buckets 32 + ip route add $dip nhid 10 + + t0_packets=$(devlink_trap_rx_packets_get $trap_name) + ping_do $h1 $dip + t1_packets=$(devlink_trap_rx_packets_get $trap_name) + + if [[ $t0_packets -eq $t1_packets ]]; then + check_err 1 "Trap counter did not increase" + fi + + ip route del $dip nhid 10 + ip nexthop del id 10 + ip nexthop del id 1 + log_test "Unresolved neigh: nexthop bucket does not exist: $desc" +} + unresolved_neigh_test() { __host_miss_test "IPv4" 198.51.100.1 @@ -453,6 +482,8 @@ unresolved_neigh_test() __invalid_nexthop_test "IPv4" 198.51.100.1 198.51.100.3 24 198.51.100.4 __invalid_nexthop_test "IPv6" 2001:db8:2::1 2001:db8:2::3 64 \ 2001:db8:2::4 + __invalid_nexthop_bucket_test "IPv4" 198.51.100.1 198.51.100.4 + __invalid_nexthop_bucket_test "IPv6" 2001:db8:2::1 2001:db8:2::4 } vrf_without_routes_create() -- cgit v1.2.3 From ffd3e9b07b9ee3242fa5f5bc76385b6a0e69437d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 24 Mar 2021 22:14:24 +0200 Subject: selftests: mlxsw: Add resilient nexthop groups configuration tests Test that unsupported resilient nexthop group configurations are rejected and that offload / trap indication is correctly set on nexthop buckets in a resilient group. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- .../selftests/drivers/net/mlxsw/rtnetlink.sh | 82 ++++++++++++++++++++++ tools/testing/selftests/net/forwarding/lib.sh | 5 ++ 2 files changed, 87 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh index ed346da5d3cb..a217f9f6775b 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh @@ -33,6 +33,7 @@ ALL_TESTS=" nexthop_obj_invalid_test nexthop_obj_offload_test nexthop_obj_group_offload_test + nexthop_obj_bucket_offload_test nexthop_obj_blackhole_offload_test nexthop_obj_route_offload_test devlink_reload_test @@ -739,11 +740,28 @@ nexthop_obj_invalid_test() ip nexthop add id 1 dev $swp1 ip nexthop add id 2 dev $swp1 + ip nexthop add id 3 via 192.0.2.3 dev $swp1 ip nexthop add id 10 group 1/2 check_fail $? "managed to configure a nexthop group with device-only nexthops when should not" + ip nexthop add id 10 group 3 type resilient buckets 7 + check_fail $? "managed to configure a too small resilient nexthop group when should not" + + ip nexthop add id 10 group 3 type resilient buckets 129 + check_fail $? "managed to configure a resilient nexthop group with invalid number of buckets when should not" + + ip nexthop add id 10 group 1/2 type resilient buckets 32 + check_fail $? "managed to configure a resilient nexthop group with device-only nexthops when should not" + + ip nexthop add id 10 group 3 type resilient buckets 32 + check_err $? "failed to configure a valid resilient nexthop group" + ip nexthop replace id 3 dev $swp1 + check_fail $? "managed to populate a nexthop bucket with a device-only nexthop when should not" + log_test "nexthop objects - invalid configurations" + ip nexthop del id 10 + ip nexthop del id 3 ip nexthop del id 2 ip nexthop del id 1 @@ -858,6 +876,70 @@ nexthop_obj_group_offload_test() simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64 } +nexthop_obj_bucket_offload_test() +{ + # Test offload indication of nexthop buckets + RET=0 + + simple_if_init $swp1 192.0.2.1/24 2001:db8:1::1/64 + simple_if_init $swp2 + setup_wait + + ip nexthop add id 1 via 192.0.2.2 dev $swp1 + ip nexthop add id 2 via 2001:db8:1::2 dev $swp1 + ip nexthop add id 10 group 1/2 type resilient buckets 32 idle_timer 0 + ip neigh replace 192.0.2.2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + ip neigh replace 192.0.2.3 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + ip neigh replace 2001:db8:1::2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 1 + check_err $? "IPv4 nexthop buckets not marked as offloaded when should" + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 2 + check_err $? "IPv6 nexthop buckets not marked as offloaded when should" + + # Invalidate nexthop id 1 + ip neigh replace 192.0.2.2 nud failed dev $swp1 + busywait "$TIMEOUT" wait_for_trap \ + ip nexthop bucket show nhid 1 + check_err $? "IPv4 nexthop buckets not marked with trap when should" + + # Invalidate nexthop id 2 + ip neigh replace 2001:db8:1::2 nud failed dev $swp1 + busywait "$TIMEOUT" wait_for_trap \ + ip nexthop bucket show nhid 2 + check_err $? "IPv6 nexthop buckets not marked with trap when should" + + # Revalidate nexthop id 1 by changing its configuration + ip nexthop replace id 1 via 192.0.2.3 dev $swp1 + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 1 + check_err $? "nexthop bucket not marked as offloaded after revalidating nexthop" + + # Revalidate nexthop id 2 by changing its neighbour + ip neigh replace 2001:db8:1::2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 2 + check_err $? "nexthop bucket not marked as offloaded after revalidating neighbour" + + log_test "nexthop bucket offload indication" + + ip neigh del 2001:db8:1::2 dev $swp1 + ip neigh del 192.0.2.3 dev $swp1 + ip neigh del 192.0.2.2 dev $swp1 + ip nexthop del id 10 + ip nexthop del id 2 + ip nexthop del id 1 + + simple_if_fini $swp2 + simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64 +} + nexthop_obj_blackhole_offload_test() { # Test offload indication of blackhole nexthop objects diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index be71012b8fc5..05c05e02bade 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -353,6 +353,11 @@ wait_for_offload() "$@" | grep -q offload } +wait_for_trap() +{ + "$@" | grep -q trap +} + until_counter_is() { local expr=$1; shift -- cgit v1.2.3 From 405e07010d375d2123ec9d2e22197490eb698f74 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 25 Mar 2021 12:39:34 +0800 Subject: perf tools: Remove duplicate struct forward declarations 'struct evlist' has been declared at 10th line. 'struct comm' has been declared at 15th line. Remove the duplicates Signed-off-by: Wan Jiabing Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: kael_w@yeah.net Link: http://lore.kernel.org/lkml/20210325043947.846093-1-wanjiabing@vivo.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.h | 1 - tools/perf/util/thread-stack.h | 1 - 2 files changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h index ed1b9392e624..026bbf416c48 100644 --- a/tools/perf/util/metricgroup.h +++ b/tools/perf/util/metricgroup.h @@ -9,7 +9,6 @@ struct evlist; struct evsel; -struct evlist; struct option; struct rblist; struct pmu_events_map; diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h index 3bc47a42af8e..b3cd09beb62f 100644 --- a/tools/perf/util/thread-stack.h +++ b/tools/perf/util/thread-stack.h @@ -16,7 +16,6 @@ struct comm; struct ip_callchain; struct symbol; struct dso; -struct comm; struct perf_sample; struct addr_location; struct call_path; -- cgit v1.2.3 From 463a7d5a9e6fd3f3b592e09c936d2d07ee0b65b9 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 23 Mar 2021 13:01:39 +0800 Subject: perf daemon: Remove duplicate includes sys/stat.h has been included at line 23, so remove the duplicate one at line 27. linux/string.h has been included at line 7, so remove the duplicate one at line 9. time.h has been included at line 14, so remove the duplicate one at line 28. Signed-off-by: Wan Jiabing Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: kael_w@yeah.net Link: http://lore.kernel.org/lkml/20210323050139.287461-1-wanjiabing@vivo.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-daemon.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index ace8772a4f03..632ecd010a4f 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -24,8 +23,6 @@ #include #include #include -#include -#include #include "builtin.h" #include "perf.h" #include "debug.h" -- cgit v1.2.3 From e1d392dc8875556d8a742d4be38ab452516428eb Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 19 Mar 2021 14:53:01 +0100 Subject: iio: event_monitor: Enable events before monitoring After some painful sessions with a driver that register an enable/disable sysfs knob (gp2ap002) and manually going in and enabling the event before monitoring it: cd /sys/bus/iio/devices/iio\:device2/events # ls in_proximity_thresh_either_en # echo 1 > in_proximity_thresh_either_en I realized that it's better if the iio_event_monitor is smart enough to enable all events by itself and disable them after use, if passed the -a flag familiar from the iio_generic_buffer tool. Auto-enabling events depend on the hardware being able to handle all events at the same time which isn't necessarily the case, so a command line option is required for this. Cc: Bastien Nocera Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20210319135301.542911-1-linus.walleij@linaro.org Signed-off-by: Jonathan Cameron --- tools/iio/iio_event_monitor.c | 69 +++++++++++++++++++++++++++++++++++++++---- tools/iio/iio_utils.h | 1 + 2 files changed, 65 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index bb03859db89d..0076437f6e3f 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -280,22 +281,69 @@ static void print_event(struct iio_event_data *event) printf("\n"); } +/* Enable or disable events in sysfs if the knob is available */ +static void enable_events(char *dev_dir, int enable) +{ + const struct dirent *ent; + char evdir[256]; + int ret; + DIR *dp; + + snprintf(evdir, sizeof(evdir), FORMAT_EVENTS_DIR, dev_dir); + evdir[sizeof(evdir)-1] = '\0'; + + dp = opendir(evdir); + if (!dp) { + fprintf(stderr, "Enabling/disabling events: can't open %s\n", + evdir); + return; + } + + while (ent = readdir(dp), ent) { + if (iioutils_check_suffix(ent->d_name, "_en")) { + printf("%sabling: %s\n", + enable ? "En" : "Dis", + ent->d_name); + ret = write_sysfs_int(ent->d_name, evdir, + enable); + if (ret < 0) + fprintf(stderr, "Failed to enable/disable %s\n", + ent->d_name); + } + } + + if (closedir(dp) == -1) { + perror("Enabling/disabling channels: " + "Failed to close directory"); + return; + } +} + int main(int argc, char **argv) { struct iio_event_data event; const char *device_name; + char *dev_dir_name = NULL; char *chrdev_name; int ret; int dev_num; int fd, event_fd; - - if (argc <= 1) { - fprintf(stderr, "Usage: %s \n", argv[0]); + bool all_events = false; + + if (argc == 2) { + device_name = argv[1]; + } else if (argc == 3) { + device_name = argv[2]; + if (!strcmp(argv[1], "-a")) + all_events = true; + } else { + fprintf(stderr, + "Usage: iio_event_monitor [options] \n" + "Listen and display events from IIO devices\n" + " -a Auto-activate all available events\n"); return -1; } - device_name = argv[1]; - dev_num = find_type_by_name(device_name, "iio:device"); if (dev_num >= 0) { printf("Found IIO device with name %s with device number %d\n", @@ -303,6 +351,10 @@ int main(int argc, char **argv) ret = asprintf(&chrdev_name, "/dev/iio:device%d", dev_num); if (ret < 0) return -ENOMEM; + /* Look up sysfs dir as well if we can */ + ret = asprintf(&dev_dir_name, "%siio:device%d", iio_dir, dev_num); + if (ret < 0) + return -ENOMEM; } else { /* * If we can't find an IIO device by name assume device_name is @@ -313,6 +365,9 @@ int main(int argc, char **argv) return -ENOMEM; } + if (all_events && dev_dir_name) + enable_events(dev_dir_name, 1); + fd = open(chrdev_name, 0); if (fd == -1) { ret = -errno; @@ -365,6 +420,10 @@ int main(int argc, char **argv) perror("Failed to close event file"); error_free_chrdev_name: + /* Disable events after use */ + if (all_events && dev_dir_name) + enable_events(dev_dir_name, 0); + free(chrdev_name); return ret; diff --git a/tools/iio/iio_utils.h b/tools/iio/iio_utils.h index 336752cade4f..663c94a6c705 100644 --- a/tools/iio/iio_utils.h +++ b/tools/iio/iio_utils.h @@ -13,6 +13,7 @@ #define IIO_MAX_NAME_LENGTH 64 #define FORMAT_SCAN_ELEMENTS_DIR "%s/buffer%d" +#define FORMAT_EVENTS_DIR "%s/events" #define FORMAT_TYPE_FILE "%s_type" #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) -- cgit v1.2.3 From 800c120ef4e3abb45d67a36d3e5532eb051efc3c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Mar 2021 13:13:33 -0700 Subject: tools/turbostat: Unmark non-kernel-doc comment Do not mark a comment as kernel-doc notation when it is not meant to be in kernel-doc notation. Signed-off-by: Randy Dunlap Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210325201333.16792-1-rdunlap@infradead.org --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a7c4f0772e53..5939615265f1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2449,7 +2449,7 @@ dump_knl_turbo_ratio_limits(void) fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); - /** + /* * Turbo encoding in KNL is as follows: * [0] -- Reserved * [7:1] -- Base value of number of active cores of bucket 1. -- cgit v1.2.3 From a46410d5e4975d701d526397156fa0815747dc2f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 24 Mar 2021 10:29:41 -0700 Subject: libbpf: Constify few bpf_program getters bpf_program__get_type() and bpf_program__get_expected_attach_type() shouldn't modify given bpf_program, so mark input parameter as const struct bpf_program. This eliminates unnecessary compilation warnings or explicit casts in user programs. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210324172941.2609884-1-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 4 ++-- tools/lib/bpf/libbpf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c0fd2c718a7d..10a0a67699f1 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8458,7 +8458,7 @@ int bpf_program__nth_fd(const struct bpf_program *prog, int n) return fd; } -enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog) +enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog) { return prog->type; } @@ -8503,7 +8503,7 @@ BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT); BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP); enum bpf_attach_type -bpf_program__get_expected_attach_type(struct bpf_program *prog) +bpf_program__get_expected_attach_type(const struct bpf_program *prog) { return prog->expected_attach_type; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index a1a424b9b8ff..89ade7d7b31c 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -361,12 +361,12 @@ LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog); LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog); LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog); -LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog); +LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); LIBBPF_API void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type); LIBBPF_API enum bpf_attach_type -bpf_program__get_expected_attach_type(struct bpf_program *prog); +bpf_program__get_expected_attach_type(const struct bpf_program *prog); LIBBPF_API void bpf_program__set_expected_attach_type(struct bpf_program *prog, enum bpf_attach_type type); -- cgit v1.2.3 From cff908463d91a6b2fb8c8ab6c41d9c308c29fd42 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Mon, 22 Mar 2021 17:07:20 +0000 Subject: selftests/bpf: Better error messages for ima_setup.sh failures The current implementation uses the CHECK_FAIL macro which does not provide useful error messages when the script fails. Use the CHECK macro instead and provide more descriptive messages to aid debugging. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210322170720.2926715-1-kpsingh@kernel.org --- tools/testing/selftests/bpf/prog_tests/test_ima.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_ima.c b/tools/testing/selftests/bpf/prog_tests/test_ima.c index b54bc0c351b7..0252f61d611a 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_ima.c +++ b/tools/testing/selftests/bpf/prog_tests/test_ima.c @@ -68,7 +68,8 @@ void test_test_ima(void) goto close_prog; snprintf(cmd, sizeof(cmd), "./ima_setup.sh setup %s", measured_dir); - if (CHECK_FAIL(system(cmd))) + err = system(cmd); + if (CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno)) goto close_clean; err = run_measured_process(measured_dir, &skel->bss->monitored_pid); @@ -81,7 +82,8 @@ void test_test_ima(void) close_clean: snprintf(cmd, sizeof(cmd), "./ima_setup.sh cleanup %s", measured_dir); - CHECK_FAIL(system(cmd)); + err = system(cmd); + CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno); close_prog: ima__destroy(skel); } -- cgit v1.2.3 From e9bd8cbd970bedd3cca8ee334c76ab90feb78760 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Mon, 22 Mar 2021 23:50:54 -0300 Subject: bpf: selftests: Add tests for batched ops in LPM trie maps Uses the already existing infrastructure for testing batched ops. The testing code is essentially the same, with minor tweaks for this use case. Suggested-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210323025058.315763-3-pctammela@gmail.com --- .../bpf/map_tests/lpm_trie_map_batch_ops.c | 158 +++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c new file mode 100644 index 000000000000..2e986e5e4cac --- /dev/null +++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +struct test_lpm_key { + __u32 prefix; + struct in_addr ipv4; +}; + +static void map_batch_update(int map_fd, __u32 max_entries, + struct test_lpm_key *keys, int *values) +{ + __u32 i; + int err; + char buff[16] = { 0 }; + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + for (i = 0; i < max_entries; i++) { + keys[i].prefix = 32; + snprintf(buff, 16, "192.168.1.%d", i + 1); + inet_pton(AF_INET, buff, &keys[i].ipv4); + values[i] = i + 1; + } + + err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts); + CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno)); +} + +static void map_batch_verify(int *visited, __u32 max_entries, + struct test_lpm_key *keys, int *values) +{ + char buff[16] = { 0 }; + int lower_byte = 0; + __u32 i; + + memset(visited, 0, max_entries * sizeof(*visited)); + for (i = 0; i < max_entries; i++) { + inet_ntop(AF_INET, &keys[i].ipv4, buff, 32); + CHECK(sscanf(buff, "192.168.1.%d", &lower_byte) == EOF, + "sscanf()", "error: i %d\n", i); + CHECK(lower_byte != values[i], "key/value checking", + "error: i %d key %s value %d\n", i, buff, values[i]); + visited[i] = 1; + } + for (i = 0; i < max_entries; i++) { + CHECK(visited[i] != 1, "visited checking", + "error: keys array at index %d missing\n", i); + } +} + +void test_lpm_trie_map_batch_ops(void) +{ + struct bpf_create_map_attr xattr = { + .name = "lpm_trie_map", + .map_type = BPF_MAP_TYPE_LPM_TRIE, + .key_size = sizeof(struct test_lpm_key), + .value_size = sizeof(int), + .map_flags = BPF_F_NO_PREALLOC, + }; + struct test_lpm_key *keys, key; + int map_fd, *values, *visited; + __u32 step, count, total, total_success; + const __u32 max_entries = 10; + __u64 batch = 0; + int err; + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + xattr.max_entries = max_entries; + map_fd = bpf_create_map_xattr(&xattr); + CHECK(map_fd == -1, "bpf_create_map_xattr()", "error:%s\n", + strerror(errno)); + + keys = malloc(max_entries * sizeof(struct test_lpm_key)); + values = malloc(max_entries * sizeof(int)); + visited = malloc(max_entries * sizeof(int)); + CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", + strerror(errno)); + + total_success = 0; + for (step = 1; step < max_entries; step++) { + map_batch_update(map_fd, max_entries, keys, values); + map_batch_verify(visited, max_entries, keys, values); + memset(keys, 0, max_entries * sizeof(*keys)); + memset(values, 0, max_entries * sizeof(*values)); + batch = 0; + total = 0; + /* iteratively lookup/delete elements with 'step' + * elements each. + */ + count = step; + while (true) { + err = bpf_map_lookup_batch(map_fd, + total ? &batch : NULL, &batch, + keys + total, values + total, &count, &opts); + + CHECK((err && errno != ENOENT), "lookup with steps", + "error: %s\n", strerror(errno)); + + total += count; + if (err) + break; + } + + CHECK(total != max_entries, "lookup with steps", + "total = %u, max_entries = %u\n", total, max_entries); + + map_batch_verify(visited, max_entries, keys, values); + + total = 0; + count = step; + while (total < max_entries) { + if (max_entries - total < step) + count = max_entries - total; + err = bpf_map_delete_batch(map_fd, keys + total, &count, + &opts); + CHECK((err && errno != ENOENT), "delete batch", + "error: %s\n", strerror(errno)); + total += count; + if (err) + break; + } + CHECK(total != max_entries, "delete with steps", + "total = %u, max_entries = %u\n", total, max_entries); + + /* check map is empty, errono == ENOENT */ + err = bpf_map_get_next_key(map_fd, NULL, &key); + CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()", + "error: %s\n", strerror(errno)); + + total_success++; + } + + CHECK(total_success == 0, "check total_success", + "unexpected failure\n"); + + printf("%s:PASS\n", __func__); + + free(keys); + free(values); + free(visited); +} -- cgit v1.2.3 From 155f556d64b1a48710f01305e14bb860734ed1e3 Mon Sep 17 00:00:00 2001 From: Rafael David Tinoco Date: Tue, 23 Mar 2021 01:09:52 -0300 Subject: libbpf: Add bpf object kern_version attribute setter Unfortunately some distros don't have their kernel version defined accurately in due to different long term support reasons. It is important to have a way to override the bpf kern_version attribute during runtime: some old kernels might still check for kern_version attribute during bpf_prog_load(). Signed-off-by: Rafael David Tinoco Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210323040952.2118241-1-rafaeldtinoco@ubuntu.com --- tools/lib/bpf/libbpf.c | 10 ++++++++++ tools/lib/bpf/libbpf.h | 1 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 10a0a67699f1..cebb0e852cf8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8270,6 +8270,16 @@ int bpf_object__btf_fd(const struct bpf_object *obj) return obj->btf ? btf__fd(obj->btf) : -1; } +int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) +{ + if (obj->loaded) + return -EINVAL; + + obj->kern_version = kern_version; + + return 0; +} + int bpf_object__set_priv(struct bpf_object *obj, void *priv, bpf_object_clear_priv_t clear_priv) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 89ade7d7b31c..f500621d28e5 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -143,6 +143,7 @@ LIBBPF_API int bpf_object__unload(struct bpf_object *obj); LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); +LIBBPF_API int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version); struct btf; LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 279ae861f568..f5990f7208ce 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -359,4 +359,5 @@ LIBBPF_0.4.0 { bpf_linker__finalize; bpf_linker__free; bpf_linker__new; + bpf_object__set_kversion; } LIBBPF_0.3.0; -- cgit v1.2.3 From 0a606822c4863b2398925e6ff3329d64c4c52bb8 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Mon, 22 Mar 2021 10:57:24 -0400 Subject: perf sort: Add dynamic headers for perf report columns Currently the header string for different columns in perf report is fixed. Some fields of perf sample could have different meaning for different architectures than the meaning conveyed by the header string. An example is the new field 'var2_w' of perf_sample_weight structure. This is presently captured as 'Local INSTR Latency' in perf mem report. But this could be used to denote a different latency cycle in another architecture. Introduce a weak function arch_perf_header_entry() to set the arch specific header string for the fields which can contain dynamic header. If the architecture do not have this function, fall back to the default header string value. Signed-off-by: Athira Rajeev Reviewed-by: Madhavan Srinivasan Acked-by: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/1616425047-1666-3-git-send-email-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.h | 1 + tools/perf/util/sort.c | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index f603edbbbc6f..6106a9c134c9 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -427,5 +427,6 @@ char *get_page_size_name(u64 size, char *str); void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type); void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type); +const char *arch_perf_header_entry(const char *se_header); #endif /* __PERF_RECORD_H */ diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 552b590485bf..eeb03e749181 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -25,6 +25,7 @@ #include #include "mem-events.h" #include "annotate.h" +#include "event.h" #include "time-utils.h" #include "cgroup.h" #include "machine.h" @@ -45,6 +46,7 @@ const char *field_order; regex_t ignore_callees_regex; int have_ignore_callees = 0; enum sort_mode sort__mode = SORT_MODE__NORMAL; +const char *dynamic_headers[] = {"local_ins_lat"}; /* * Replaces all occurrences of a char used with the: @@ -1816,6 +1818,16 @@ struct sort_dimension { int taken; }; +const char * __weak arch_perf_header_entry(const char *se_header) +{ + return se_header; +} + +static void sort_dimension_add_dynamic_header(struct sort_dimension *sd) +{ + sd->entry->se_header = arch_perf_header_entry(sd->entry->se_header); +} + #define DIM(d, n, func) [d] = { .name = n, .entry = &(func) } static struct sort_dimension common_sort_dimensions[] = { @@ -2739,7 +2751,7 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok, struct evlist *evlist, int level) { - unsigned int i; + unsigned int i, j; for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) { struct sort_dimension *sd = &common_sort_dimensions[i]; @@ -2747,6 +2759,11 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok, if (strncasecmp(tok, sd->name, strlen(tok))) continue; + for (j = 0; j < ARRAY_SIZE(dynamic_headers); j++) { + if (!strcmp(dynamic_headers[j], sd->name)) + sort_dimension_add_dynamic_header(sd); + } + if (sd->entry == &sort_parent) { int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED); if (ret) { -- cgit v1.2.3 From ff0bd0a33f257cc01c0b777c1423205e49049777 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Mon, 22 Mar 2021 10:57:25 -0400 Subject: perf powerpc: Add support for PERF_SAMPLE_WEIGHT_STRUCT Add arch specific arch_evsel__set_sample_weight() to set the new sample type for powerpc. Add arch specific arch_perf_parse_sample_weight() to store the sample->weight values depending on the sample type applied. if the new sample type (PERF_SAMPLE_WEIGHT_STRUCT) is applied, store only the lower 32 bits to sample->weight. If sample type is 'PERF_SAMPLE_WEIGHT', store the full 64-bit to sample->weight. Signed-off-by: Athira Rajeev Reviewed-by: Madhavan Srinivasan Acked-by: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/1616425047-1666-4-git-send-email-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/util/Build | 2 ++ tools/perf/arch/powerpc/util/event.c | 32 ++++++++++++++++++++++++++++++++ tools/perf/arch/powerpc/util/evsel.c | 8 ++++++++ 3 files changed, 42 insertions(+) create mode 100644 tools/perf/arch/powerpc/util/event.c create mode 100644 tools/perf/arch/powerpc/util/evsel.c (limited to 'tools') diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build index b7945e5a543b..8a79c4126e5b 100644 --- a/tools/perf/arch/powerpc/util/Build +++ b/tools/perf/arch/powerpc/util/Build @@ -4,6 +4,8 @@ perf-y += kvm-stat.o perf-y += perf_regs.o perf-y += mem-events.o perf-y += sym-handling.o +perf-y += evsel.o +perf-y += event.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_DWARF) += skip-callchain-idx.o diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c new file mode 100644 index 000000000000..f49d32c2c8ae --- /dev/null +++ b/tools/perf/arch/powerpc/util/event.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "../../../util/event.h" +#include "../../../util/synthetic-events.h" +#include "../../../util/machine.h" +#include "../../../util/tool.h" +#include "../../../util/map.h" +#include "../../../util/debug.h" + +void arch_perf_parse_sample_weight(struct perf_sample *data, + const __u64 *array, u64 type) +{ + union perf_sample_weight weight; + + weight.full = *array; + if (type & PERF_SAMPLE_WEIGHT) + data->weight = weight.full; + else + data->weight = weight.var1_dw; +} + +void arch_perf_synthesize_sample_weight(const struct perf_sample *data, + __u64 *array, u64 type) +{ + *array = data->weight; + + if (type & PERF_SAMPLE_WEIGHT_STRUCT) + *array &= 0xffffffff; +} diff --git a/tools/perf/arch/powerpc/util/evsel.c b/tools/perf/arch/powerpc/util/evsel.c new file mode 100644 index 000000000000..2f733cdc8dbb --- /dev/null +++ b/tools/perf/arch/powerpc/util/evsel.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "util/evsel.h" + +void arch_evsel__set_sample_weight(struct evsel *evsel) +{ + evsel__set_sample_bit(evsel, WEIGHT_STRUCT); +} -- cgit v1.2.3 From 06e5ca746c07380dfe0e4c3e10c34a6daa69eae6 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Mon, 22 Mar 2021 10:57:26 -0400 Subject: perf tools: Support pipeline stage cycles for powerpc The pipeline stage cycles details can be recorded on powerpc from the contents of Performance Monitor Unit (PMU) registers. On ISA v3.1 platform, sampling registers exposes the cycles spent in different pipeline stages. Patch adds perf tools support to present two of the cycle counter information along with memory latency (weight). Re-use the field 'ins_lat' for storing the first pipeline stage cycle. This is stored in 'var2_w' field of 'perf_sample_weight'. Add a new field 'p_stage_cyc' to store the second pipeline stage cycle which is stored in 'var3_w' field of perf_sample_weight. Add new sort function 'Pipeline Stage Cycle' and include this in default_mem_sort_order[]. This new sort function may be used to denote some other pipeline stage in another architecture. So add this to list of sort entries that can have dynamic header string. Signed-off-by: Athira Rajeev Reviewed-by: Madhavan Srinivasan Acked-by: Jiri Olsa Cc: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/1616425047-1666-5-git-send-email-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 2 ++ tools/perf/arch/powerpc/util/event.c | 18 ++++++++++++++++-- tools/perf/util/event.h | 1 + tools/perf/util/hist.c | 11 ++++++++--- tools/perf/util/hist.h | 1 + tools/perf/util/session.c | 4 +++- tools/perf/util/sort.c | 24 ++++++++++++++++++++++-- tools/perf/util/sort.h | 2 ++ 8 files changed, 55 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 822b16f05c15..f51f0000676e 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -112,6 +112,8 @@ OPTIONS - ins_lat: Instruction latency in core cycles. This is the global instruction latency - local_ins_lat: Local instruction latency version + - p_stage_cyc: On powerpc, this presents the number of cycles spent in a + pipeline stage. And currently supported only on powerpc. By default, comm, dso and symbol keys are used. (i.e. --sort comm,dso,symbol) diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c index f49d32c2c8ae..22521bc9481a 100644 --- a/tools/perf/arch/powerpc/util/event.c +++ b/tools/perf/arch/powerpc/util/event.c @@ -18,8 +18,11 @@ void arch_perf_parse_sample_weight(struct perf_sample *data, weight.full = *array; if (type & PERF_SAMPLE_WEIGHT) data->weight = weight.full; - else + else { data->weight = weight.var1_dw; + data->ins_lat = weight.var2_w; + data->p_stage_cyc = weight.var3_w; + } } void arch_perf_synthesize_sample_weight(const struct perf_sample *data, @@ -27,6 +30,17 @@ void arch_perf_synthesize_sample_weight(const struct perf_sample *data, { *array = data->weight; - if (type & PERF_SAMPLE_WEIGHT_STRUCT) + if (type & PERF_SAMPLE_WEIGHT_STRUCT) { *array &= 0xffffffff; + *array |= ((u64)data->ins_lat << 32); + } +} + +const char *arch_perf_header_entry(const char *se_header) +{ + if (!strcmp(se_header, "Local INSTR Latency")) + return "Finish Cyc"; + else if (!strcmp(se_header, "Pipeline Stage Cycle")) + return "Dispatch Cyc"; + return se_header; } diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 6106a9c134c9..e5da4a695ff2 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -147,6 +147,7 @@ struct perf_sample { u8 cpumode; u16 misc; u16 ins_lat; + u16 p_stage_cyc; bool no_hw_idx; /* No hw_idx collected in branch_stack */ char insn[MAX_INSN]; void *raw_data; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index c82f5fc26af8..9299ee535518 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -211,6 +211,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10); hists__new_col_len(hists, HISTC_LOCAL_INS_LAT, 13); hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13); + hists__new_col_len(hists, HISTC_P_STAGE_CYC, 13); if (symbol_conf.nanosecs) hists__new_col_len(hists, HISTC_TIME, 16); else @@ -289,13 +290,14 @@ static long hist_time(unsigned long htime) } static void he_stat__add_period(struct he_stat *he_stat, u64 period, - u64 weight, u64 ins_lat) + u64 weight, u64 ins_lat, u64 p_stage_cyc) { he_stat->period += period; he_stat->weight += weight; he_stat->nr_events += 1; he_stat->ins_lat += ins_lat; + he_stat->p_stage_cyc += p_stage_cyc; } static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) @@ -308,6 +310,7 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) dest->nr_events += src->nr_events; dest->weight += src->weight; dest->ins_lat += src->ins_lat; + dest->p_stage_cyc += src->p_stage_cyc; } static void he_stat__decay(struct he_stat *he_stat) @@ -597,6 +600,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, u64 period = entry->stat.period; u64 weight = entry->stat.weight; u64 ins_lat = entry->stat.ins_lat; + u64 p_stage_cyc = entry->stat.p_stage_cyc; bool leftmost = true; p = &hists->entries_in->rb_root.rb_node; @@ -615,11 +619,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, if (!cmp) { if (sample_self) { - he_stat__add_period(&he->stat, period, weight, ins_lat); + he_stat__add_period(&he->stat, period, weight, ins_lat, p_stage_cyc); hist_entry__add_callchain_period(he, period); } if (symbol_conf.cumulate_callchain) - he_stat__add_period(he->stat_acc, period, weight, ins_lat); + he_stat__add_period(he->stat_acc, period, weight, ins_lat, p_stage_cyc); /* * This mem info was allocated from sample__resolve_mem @@ -731,6 +735,7 @@ __hists__add_entry(struct hists *hists, .period = sample->period, .weight = sample->weight, .ins_lat = sample->ins_lat, + .p_stage_cyc = sample->p_stage_cyc, }, .parent = sym_parent, .filtered = symbol__parent_filter(sym_parent) | al->filtered, diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 3c537232294b..e2faa745c8d6 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -75,6 +75,7 @@ enum hist_column { HISTC_MEM_BLOCKED, HISTC_LOCAL_INS_LAT, HISTC_GLOBAL_INS_LAT, + HISTC_P_STAGE_CYC, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 9a8808507bd9..eba3769be3f1 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1302,8 +1302,10 @@ static void dump_sample(struct evsel *evsel, union perf_event *event, if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { printf("... weight: %" PRIu64 "", sample->weight); - if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { printf(",0x%"PRIx16"", sample->ins_lat); + printf(",0x%"PRIx16"", sample->p_stage_cyc); + } printf("\n"); } diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index eeb03e749181..df7b932b4073 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -37,7 +37,7 @@ const char default_parent_pattern[] = "^sys_|^do_page_fault"; const char *parent_pattern = default_parent_pattern; const char *default_sort_order = "comm,dso,symbol"; const char default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles"; -const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat"; +const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,p_stage_cyc"; const char default_top_sort_order[] = "dso,symbol"; const char default_diff_sort_order[] = "dso,symbol"; const char default_tracepoint_sort_order[] = "trace"; @@ -46,7 +46,7 @@ const char *field_order; regex_t ignore_callees_regex; int have_ignore_callees = 0; enum sort_mode sort__mode = SORT_MODE__NORMAL; -const char *dynamic_headers[] = {"local_ins_lat"}; +const char *dynamic_headers[] = {"local_ins_lat", "p_stage_cyc"}; /* * Replaces all occurrences of a char used with the: @@ -1410,6 +1410,25 @@ struct sort_entry sort_global_ins_lat = { .se_width_idx = HISTC_GLOBAL_INS_LAT, }; +static int64_t +sort__global_p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right) +{ + return left->stat.p_stage_cyc - right->stat.p_stage_cyc; +} + +static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + return repsep_snprintf(bf, size, "%-*u", width, he->stat.p_stage_cyc); +} + +struct sort_entry sort_p_stage_cyc = { + .se_header = "Pipeline Stage Cycle", + .se_cmp = sort__global_p_stage_cyc_cmp, + .se_snprintf = hist_entry__p_stage_cyc_snprintf, + .se_width_idx = HISTC_P_STAGE_CYC, +}; + struct sort_entry sort_mem_daddr_sym = { .se_header = "Data Symbol", .se_cmp = sort__daddr_cmp, @@ -1853,6 +1872,7 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size), DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat), DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat), + DIM(SORT_PIPELINE_STAGE_CYC, "p_stage_cyc", sort_p_stage_cyc), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 63f67a3f3630..87a092645aa7 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -51,6 +51,7 @@ struct he_stat { u64 period_guest_us; u64 weight; u64 ins_lat; + u64 p_stage_cyc; u32 nr_events; }; @@ -234,6 +235,7 @@ enum sort_type { SORT_CODE_PAGE_SIZE, SORT_LOCAL_INS_LAT, SORT_GLOBAL_INS_LAT, + SORT_PIPELINE_STAGE_CYC, /* branch stack specific sort keys */ __SORT_BRANCH_STACK, -- cgit v1.2.3 From 50fa3a531e8e4b58550171fb159d0aa578c6b52d Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Mon, 22 Mar 2021 10:57:27 -0400 Subject: perf sort: Display sort dimension p_stage_cyc only on supported archs The sort dimension "p_stage_cyc" is used to represent pipeline stage cycle information. Presently, this is used only in powerpc. For unsupported platforms, we don't want to display it in the perf report output columns. Hence add check in sort_dimension__add() and skip the sort key incase it is not applicable for the particular arch. Signed-off-by: Athira Rajeev Reviewed-by: Madhavan Srinivasan Acked-by: Jiri Olsa Cc: Kajol Jain Cc: Kan Liang Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/1616425047-1666-6-git-send-email-atrajeev@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/util/event.c | 7 +++++++ tools/perf/util/event.h | 1 + tools/perf/util/sort.c | 19 +++++++++++++++++++ 3 files changed, 27 insertions(+) (limited to 'tools') diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c index 22521bc9481a..3bf441257466 100644 --- a/tools/perf/arch/powerpc/util/event.c +++ b/tools/perf/arch/powerpc/util/event.c @@ -44,3 +44,10 @@ const char *arch_perf_header_entry(const char *se_header) return "Dispatch Cyc"; return se_header; } + +int arch_support_sort_key(const char *sort_key) +{ + if (!strcmp(sort_key, "p_stage_cyc")) + return 1; + return 0; +} diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index e5da4a695ff2..8a62fb39e365 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -429,5 +429,6 @@ char *get_page_size_name(u64 size, char *str); void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type); void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type); const char *arch_perf_header_entry(const char *se_header); +int arch_support_sort_key(const char *sort_key); #endif /* __PERF_RECORD_H */ diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index df7b932b4073..88ce47f2547e 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -47,6 +47,7 @@ regex_t ignore_callees_regex; int have_ignore_callees = 0; enum sort_mode sort__mode = SORT_MODE__NORMAL; const char *dynamic_headers[] = {"local_ins_lat", "p_stage_cyc"}; +const char *arch_specific_sort_keys[] = {"p_stage_cyc"}; /* * Replaces all occurrences of a char used with the: @@ -1837,6 +1838,11 @@ struct sort_dimension { int taken; }; +int __weak arch_support_sort_key(const char *sort_key __maybe_unused) +{ + return 0; +} + const char * __weak arch_perf_header_entry(const char *se_header) { return se_header; @@ -2773,6 +2779,19 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok, { unsigned int i, j; + /* + * Check to see if there are any arch specific + * sort dimensions not applicable for the current + * architecture. If so, Skip that sort key since + * we don't want to display it in the output fields. + */ + for (j = 0; j < ARRAY_SIZE(arch_specific_sort_keys); j++) { + if (!strcmp(arch_specific_sort_keys[j], tok) && + !arch_support_sort_key(tok)) { + return 0; + } + } + for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) { struct sort_dimension *sd = &common_sort_dimensions[i]; -- cgit v1.2.3 From dfbe56bf48663ae383aba0eb756ba3615a3e9feb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Mar 2021 10:47:24 -0700 Subject: tools: usbip: list.h: fix kernel-doc for list_del() In list.h, the kernel-doc for list_del() should be immediately preceding the implementation and not separated from it by another function implementation. Eliminates this kernel-doc error: list.h:1: warning: 'list_del' not found Cc: Greg Kroah-Hartman Cc: Valentina Manea Cc: Shuah Khan Cc: Shuah Khan Cc: linux-usb@vger.kernel.org Acked-by: Shuah Khan Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20210325174724.14447-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/libsrc/list.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/usb/usbip/libsrc/list.h b/tools/usb/usbip/libsrc/list.h index a941671e4900..9cca2425587b 100644 --- a/tools/usb/usbip/libsrc/list.h +++ b/tools/usb/usbip/libsrc/list.h @@ -77,17 +77,17 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) #define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA) #define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA) +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ -static inline void __list_del_entry(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); -} - static inline void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); -- cgit v1.2.3 From b737eecd4a8a62c7e479b2c7d2d1a1319343c72b Mon Sep 17 00:00:00 2001 From: "Hongren Zheng (Zenithal)" Date: Wed, 24 Mar 2021 14:35:52 +0800 Subject: usbip: tools: add options and examples in man page related to device mode The commit e0546fd8b748 ("usbip: tools: Start using VUDC backend in usbip tools") implemented device mode for user space tools, however the corresponding options are not documented in man page. This commit documents the options and provides examples on device mode. Also the command `usbip port` is documented. Acked-by: Shuah Khan Signed-off-by: Hongren Zheng Link: https://lore.kernel.org/r/YFrdyKKx1nx8bktm@Sun Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/doc/usbip.8 | 42 +++++++++++++++++++++++++++++++++++++++++- tools/usb/usbip/doc/usbipd.8 | 26 ++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/usb/usbip/doc/usbip.8 b/tools/usb/usbip/doc/usbip.8 index a15d20063b98..1f26e4a00638 100644 --- a/tools/usb/usbip/doc/usbip.8 +++ b/tools/usb/usbip/doc/usbip.8 @@ -49,10 +49,17 @@ then exit. Attach a remote USB device. .PP +.HP +\fBattach\fR \-\-remote=<\fIhost\fR> \-\-device=<\fIdev_id\fR> +.IP +Attach a remote USB gadget. +Only used when the remote usbipd is in device mode. +.PP + .HP \fBdetach\fR \-\-port=<\fIport\fR> .IP -Detach an imported USB device. +Detach an imported USB device/gadget. .PP .HP @@ -73,12 +80,26 @@ Stop exporting a device so it can be used by a local driver. List USB devices exported by a remote host. .PP +.HP +\fBlist\fR \-\-device +.IP +List USB gadgets of local usbip-vudc. +Only used when the local usbipd is in device mode. +Note that this can not list usbip-vudc USB gadgets of the remote device mode usbipd. +.PP + .HP \fBlist\fR \-\-local .IP List local USB devices. .PP +.HP +\fBport\fR +.IP +List imported devices/gadgets. +.PP + .SH EXAMPLES @@ -90,8 +111,27 @@ List local USB devices. client:# usbip attach --remote=server --busid=1-2 - Connect the remote USB device. + client:# usbip port + - List imported devices/gadgets. + client:# usbip detach --port=0 - Detach the usb device. +The following example shows the usage of device mode + + server:# usbip list --device + - List gadgets exported by local usbipd server. + + client:# modprobe vhci-hcd + + client:# usbip attach --remote=server --device=usbip-vudc.0 + - Connect the remote USB gadget. + + client:# usbip port + - List imported devices/gadgets. + + client:# usbip detach --port=0 + - Detach the usb gadget. + .SH "SEE ALSO" \fBusbipd\fP\fB(8)\fB\fP diff --git a/tools/usb/usbip/doc/usbipd.8 b/tools/usb/usbip/doc/usbipd.8 index fb62a756893b..d974394f86a1 100644 --- a/tools/usb/usbip/doc/usbipd.8 +++ b/tools/usb/usbip/doc/usbipd.8 @@ -29,6 +29,12 @@ Bind to IPv4. Default is both. Bind to IPv6. Default is both. .PP +.HP +\fB\-e\fR, \fB\-\-device\fR +.IP +Run in device mode. Rather than drive an attached device, create a virtual UDC to bind gadgets to. +.PP + .HP \fB\-D\fR, \fB\-\-daemon\fR .IP @@ -86,6 +92,26 @@ USB/IP client can connect and use exported devices. - A usb device 1-2 is now exportable to other hosts! - Use 'usbip unbind --busid=1-2' when you want to shutdown exporting and use the device locally. +The following example shows the usage of device mode + + server:# modprobe usbip-vudc + - Use /sys/class/udc/ interface. + - usbip-host is independent of this module. + + server:# usbipd -e -D + - Start usbip daemon in device mode. + + server:# modprobe g_mass_storage file=/tmp/tmp.img + - Bind a gadget to usbip-vudc. + - in this example, a mass storage gadget is bound. + + server:# usbip list --device + - List gadgets exported by local usbipd server. + + server:# modprobe -r g_mass_storage + - Unbind a gadget from usbip-vudc. + - in this example, the previous mass storage gadget is unbound. + .SH "SEE ALSO" \fBusbip\fP\fB(8)\fB\fP -- cgit v1.2.3 From a58977b2f831e931b3f9268e3051c875ec00f800 Mon Sep 17 00:00:00 2001 From: "Hongren Zheng (Zenithal)" Date: Wed, 24 Mar 2021 15:56:27 +0800 Subject: usbip: tools: add usage of device mode in usbip_list.c The option '-d/--device' was implemented in 'usbip list' but not shown in usage. Hence this commit adds this option to usage. Acked-by: Shuah Khan Signed-off-by: Hongren Zheng Link: https://lore.kernel.org/r/YFrwq75Uyef3c9gz@Sun Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/src/usbip_list.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/usb/usbip/src/usbip_list.c b/tools/usb/usbip/src/usbip_list.c index 8625b0f514ee..3d810bcca02f 100644 --- a/tools/usb/usbip/src/usbip_list.c +++ b/tools/usb/usbip/src/usbip_list.c @@ -33,7 +33,8 @@ static const char usbip_list_usage_string[] = "usbip list [-p|--parsable] \n" " -p, --parsable Parsable list format\n" " -r, --remote= List the exportable USB devices on \n" - " -l, --local List the local USB devices\n"; + " -l, --local List the local USB devices\n" + " -d, --device List the local USB gadgets bound to usbip-vudc\n"; void usbip_list_usage(void) { -- cgit v1.2.3 From b0922c0732c10eabab7ef15c420b0ae6cf540564 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Mar 2021 10:55:36 -0700 Subject: tools: gpio-utils: fix various kernel-doc warnings Fix several problems in kernel-doc notation in gpio-utils.c. gpio-utils.c:37: warning: Incorrect use of kernel-doc format: * gpiotools_request_line() - request gpio lines in a gpiochip gpio-utils.c:61: warning: expecting prototype for doc(). Prototype was for gpiotools_request_line() instead gpio-utils.c:265: warning: Excess function parameter 'value' description in 'gpiotools_sets' gpio-utils.c:1: warning: 'gpiotools_request_lines' not found Signed-off-by: Randy Dunlap Cc: Bartosz Golaszewski Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- tools/gpio/gpio-utils.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/gpio/gpio-utils.c b/tools/gpio/gpio-utils.c index 1639b4d832cd..4096bcd511d1 100644 --- a/tools/gpio/gpio-utils.c +++ b/tools/gpio/gpio-utils.c @@ -20,7 +20,7 @@ #define CONSUMER "gpio-utils" /** - * doc: Operation of gpio + * DOC: Operation of gpio * * Provide the api of gpiochip for chardev interface. There are two * types of api. The first one provide as same function as each @@ -100,7 +100,7 @@ exit_free_name: } /** - * gpiotools_set_values(): Set the value of gpio(s) + * gpiotools_set_values() - Set the value of gpio(s) * @fd: The fd returned by * gpiotools_request_line(). * @values: The array of values want to set. @@ -124,7 +124,7 @@ int gpiotools_set_values(const int fd, struct gpio_v2_line_values *values) } /** - * gpiotools_get_values(): Get the value of gpio(s) + * gpiotools_get_values() - Get the value of gpio(s) * @fd: The fd returned by * gpiotools_request_line(). * @values: The array of values get from hardware. @@ -148,7 +148,7 @@ int gpiotools_get_values(const int fd, struct gpio_v2_line_values *values) } /** - * gpiotools_release_line(): Release the line(s) of gpiochip + * gpiotools_release_line() - Release the line(s) of gpiochip * @fd: The fd returned by * gpiotools_request_line(). * @@ -169,7 +169,7 @@ int gpiotools_release_line(const int fd) } /** - * gpiotools_get(): Get value from specific line + * gpiotools_get() - Get value from specific line * @device_name: The name of gpiochip without prefix "/dev/", * such as "gpiochip0" * @line: number of line, such as 2. @@ -191,7 +191,7 @@ int gpiotools_get(const char *device_name, unsigned int line) /** - * gpiotools_gets(): Get values from specific lines. + * gpiotools_gets() - Get values from specific lines. * @device_name: The name of gpiochip without prefix "/dev/", * such as "gpiochip0". * @lines: An array desired lines, specified by offset @@ -230,7 +230,7 @@ int gpiotools_gets(const char *device_name, unsigned int *lines, } /** - * gpiotools_set(): Set value to specific line + * gpiotools_set() - Set value to specific line * @device_name: The name of gpiochip without prefix "/dev/", * such as "gpiochip0" * @line: number of line, such as 2. @@ -248,13 +248,13 @@ int gpiotools_set(const char *device_name, unsigned int line, } /** - * gpiotools_sets(): Set values to specific lines. + * gpiotools_sets() - Set values to specific lines. * @device_name: The name of gpiochip without prefix "/dev/", * such as "gpiochip0". * @lines: An array desired lines, specified by offset * index for the associated GPIO device. * @num_lines: The number of lines to request. - * @value: The array of values set to gpiochip, must be + * @values: The array of values set to gpiochip, must be * 0(low) or 1(high). * * Return: On success return 0; -- cgit v1.2.3 From 36e7985160782bc683001afe09e33a288435def0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 25 Mar 2021 21:30:36 -0700 Subject: libbpf: Preserve empty DATASEC BTFs during static linking Ensure that BPF static linker preserves all DATASEC BTF types, even if some of them might not have any variable information at all. This may happen if the compiler promotes local initialized variable contents into .rodata section and there are no global or static functions in the program. For example, $ cat t.c struct t { char a; char b; char c; }; void bar(struct t*); void find() { struct t tmp = {1, 2, 3}; bar(&tmp); } $ clang -target bpf -O2 -g -S t.c .long 104 # BTF_KIND_DATASEC(id = 8) .long 251658240 # 0xf000000 .long 0 .ascii ".rodata" # string offset=104 $ clang -target bpf -O2 -g -c t.c $ readelf -S t.o | grep data [ 4] .rodata PROGBITS 0000000000000000 00000090 Fixes: 8fd27bf69b86 ("libbpf: Add BPF static linker BTF and BTF.ext support") Reported-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210326043036.3081011-1-andrii@kernel.org --- tools/lib/bpf/linker.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 5e0aa2f2c0ca..a29d62ff8041 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -94,6 +94,7 @@ struct dst_sec { int sec_sym_idx; /* section's DATASEC variable info, emitted on BTF finalization */ + bool has_btf; int sec_var_cnt; struct btf_var_secinfo *sec_vars; @@ -1436,6 +1437,16 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) continue; dst_sec = &linker->secs[src_sec->dst_id]; + /* Mark section as having BTF regardless of the presence of + * variables. In some cases compiler might generate empty BTF + * with no variables information. E.g., when promoting local + * array/structure variable initial values and BPF object + * file otherwise has no read-only static variables in + * .rodata. We need to preserve such empty BTF and just set + * correct section size. + */ + dst_sec->has_btf = true; + t = btf__type_by_id(obj->btf, src_sec->sec_type_id); src_var = btf_var_secinfos(t); n = btf_vlen(t); @@ -1717,7 +1728,7 @@ static int finalize_btf(struct bpf_linker *linker) for (i = 1; i < linker->sec_cnt; i++) { struct dst_sec *sec = &linker->secs[i]; - if (!sec->sec_var_cnt) + if (!sec->has_btf) continue; id = btf__add_datasec(btf, sec->sec_name, sec->sec_sz); -- cgit v1.2.3 From cb4969e6f9f5ee12521aec764fa3d4bbd91bc797 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 3 Mar 2021 01:44:20 +0100 Subject: selftests: fix prepending $(OUTPUT) to $(TEST_PROGS) Currently the following command produces an error message: linux# make kselftest TARGETS=bpf O=/mnt/linux-build # selftests: bpf: test_libbpf.sh # ./test_libbpf.sh: line 23: ./test_libbpf_open: No such file or directory # test_libbpf: failed at file test_l4lb.o # selftests: test_libbpf [FAILED] The error message might not affect the return code of make, therefore one needs to grep make output in order to detect it. This is not the only instance of the same underlying problem; any test with more than one element in $(TEST_PROGS) fails the same way. Another example: linux# make O=/mnt/linux-build TARGETS=splice kselftest [...] # ./short_splice_read.sh: 15: ./splice_read: not found # FAIL: /sys/module/test_module/sections/.init.text 2 not ok 2 selftests: splice: short_splice_read.sh # exit=1 The current logic prepends $(OUTPUT) only to the first member of $(TEST_PROGS). After that, run_one() does cd `dirname $TEST` For all tests except the first one, `dirname $TEST` is ., which means they cannot access the files generated in $(OUTPUT). Fix by using $(addprefix) to prepend $(OUTPUT)/ to each member of $(TEST_PROGS). Fixes: 1a940687e424 ("selftests: lib.mk: copy test scripts and test files for make O=dir run") Signed-off-by: Ilya Leoshkevich Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index a5ce26d548e4..be17462fe146 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -74,7 +74,8 @@ ifdef building_out_of_srctree rsync -aq $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(OUTPUT); \ fi @if [ "X$(TEST_PROGS)" != "X" ]; then \ - $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(OUTPUT)/$(TEST_PROGS)) ; \ + $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) \ + $(addprefix $(OUTPUT)/,$(TEST_PROGS))) ; \ else \ $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS)); \ fi -- cgit v1.2.3 From ea2c679edc4120354ff15818f78ad64921534c03 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 15 Mar 2021 12:33:03 +0000 Subject: selftests/timers: Fix spelling mistake "clocksourc" -> "clocksource" There is a spelling mistake in a comment. Fix it. Signed-off-by: Colin Ian King Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/clocksource-switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c index bfc974b4572d..2d66abd877e6 100644 --- a/tools/testing/selftests/timers/clocksource-switch.c +++ b/tools/testing/selftests/timers/clocksource-switch.c @@ -3,7 +3,7 @@ * (C) Copyright IBM 2012 * Licensed under the GPLv2 * - * NOTE: This is a meta-test which quickly changes the clocksourc and + * NOTE: This is a meta-test which quickly changes the clocksource and * then uses other tests to detect problems. Thus this test requires * that the inconsistency-check and nanosleep tests be present in the * same directory it is run from. -- cgit v1.2.3 From 2e580a63b5c214a89bcc3e243ee2058691cee001 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 26 Mar 2021 11:26:35 -0700 Subject: selftests: mptcp: add cfg_do_w for cfg_remove In some testcases, we need to slow down the transmitting process. This patch added a new argument named cfg_do_w for cfg_remove to allow the caller to pass an argument to cfg_remove. In do_rnd_write, use this cfg_do_w to control the transmitting speed. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 77bb62feb872..69d89b5d666f 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -55,6 +55,7 @@ static int cfg_sndbuf; static int cfg_rcvbuf; static bool cfg_join; static bool cfg_remove; +static unsigned int cfg_do_w; static int cfg_wait; static void die_usage(void) @@ -272,8 +273,8 @@ static size_t do_rnd_write(const int fd, char *buf, const size_t len) if (cfg_join && first && do_w > 100) do_w = 100; - if (cfg_remove && do_w > 50) - do_w = 50; + if (cfg_remove && do_w > cfg_do_w) + do_w = cfg_do_w; bw = write(fd, buf, do_w); if (bw < 0) @@ -829,7 +830,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "6jrlp:s:hut:m:S:R:w:")) != -1) { + while ((c = getopt(argc, argv, "6jr:lp:s:hut:m:S:R:w:")) != -1) { switch (c) { case 'j': cfg_join = true; @@ -840,6 +841,9 @@ static void parse_opts(int argc, char **argv) cfg_remove = true; cfg_mode = CFG_MODE_POLL; cfg_wait = 400000; + cfg_do_w = atoi(optarg); + if (cfg_do_w <= 0) + cfg_do_w = 50; break; case 'l': listen_mode = true; -- cgit v1.2.3 From 8da6229b9524d9a4ea91ed1308f7e45bfe0b2799 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 26 Mar 2021 11:26:36 -0700 Subject: selftests: mptcp: timeout testcases for multi addresses This patch added the timeout testcases for multi addresses, valid and invalid. These testcases need to transmit 8 ADD_ADDRs, so add a new speed level 'least' to set 10 to mptcp_connect to slow down the transmitting process. The original speed level 'slow' still uses 50. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 26 +++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index fe990d8696a9..32379efa2276 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -234,8 +234,10 @@ do_transfer() if [ $speed = "fast" ]; then mptcp_connect="./mptcp_connect -j" - else - mptcp_connect="./mptcp_connect -r" + elif [ $speed = "slow" ]; then + mptcp_connect="./mptcp_connect -r 50" + elif [ $speed = "least" ]; then + mptcp_connect="./mptcp_connect -r 10" fi local local_addr @@ -818,6 +820,26 @@ add_addr_timeout_tests() run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow chk_join_nr "signal address, ADD_ADDR6 timeout" 1 1 1 chk_add_nr 4 0 + + # signal addresses timeout + reset_with_add_addr_timeout + ip netns exec $ns1 ./pm_nl_ctl limits 2 2 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 2 2 + run_tests $ns1 $ns2 10.0.1.1 0 0 0 least + chk_join_nr "signal addresses, ADD_ADDR timeout" 2 2 2 + chk_add_nr 8 0 + + # signal invalid addresses timeout + reset_with_add_addr_timeout + ip netns exec $ns1 ./pm_nl_ctl limits 2 2 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 2 2 + run_tests $ns1 $ns2 10.0.1.1 0 0 0 least + chk_join_nr "invalid address, ADD_ADDR timeout" 1 1 1 + chk_add_nr 8 0 } remove_tests() -- cgit v1.2.3 From ef360019db4043d53d631aec1e630bd6e6ce54f4 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 26 Mar 2021 11:26:42 -0700 Subject: selftests: mptcp: signal addresses testcases This patch adds testcases for signalling multi valid and invalid addresses for both signal_address_tests and remove_tests. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 58 +++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 32379efa2276..679de3abaf34 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -785,6 +785,28 @@ signal_address_tests() run_tests $ns1 $ns2 10.0.1.1 chk_join_nr "multiple subflows and signal" 3 3 3 chk_add_nr 1 1 + + # signal addresses + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "signal addresses" 3 3 3 + chk_add_nr 3 3 + + # signal invalid addresses + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "signal invalid addresses" 1 1 1 + chk_add_nr 3 3 } link_failure_tests() @@ -896,6 +918,30 @@ remove_tests() chk_add_nr 1 1 chk_rm_nr 2 2 + # addresses remove + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal id 250 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 0 -3 0 slow + chk_join_nr "remove addresses" 3 3 3 + chk_add_nr 3 3 + chk_rm_nr 3 3 invert + + # invalid addresses remove + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 0 -3 0 slow + chk_join_nr "remove invalid addresses" 1 1 1 + chk_add_nr 3 3 + chk_rm_nr 3 1 invert + # subflows and signal, flush reset ip netns exec $ns1 ./pm_nl_ctl limits 0 3 @@ -930,6 +976,18 @@ remove_tests() chk_join_nr "flush addresses" 3 3 3 chk_add_nr 3 3 chk_rm_nr 3 3 invert + + # invalid addresses flush + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 0 -8 0 slow + chk_join_nr "flush invalid addresses" 1 1 1 + chk_add_nr 3 3 + chk_rm_nr 3 1 invert } add_tests() -- cgit v1.2.3 From e6ac2450d6dee3121cd8bbf2907b78a68a8a353d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:51:42 -0700 Subject: bpf: Support bpf program calling kernel function This patch adds support to BPF verifier to allow bpf program calling kernel function directly. The use case included in this set is to allow bpf-tcp-cc to directly call some tcp-cc helper functions (e.g. "tcp_cong_avoid_ai()"). Those functions have already been used by some kernel tcp-cc implementations. This set will also allow the bpf-tcp-cc program to directly call the kernel tcp-cc implementation, For example, a bpf_dctcp may only want to implement its own dctcp_cwnd_event() and reuse other dctcp_*() directly from the kernel tcp_dctcp.c instead of reimplementing (or copy-and-pasting) them. The tcp-cc kernel functions mentioned above will be white listed for the struct_ops bpf-tcp-cc programs to use in a later patch. The white listed functions are not bounded to a fixed ABI contract. Those functions have already been used by the existing kernel tcp-cc. If any of them has changed, both in-tree and out-of-tree kernel tcp-cc implementations have to be changed. The same goes for the struct_ops bpf-tcp-cc programs which have to be adjusted accordingly. This patch is to make the required changes in the bpf verifier. First change is in btf.c, it adds a case in "btf_check_func_arg_match()". When the passed in "btf->kernel_btf == true", it means matching the verifier regs' states with a kernel function. This will handle the PTR_TO_BTF_ID reg. It also maps PTR_TO_SOCK_COMMON, PTR_TO_SOCKET, and PTR_TO_TCP_SOCK to its kernel's btf_id. In the later libbpf patch, the insn calling a kernel function will look like: insn->code == (BPF_JMP | BPF_CALL) insn->src_reg == BPF_PSEUDO_KFUNC_CALL /* <- new in this patch */ insn->imm == func_btf_id /* btf_id of the running kernel */ [ For the future calling function-in-kernel-module support, an array of module btf_fds can be passed at the load time and insn->off can be used to index into this array. ] At the early stage of verifier, the verifier will collect all kernel function calls into "struct bpf_kfunc_desc". Those descriptors are stored in "prog->aux->kfunc_tab" and will be available to the JIT. Since this "add" operation is similar to the current "add_subprog()" and looking for the same insn->code, they are done together in the new "add_subprog_and_kfunc()". In the "do_check()" stage, the new "check_kfunc_call()" is added to verify the kernel function call instruction: 1. Ensure the kernel function can be used by a particular BPF_PROG_TYPE. A new bpf_verifier_ops "check_kfunc_call" is added to do that. The bpf-tcp-cc struct_ops program will implement this function in a later patch. 2. Call "btf_check_kfunc_args_match()" to ensure the regs can be used as the args of a kernel function. 3. Mark the regs' type, subreg_def, and zext_dst. At the later do_misc_fixups() stage, the new fixup_kfunc_call() will replace the insn->imm with the function address (relative to __bpf_call_base). If needed, the jit can find the btf_func_model by calling the new bpf_jit_find_kfunc_model(prog, insn). With the imm set to the function address, "bpftool prog dump xlated" will be able to display the kernel function calls the same way as it displays other bpf helper calls. gpl_compatible program is required to call kernel function. This feature currently requires JIT. The verifier selftests are adjusted because of the changes in the verbose log in add_subprog_and_kfunc(). Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210325015142.1544736-1-kafai@fb.com --- arch/x86/net/bpf_jit_comp.c | 5 + include/linux/bpf.h | 24 ++ include/linux/btf.h | 1 + include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 4 + kernel/bpf/btf.c | 65 +++- kernel/bpf/core.c | 18 +- kernel/bpf/disasm.c | 13 +- kernel/bpf/syscall.c | 1 + kernel/bpf/verifier.c | 368 +++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 4 + tools/testing/selftests/bpf/verifier/calls.c | 12 +- tools/testing/selftests/bpf/verifier/dead_code.c | 10 +- 13 files changed, 480 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b35fc8023884..9eead60f0301 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2346,3 +2346,8 @@ out: tmp : orig_prog); return prog; } + +bool bpf_jit_supports_kfunc_call(void) +{ + return true; +} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaae618a90b5..b5b7967e3ff3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -427,6 +427,7 @@ enum bpf_reg_type { PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ PTR_TO_FUNC, /* reg points to a bpf program function */ PTR_TO_MAP_KEY, /* reg points to a map element key */ + __BPF_REG_TYPE_MAX, }; /* The information passed from prog-specific *_is_valid_access @@ -480,6 +481,7 @@ struct bpf_verifier_ops { const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); + bool (*check_kfunc_call)(u32 kfunc_btf_id); }; struct bpf_prog_offload_ops { @@ -796,6 +798,8 @@ struct btf_mod_pair { struct module *module; }; +struct bpf_kfunc_desc_tab; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -832,6 +836,7 @@ struct bpf_prog_aux { struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; + struct bpf_kfunc_desc_tab *kfunc_tab; u32 size_poke_tab; struct bpf_ksym ksym; const struct bpf_prog_ops *ops; @@ -1547,6 +1552,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); +int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, + const struct btf *btf, u32 func_id, + struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, @@ -1557,6 +1565,10 @@ struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); +bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); +const struct btf_func_model * +bpf_jit_find_kfunc_model(const struct bpf_prog *prog, + const struct bpf_insn *insn); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1737,6 +1749,18 @@ bpf_base_func_proto(enum bpf_func_id func_id) static inline void bpf_task_storage_free(struct task_struct *task) { } + +static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) +{ + return false; +} + +static inline const struct btf_func_model * +bpf_jit_find_kfunc_model(const struct bpf_prog *prog, + const struct bpf_insn *insn) +{ + return NULL; +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/include/linux/btf.h b/include/linux/btf.h index 8a05687a4ee2..3bac66e0183a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -110,6 +110,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size); +const char *btf_type_str(const struct btf_type *t); #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 0d9c710eb050..eecfd82db648 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -918,6 +918,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); +bool bpf_jit_supports_kfunc_call(void); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(const struct cred *cred) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 008edc1dc8c1..598716742593 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1117,6 +1117,10 @@ enum bpf_link_type { * offset to another bpf function */ #define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 /* flags for BPF_MAP_UPDATE_ELEM command */ enum { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3c489adacf3b..ec8afc4bc560 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -283,7 +283,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = "FLOAT", }; -static const char *btf_type_str(const struct btf_type *t) +const char *btf_type_str(const struct btf_type *t) { return btf_kind_str[BTF_INFO_KIND(t->info)]; } @@ -5362,6 +5362,14 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr return btf_check_func_type_match(log, btf1, t1, btf2, t2); } +static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { +#ifdef CONFIG_NET + [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], + [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +#endif +}; + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, @@ -5371,12 +5379,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; - u32 i, nargs; + u32 i, nargs, ref_id; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { /* These checks were already done by the verifier while loading - * struct bpf_func_info + * struct bpf_func_info or in add_kfunc_call(). */ bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n", func_id); @@ -5418,9 +5426,49 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - ref_t = btf_type_skip_modifiers(btf, t->type, NULL); + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) { + if (btf_is_kernel(btf)) { + const struct btf_type *reg_ref_t; + const struct btf *reg_btf; + const char *reg_ref_tname; + u32 reg_ref_id; + + if (!btf_type_is_struct(ref_t)) { + bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", + func_name, i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (reg->type == PTR_TO_BTF_ID) { + reg_btf = reg->btf; + reg_ref_id = reg->btf_id; + } else if (reg2btf_ids[reg->type]) { + reg_btf = btf_vmlinux; + reg_ref_id = *reg2btf_ids[reg->type]; + } else { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n", + func_name, i, + btf_type_str(ref_t), ref_tname, regno); + return -EINVAL; + } + + reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, + ®_ref_id); + reg_ref_tname = btf_name_by_offset(reg_btf, + reg_ref_t->name_off); + if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, + reg->off, btf, ref_id)) { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", + func_name, i, + btf_type_str(ref_t), ref_tname, + regno, btf_type_str(reg_ref_t), + reg_ref_tname); + return -EINVAL; + } + } else if (btf_get_prog_ctx_type(log, btf, t, + env->prog->type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ @@ -5493,6 +5541,13 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return err; } +int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, + const struct btf *btf, u32 func_id, + struct bpf_reg_state *regs) +{ + return btf_check_func_arg_match(env, btf, func_id, regs, false); +} + /* Convert BTF of a function into bpf_reg_state if possible * Returns: * EFAULT - there is a verifier bug. Abort verification. diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a35eb3d7b126..f5423251c118 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -159,6 +159,9 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog) kvfree(prog->aux->jited_linfo); prog->aux->jited_linfo = NULL; } + + kfree(prog->aux->kfunc_tab); + prog->aux->kfunc_tab = NULL; } /* The jit engine is responsible to provide an array @@ -1840,9 +1843,15 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ + bool jit_needed = false; + if (fp->bpf_func) goto finalize; + if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || + bpf_prog_has_kfunc_call(fp)) + jit_needed = true; + bpf_prog_select_func(fp); /* eBPF JITs can rewrite the program in case constant @@ -1858,12 +1867,10 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) fp = bpf_int_jit_compile(fp); bpf_prog_jit_attempt_done(fp); -#ifdef CONFIG_BPF_JIT_ALWAYS_ON - if (!fp->jited) { + if (!fp->jited && jit_needed) { *err = -ENOTSUPP; return fp; } -#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) @@ -2343,6 +2350,11 @@ bool __weak bpf_jit_needs_zext(void) return false; } +bool __weak bpf_jit_supports_kfunc_call(void) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 3acc7e0b6916..dad821c8ecd0 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs, { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - if (insn->src_reg != BPF_PSEUDO_CALL && + if (!insn->src_reg && insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && func_id_str[insn->imm]) return func_id_str[insn->imm]; - if (cbs && cbs->cb_call) - return cbs->cb_call(cbs->private_data, insn); + if (cbs && cbs->cb_call) { + const char *res; + + res = cbs->cb_call(cbs->private_data, insn); + if (res) + return res; + } if (insn->src_reg == BPF_PSEUDO_CALL) snprintf(buff, len, "%+d", insn->imm); + else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + snprintf(buff, len, "kernel-function"); return buff; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index eaf85bf51c5a..9603de81811a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1696,6 +1696,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) btf_put(prog->aux->btf); kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); + kfree(prog->aux->kfunc_tab); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b31e62daafbd..852541a435ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_CALL; } +static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_KFUNC_CALL; +} + static bool bpf_pseudo_func(const struct bpf_insn *insn) { return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && @@ -1554,47 +1560,205 @@ static int add_subprog(struct bpf_verifier_env *env, int off) verbose(env, "too many subprograms\n"); return -E2BIG; } + /* determine subprog starts. The end is one before the next starts */ env->subprog_info[env->subprog_cnt++].start = off; sort(env->subprog_info, env->subprog_cnt, sizeof(env->subprog_info[0]), cmp_subprogs, NULL); return env->subprog_cnt - 1; } -static int check_subprogs(struct bpf_verifier_env *env) +struct bpf_kfunc_desc { + struct btf_func_model func_model; + u32 func_id; + s32 imm; +}; + +#define MAX_KFUNC_DESCS 256 +struct bpf_kfunc_desc_tab { + struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS]; + u32 nr_descs; +}; + +static int kfunc_desc_cmp_by_id(const void *a, const void *b) +{ + const struct bpf_kfunc_desc *d0 = a; + const struct bpf_kfunc_desc *d1 = b; + + /* func_id is not greater than BTF_MAX_TYPE */ + return d0->func_id - d1->func_id; +} + +static const struct bpf_kfunc_desc * +find_kfunc_desc(const struct bpf_prog *prog, u32 func_id) +{ + struct bpf_kfunc_desc desc = { + .func_id = func_id, + }; + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + return bsearch(&desc, tab->descs, tab->nr_descs, + sizeof(tab->descs[0]), kfunc_desc_cmp_by_id); +} + +static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id) +{ + const struct btf_type *func, *func_proto; + struct bpf_kfunc_desc_tab *tab; + struct bpf_prog_aux *prog_aux; + struct bpf_kfunc_desc *desc; + const char *func_name; + unsigned long addr; + int err; + + prog_aux = env->prog->aux; + tab = prog_aux->kfunc_tab; + if (!tab) { + if (!btf_vmlinux) { + verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n"); + return -ENOTSUPP; + } + + if (!env->prog->jit_requested) { + verbose(env, "JIT is required for calling kernel function\n"); + return -ENOTSUPP; + } + + if (!bpf_jit_supports_kfunc_call()) { + verbose(env, "JIT does not support calling kernel function\n"); + return -ENOTSUPP; + } + + if (!env->prog->gpl_compatible) { + verbose(env, "cannot call kernel function from non-GPL compatible program\n"); + return -EINVAL; + } + + tab = kzalloc(sizeof(*tab), GFP_KERNEL); + if (!tab) + return -ENOMEM; + prog_aux->kfunc_tab = tab; + } + + if (find_kfunc_desc(env->prog, func_id)) + return 0; + + if (tab->nr_descs == MAX_KFUNC_DESCS) { + verbose(env, "too many different kernel function calls\n"); + return -E2BIG; + } + + func = btf_type_by_id(btf_vmlinux, func_id); + if (!func || !btf_type_is_func(func)) { + verbose(env, "kernel btf_id %u is not a function\n", + func_id); + return -EINVAL; + } + func_proto = btf_type_by_id(btf_vmlinux, func->type); + if (!func_proto || !btf_type_is_func_proto(func_proto)) { + verbose(env, "kernel function btf_id %u does not have a valid func_proto\n", + func_id); + return -EINVAL; + } + + func_name = btf_name_by_offset(btf_vmlinux, func->name_off); + addr = kallsyms_lookup_name(func_name); + if (!addr) { + verbose(env, "cannot find address for kernel function %s\n", + func_name); + return -EINVAL; + } + + desc = &tab->descs[tab->nr_descs++]; + desc->func_id = func_id; + desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base; + err = btf_distill_func_proto(&env->log, btf_vmlinux, + func_proto, func_name, + &desc->func_model); + if (!err) + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_id, NULL); + return err; +} + +static int kfunc_desc_cmp_by_imm(const void *a, const void *b) +{ + const struct bpf_kfunc_desc *d0 = a; + const struct bpf_kfunc_desc *d1 = b; + + if (d0->imm > d1->imm) + return 1; + else if (d0->imm < d1->imm) + return -1; + return 0; +} + +static void sort_kfunc_descs_by_imm(struct bpf_prog *prog) +{ + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + if (!tab) + return; + + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_imm, NULL); +} + +bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) +{ + return !!prog->aux->kfunc_tab; +} + +const struct btf_func_model * +bpf_jit_find_kfunc_model(const struct bpf_prog *prog, + const struct bpf_insn *insn) +{ + const struct bpf_kfunc_desc desc = { + .imm = insn->imm, + }; + const struct bpf_kfunc_desc *res; + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + res = bsearch(&desc, tab->descs, tab->nr_descs, + sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm); + + return res ? &res->func_model : NULL; +} + +static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { - int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; + int i, ret, insn_cnt = env->prog->len; /* Add entry function. */ ret = add_subprog(env, 0); - if (ret < 0) + if (ret) return ret; - /* determine subprog starts. The end is one before the next starts */ - for (i = 0; i < insn_cnt; i++) { - if (bpf_pseudo_func(insn + i)) { - if (!env->bpf_capable) { - verbose(env, - "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); - return -EPERM; - } - ret = add_subprog(env, i + insn[i].imm + 1); - if (ret < 0) - return ret; - /* remember subprog */ - insn[i + 1].imm = ret; - continue; - } - if (!bpf_pseudo_call(insn + i)) + for (i = 0; i < insn_cnt; i++, insn++) { + if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) && + !bpf_pseudo_kfunc_call(insn)) continue; + if (!env->bpf_capable) { - verbose(env, - "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); + verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } - ret = add_subprog(env, i + insn[i].imm + 1); + + if (bpf_pseudo_func(insn)) { + ret = add_subprog(env, i + insn->imm + 1); + if (ret >= 0) + /* remember subprog */ + insn[1].imm = ret; + } else if (bpf_pseudo_call(insn)) { + ret = add_subprog(env, i + insn->imm + 1); + } else { + ret = add_kfunc_call(env, insn->imm); + } + if (ret < 0) return ret; } @@ -1608,6 +1772,16 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); + return 0; +} + +static int check_subprogs(struct bpf_verifier_env *env) +{ + int i, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_subprog_info *subprog = env->subprog_info; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + /* now check that all jumps are within the same subprog */ subprog_start = subprog[cur_subprog].start; subprog_end = subprog[cur_subprog + 1].start; @@ -1916,6 +2090,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, return i; } +static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) +{ + const struct btf_type *func; + + if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) + return NULL; + + func = btf_type_by_id(btf_vmlinux, insn->imm); + return btf_name_by_offset(btf_vmlinux, func->name_off); +} + /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -1924,6 +2109,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, u32 *reg_mask, u64 *stack_mask) { const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, .cb_print = verbose, .private_data = env, }; @@ -5960,6 +6146,98 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return 0; } +/* mark_btf_func_reg_size() is used when the reg size is determined by + * the BTF func_proto's return value size and argument. + */ +static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, + size_t reg_size) +{ + struct bpf_reg_state *reg = &cur_regs(env)[regno]; + + if (regno == BPF_REG_0) { + /* Function return value */ + reg->live |= REG_LIVE_WRITTEN; + reg->subreg_def = reg_size == sizeof(u64) ? + DEF_NOT_SUBREG : env->insn_idx + 1; + } else { + /* Function argument */ + if (reg_size == sizeof(u64)) { + mark_insn_zext(env, reg); + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); + } else { + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); + } + } +} + +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + const struct btf_type *t, *func, *func_proto, *ptr_type; + struct bpf_reg_state *regs = cur_regs(env); + const char *func_name, *ptr_type_name; + u32 i, nargs, func_id, ptr_type_id; + const struct btf_param *args; + int err; + + func_id = insn->imm; + func = btf_type_by_id(btf_vmlinux, func_id); + func_name = btf_name_by_offset(btf_vmlinux, func->name_off); + func_proto = btf_type_by_id(btf_vmlinux, func->type); + + if (!env->ops->check_kfunc_call || + !env->ops->check_kfunc_call(func_id)) { + verbose(env, "calling kernel function %s is not allowed\n", + func_name); + return -EACCES; + } + + /* Check the arguments */ + err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs); + if (err) + return err; + + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(env, regs, caller_saved[i]); + + /* Check return type */ + t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL); + if (btf_type_is_scalar(t)) { + mark_reg_unknown(env, regs, BPF_REG_0); + mark_btf_func_reg_size(env, BPF_REG_0, t->size); + } else if (btf_type_is_ptr(t)) { + ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type, + &ptr_type_id); + if (!btf_type_is_struct(ptr_type)) { + ptr_type_name = btf_name_by_offset(btf_vmlinux, + ptr_type->name_off); + verbose(env, "kernel function %s returns pointer type %s %s is not supported\n", + func_name, btf_type_str(ptr_type), + ptr_type_name); + return -EINVAL; + } + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = btf_vmlinux; + regs[BPF_REG_0].type = PTR_TO_BTF_ID; + regs[BPF_REG_0].btf_id = ptr_type_id; + mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); + } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ + + nargs = btf_type_vlen(func_proto); + args = (const struct btf_param *)(func_proto + 1); + for (i = 0; i < nargs; i++) { + u32 regno = i + 1; + + t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL); + if (btf_type_is_ptr(t)) + mark_btf_func_reg_size(env, regno, sizeof(void *)); + else + /* scalar. ensured by btf_check_kfunc_arg_match() */ + mark_btf_func_reg_size(env, regno, t->size); + } + + return 0; +} + static bool signed_add_overflows(s64 a, s64 b) { /* Do the add in u64, where overflow is well-defined */ @@ -10162,6 +10440,7 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level & BPF_LOG_LEVEL) { const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, .cb_print = verbose, .private_data = env, }; @@ -10309,7 +10588,8 @@ static int do_check(struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || (insn->src_reg != BPF_REG_0 && - insn->src_reg != BPF_PSEUDO_CALL) || + insn->src_reg != BPF_PSEUDO_CALL && + insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { verbose(env, "BPF_CALL uses reserved fields\n"); @@ -10324,6 +10604,8 @@ static int do_check(struct bpf_verifier_env *env) } if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); + else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + err = check_kfunc_call(env, insn); else err = check_helper_call(env, insn, &env->insn_idx); if (err) @@ -11634,6 +11916,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; func[i]->aux->linfo = prog->aux->linfo; func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; @@ -11773,6 +12056,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) #ifndef CONFIG_BPF_JIT_ALWAYS_ON struct bpf_prog *prog = env->prog; struct bpf_insn *insn = prog->insnsi; + bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); int i, depth; #endif int err = 0; @@ -11786,6 +12070,10 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON + if (has_kfunc_call) { + verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); + return -EINVAL; + } if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { /* When JIT fails the progs with bpf2bpf calls and tail_calls * have to be rejected, since interpreter doesn't support them yet. @@ -11814,6 +12102,26 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } +static int fixup_kfunc_call(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + const struct bpf_kfunc_desc *desc; + + /* insn->imm has the btf func_id. Replace it with + * an address (relative to __bpf_base_call). + */ + desc = find_kfunc_desc(env->prog, insn->imm); + if (!desc) { + verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n", + insn->imm); + return -EFAULT; + } + + insn->imm = desc->imm; + + return 0; +} + /* Do various post-verification rewrites in a single program pass. * These rewrites simplify JIT and interpreter implementations. */ @@ -11949,6 +12257,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) continue; if (insn->src_reg == BPF_PSEUDO_CALL) continue; + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + ret = fixup_kfunc_call(env, insn); + if (ret) + return ret; + continue; + } if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; @@ -12178,6 +12492,8 @@ patch_call_imm: } } + sort_kfunc_descs_by_imm(env->prog); + return 0; } @@ -12883,6 +13199,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (!env->explored_states) goto skip_full_check; + ret = add_subprog_and_kfunc(env); + if (ret < 0) + goto skip_full_check; + ret = check_subprogs(env); if (ret < 0) goto skip_full_check; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2d3036e292a9..ab9f2233607c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1117,6 +1117,10 @@ enum bpf_link_type { * offset to another bpf function */ #define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 /* flags for BPF_MAP_UPDATE_ELEM command */ enum { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index eb888c8479c3..336a749673d1 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, @@ -136,7 +136,7 @@ { "calls: wrong src reg", .insns = { - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 3, 0, 0), BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, @@ -397,7 +397,7 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .fixup_map_hash_48b = { 3 }, .result_unpriv = REJECT, .result = ACCEPT, @@ -1977,7 +1977,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, }, @@ -2003,7 +2003,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, @@ -2028,7 +2028,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 5cf361d8eb1c..17fe33a75034 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -85,7 +85,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -103,7 +103,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -121,7 +121,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -137,7 +137,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, @@ -152,7 +152,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, -- cgit v1.2.3 From 933d1aa32409ef4209c8065ee4ede68236659cd2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:52:07 -0700 Subject: libbpf: Refactor bpf_object__resolve_ksyms_btf_id This patch refactors most of the logic from bpf_object__resolve_ksyms_btf_id() into a new function bpf_object__resolve_ksym_var_btf_id(). It is to get ready for a later patch adding bpf_object__resolve_ksym_func_btf_id() which resolves a kernel function to the running kernel btf_id. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210325015207.1546749-1-kafai@fb.com --- tools/lib/bpf/libbpf.c | 124 ++++++++++++++++++++++++++----------------------- 1 file changed, 67 insertions(+), 57 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index cebb0e852cf8..e289155510ca 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7395,75 +7395,85 @@ out: return err; } -static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) +static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, + struct extern_desc *ext) { - struct extern_desc *ext; + const struct btf_type *targ_var, *targ_type; + __u32 targ_type_id, local_type_id; + const char *targ_var_name; + int i, id, btf_fd, err; struct btf *btf; - int i, j, id, btf_fd, err; - for (i = 0; i < obj->nr_extern; i++) { - const struct btf_type *targ_var, *targ_type; - __u32 targ_type_id, local_type_id; - const char *targ_var_name; - int ret; + btf = obj->btf_vmlinux; + btf_fd = 0; + id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); + if (id == -ENOENT) { + err = load_module_btfs(obj); + if (err) + return err; - ext = &obj->externs[i]; - if (ext->type != EXT_KSYM || !ext->ksym.type_id) - continue; + for (i = 0; i < obj->btf_module_cnt; i++) { + btf = obj->btf_modules[i].btf; + /* we assume module BTF FD is always >0 */ + btf_fd = obj->btf_modules[i].fd; + id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); + if (id != -ENOENT) + break; + } + } + if (id <= 0) { + pr_warn("extern (var ksym) '%s': failed to find BTF ID in kernel BTF(s).\n", + ext->name); + return -ESRCH; + } - btf = obj->btf_vmlinux; - btf_fd = 0; - id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); - if (id == -ENOENT) { - err = load_module_btfs(obj); - if (err) - return err; + /* find local type_id */ + local_type_id = ext->ksym.type_id; - for (j = 0; j < obj->btf_module_cnt; j++) { - btf = obj->btf_modules[j].btf; - /* we assume module BTF FD is always >0 */ - btf_fd = obj->btf_modules[j].fd; - id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); - if (id != -ENOENT) - break; - } - } - if (id <= 0) { - pr_warn("extern (ksym) '%s': failed to find BTF ID in kernel BTF(s).\n", - ext->name); - return -ESRCH; - } + /* find target type_id */ + targ_var = btf__type_by_id(btf, id); + targ_var_name = btf__name_by_offset(btf, targ_var->name_off); + targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); - /* find local type_id */ - local_type_id = ext->ksym.type_id; + err = bpf_core_types_are_compat(obj->btf, local_type_id, + btf, targ_type_id); + if (err <= 0) { + const struct btf_type *local_type; + const char *targ_name, *local_name; - /* find target type_id */ - targ_var = btf__type_by_id(btf, id); - targ_var_name = btf__name_by_offset(btf, targ_var->name_off); - targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); + local_type = btf__type_by_id(obj->btf, local_type_id); + local_name = btf__name_by_offset(obj->btf, local_type->name_off); + targ_name = btf__name_by_offset(btf, targ_type->name_off); - ret = bpf_core_types_are_compat(obj->btf, local_type_id, - btf, targ_type_id); - if (ret <= 0) { - const struct btf_type *local_type; - const char *targ_name, *local_name; + pr_warn("extern (var ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", + ext->name, local_type_id, + btf_kind_str(local_type), local_name, targ_type_id, + btf_kind_str(targ_type), targ_name); + return -EINVAL; + } - local_type = btf__type_by_id(obj->btf, local_type_id); - local_name = btf__name_by_offset(obj->btf, local_type->name_off); - targ_name = btf__name_by_offset(btf, targ_type->name_off); + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = btf_fd; + ext->ksym.kernel_btf_id = id; + pr_debug("extern (var ksym) '%s': resolved to [%d] %s %s\n", + ext->name, id, btf_kind_str(targ_var), targ_var_name); - pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", - ext->name, local_type_id, - btf_kind_str(local_type), local_name, targ_type_id, - btf_kind_str(targ_type), targ_name); - return -EINVAL; - } + return 0; +} + +static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) +{ + struct extern_desc *ext; + int i, err; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KSYM || !ext->ksym.type_id) + continue; - ext->is_set = true; - ext->ksym.kernel_btf_obj_fd = btf_fd; - ext->ksym.kernel_btf_id = id; - pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n", - ext->name, id, btf_kind_str(targ_var), targ_var_name); + err = bpf_object__resolve_ksym_var_btf_id(obj, ext); + if (err) + return err; } return 0; } -- cgit v1.2.3 From 774e132e83d0f10a7ebbfe7db1debdaed6013f83 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:52:14 -0700 Subject: libbpf: Refactor codes for finding btf id of a kernel symbol This patch refactors code, that finds kernel btf_id by kind and symbol name, to a new function find_ksym_btf_id(). It also adds a new helper __btf_kind_str() to return a string by the numeric kind value. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210325015214.1547069-1-kafai@fb.com --- tools/lib/bpf/libbpf.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e289155510ca..2f5595ae0b84 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1921,9 +1921,9 @@ resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id) return btf_is_func_proto(t) ? t : NULL; } -static const char *btf_kind_str(const struct btf_type *t) +static const char *__btf_kind_str(__u16 kind) { - switch (btf_kind(t)) { + switch (kind) { case BTF_KIND_UNKN: return "void"; case BTF_KIND_INT: return "int"; case BTF_KIND_PTR: return "ptr"; @@ -1945,6 +1945,11 @@ static const char *btf_kind_str(const struct btf_type *t) } } +static const char *btf_kind_str(const struct btf_type *t) +{ + return __btf_kind_str(btf_kind(t)); +} + /* * Fetch integer attribute of BTF map definition. Such attributes are * represented using a pointer to an array, in which dimensionality of array @@ -7395,18 +7400,17 @@ out: return err; } -static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, - struct extern_desc *ext) +static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name, + __u16 kind, struct btf **res_btf, + int *res_btf_fd) { - const struct btf_type *targ_var, *targ_type; - __u32 targ_type_id, local_type_id; - const char *targ_var_name; int i, id, btf_fd, err; struct btf *btf; btf = obj->btf_vmlinux; btf_fd = 0; - id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); + id = btf__find_by_name_kind(btf, ksym_name, kind); + if (id == -ENOENT) { err = load_module_btfs(obj); if (err) @@ -7416,17 +7420,35 @@ static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, btf = obj->btf_modules[i].btf; /* we assume module BTF FD is always >0 */ btf_fd = obj->btf_modules[i].fd; - id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); + id = btf__find_by_name_kind(btf, ksym_name, kind); if (id != -ENOENT) break; } } if (id <= 0) { - pr_warn("extern (var ksym) '%s': failed to find BTF ID in kernel BTF(s).\n", - ext->name); + pr_warn("extern (%s ksym) '%s': failed to find BTF ID in kernel BTF(s).\n", + __btf_kind_str(kind), ksym_name); return -ESRCH; } + *res_btf = btf; + *res_btf_fd = btf_fd; + return id; +} + +static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + const struct btf_type *targ_var, *targ_type; + __u32 targ_type_id, local_type_id; + const char *targ_var_name; + int id, btf_fd = 0, err; + struct btf *btf = NULL; + + id = find_ksym_btf_id(obj, ext->name, BTF_KIND_VAR, &btf, &btf_fd); + if (id < 0) + return id; + /* find local type_id */ local_type_id = ext->ksym.type_id; -- cgit v1.2.3 From 0c091e5c2d37696589a3e0131a809b5499899995 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:52:21 -0700 Subject: libbpf: Rename RELO_EXTERN to RELO_EXTERN_VAR This patch renames RELO_EXTERN to RELO_EXTERN_VAR. It is to avoid the confusion with a later patch adding RELO_EXTERN_FUNC. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210325015221.1547722-1-kafai@fb.com --- tools/lib/bpf/libbpf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2f5595ae0b84..0ac53dce37b7 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -185,7 +185,7 @@ enum reloc_type { RELO_LD64, RELO_CALL, RELO_DATA, - RELO_EXTERN, + RELO_EXTERN_VAR, RELO_SUBPROG_ADDR, }; @@ -3455,7 +3455,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, } pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n", prog->name, i, ext->name, ext->sym_idx, insn_idx); - reloc_desc->type = RELO_EXTERN; + reloc_desc->type = RELO_EXTERN_VAR; reloc_desc->insn_idx = insn_idx; reloc_desc->sym_off = i; /* sym_off stores extern index */ return 0; @@ -6218,7 +6218,7 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) insn[0].imm = obj->maps[relo->map_idx].fd; relo->processed = true; break; - case RELO_EXTERN: + case RELO_EXTERN_VAR: ext = &obj->externs[relo->sym_off]; if (ext->type == EXT_KCFG) { insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; -- cgit v1.2.3 From aa0b8d43e9537d371cbd3f272d3403f2b15201af Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:52:27 -0700 Subject: libbpf: Record extern sym relocation first This patch records the extern sym relocs first before recording subprog relocs. The later patch will have relocs for extern kernel function call which is also using BPF_JMP | BPF_CALL. It will be easier to handle the extern symbols first in the later patch. is_call_insn() helper is added. The existing is_ldimm64() helper is renamed to is_ldimm64_insn() for consistency. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210325015227.1548623-1-kafai@fb.com --- tools/lib/bpf/libbpf.c | 63 +++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0ac53dce37b7..e1615c274250 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -573,14 +573,19 @@ static bool insn_is_subprog_call(const struct bpf_insn *insn) insn->off == 0; } -static bool is_ldimm64(struct bpf_insn *insn) +static bool is_ldimm64_insn(struct bpf_insn *insn) { return insn->code == (BPF_LD | BPF_IMM | BPF_DW); } +static bool is_call_insn(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL); +} + static bool insn_is_pseudo_func(struct bpf_insn *insn) { - return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; + return is_ldimm64_insn(insn) && insn->src_reg == BPF_PSEUDO_FUNC; } static int @@ -3408,31 +3413,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, reloc_desc->processed = false; - /* sub-program call relocation */ - if (insn->code == (BPF_JMP | BPF_CALL)) { - if (insn->src_reg != BPF_PSEUDO_CALL) { - pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name); - return -LIBBPF_ERRNO__RELOC; - } - /* text_shndx can be 0, if no default "main" program exists */ - if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { - sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); - pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n", - prog->name, sym_name, sym_sec_name); - return -LIBBPF_ERRNO__RELOC; - } - if (sym->st_value % BPF_INSN_SZ) { - pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n", - prog->name, sym_name, (size_t)sym->st_value); - return -LIBBPF_ERRNO__RELOC; - } - reloc_desc->type = RELO_CALL; - reloc_desc->insn_idx = insn_idx; - reloc_desc->sym_off = sym->st_value; - return 0; - } - - if (!is_ldimm64(insn)) { + if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) { pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", prog->name, sym_name, insn_idx, insn->code); return -LIBBPF_ERRNO__RELOC; @@ -3461,6 +3442,30 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return 0; } + /* sub-program call relocation */ + if (is_call_insn(insn)) { + if (insn->src_reg != BPF_PSEUDO_CALL) { + pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name); + return -LIBBPF_ERRNO__RELOC; + } + /* text_shndx can be 0, if no default "main" program exists */ + if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { + sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n", + prog->name, sym_name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + if (sym->st_value % BPF_INSN_SZ) { + pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n", + prog->name, sym_name, (size_t)sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_CALL; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { pr_warn("prog '%s': invalid relo against '%s' in special section 0x%x; forgot to initialize global var?..\n", prog->name, sym_name, shdr_idx); @@ -5700,7 +5705,7 @@ poison: /* poison second part of ldimm64 to avoid confusing error from * verifier about "unknown opcode 00" */ - if (is_ldimm64(insn)) + if (is_ldimm64_insn(insn)) bpf_core_poison_insn(prog, relo_idx, insn_idx + 1, insn + 1); bpf_core_poison_insn(prog, relo_idx, insn_idx, insn); return 0; @@ -5776,7 +5781,7 @@ poison: case BPF_LD: { __u64 imm; - if (!is_ldimm64(insn) || + if (!is_ldimm64_insn(insn) || insn[0].src_reg != 0 || insn[0].off != 0 || insn_idx + 1 >= prog->insns_cnt || insn[1].code != 0 || insn[1].dst_reg != 0 || -- cgit v1.2.3 From 5bd022ec01f060f30672cc6383b8b04e75a4310d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:52:34 -0700 Subject: libbpf: Support extern kernel function This patch is to make libbpf able to handle the following extern kernel function declaration and do the needed relocations before loading the bpf program to the kernel. extern int foo(struct sock *) __attribute__((section(".ksyms"))) In the collect extern phase, needed changes is made to bpf_object__collect_externs() and find_extern_btf_id() to collect extern function in ".ksyms" section. The func in the BTF datasec also needs to be replaced by an int var. The idea is similar to the existing handling in extern var. In case the BTF may not have a var, a dummy ksym var is added at the beginning of bpf_object__collect_externs() if there is func under ksyms datasec. It will also change the func linkage from extern to global which the kernel can support. It also assigns a param name if it does not have one. In the collect relo phase, it will record the kernel function call as RELO_EXTERN_FUNC. bpf_object__resolve_ksym_func_btf_id() is added to find the func btf_id of the running kernel. During actual relocation, it will patch the BPF_CALL instruction with src_reg = BPF_PSEUDO_FUNC_CALL and insn->imm set to the running kernel func's btf_id. The required LLVM patch: https://reviews.llvm.org/D93563 Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210325015234.1548923-1-kafai@fb.com --- tools/lib/bpf/libbpf.c | 174 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 162 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e1615c274250..7aad78dbb4b4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -186,6 +186,7 @@ enum reloc_type { RELO_CALL, RELO_DATA, RELO_EXTERN_VAR, + RELO_EXTERN_FUNC, RELO_SUBPROG_ADDR, }; @@ -1955,6 +1956,11 @@ static const char *btf_kind_str(const struct btf_type *t) return __btf_kind_str(btf_kind(t)); } +static enum btf_func_linkage btf_func_linkage(const struct btf_type *t) +{ + return (enum btf_func_linkage)BTF_INFO_VLEN(t->info); +} + /* * Fetch integer attribute of BTF map definition. Such attributes are * represented using a pointer to an array, in which dimensionality of array @@ -3019,7 +3025,7 @@ static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx) static int find_extern_btf_id(const struct btf *btf, const char *ext_name) { const struct btf_type *t; - const char *var_name; + const char *tname; int i, n; if (!btf) @@ -3029,14 +3035,18 @@ static int find_extern_btf_id(const struct btf *btf, const char *ext_name) for (i = 1; i <= n; i++) { t = btf__type_by_id(btf, i); - if (!btf_is_var(t)) + if (!btf_is_var(t) && !btf_is_func(t)) continue; - var_name = btf__name_by_offset(btf, t->name_off); - if (strcmp(var_name, ext_name)) + tname = btf__name_by_offset(btf, t->name_off); + if (strcmp(tname, ext_name)) continue; - if (btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) + if (btf_is_var(t) && + btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) + return -EINVAL; + + if (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_EXTERN) return -EINVAL; return i; @@ -3149,12 +3159,48 @@ static int find_int_btf_id(const struct btf *btf) return 0; } +static int add_dummy_ksym_var(struct btf *btf) +{ + int i, int_btf_id, sec_btf_id, dummy_var_btf_id; + const struct btf_var_secinfo *vs; + const struct btf_type *sec; + + sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC, + BTF_KIND_DATASEC); + if (sec_btf_id < 0) + return 0; + + sec = btf__type_by_id(btf, sec_btf_id); + vs = btf_var_secinfos(sec); + for (i = 0; i < btf_vlen(sec); i++, vs++) { + const struct btf_type *vt; + + vt = btf__type_by_id(btf, vs->type); + if (btf_is_func(vt)) + break; + } + + /* No func in ksyms sec. No need to add dummy var. */ + if (i == btf_vlen(sec)) + return 0; + + int_btf_id = find_int_btf_id(btf); + dummy_var_btf_id = btf__add_var(btf, + "dummy_ksym", + BTF_VAR_GLOBAL_ALLOCATED, + int_btf_id); + if (dummy_var_btf_id < 0) + pr_warn("cannot create a dummy_ksym var\n"); + + return dummy_var_btf_id; +} + static int bpf_object__collect_externs(struct bpf_object *obj) { struct btf_type *sec, *kcfg_sec = NULL, *ksym_sec = NULL; const struct btf_type *t; struct extern_desc *ext; - int i, n, off; + int i, n, off, dummy_var_btf_id; const char *ext_name, *sec_name; Elf_Scn *scn; GElf_Shdr sh; @@ -3166,6 +3212,10 @@ static int bpf_object__collect_externs(struct bpf_object *obj) if (elf_sec_hdr(obj, scn, &sh)) return -LIBBPF_ERRNO__FORMAT; + dummy_var_btf_id = add_dummy_ksym_var(obj->btf); + if (dummy_var_btf_id < 0) + return dummy_var_btf_id; + n = sh.sh_size / sh.sh_entsize; pr_debug("looking for externs among %d symbols...\n", n); @@ -3210,6 +3260,11 @@ static int bpf_object__collect_externs(struct bpf_object *obj) sec_name = btf__name_by_offset(obj->btf, sec->name_off); if (strcmp(sec_name, KCONFIG_SEC) == 0) { + if (btf_is_func(t)) { + pr_warn("extern function %s is unsupported under %s section\n", + ext->name, KCONFIG_SEC); + return -ENOTSUP; + } kcfg_sec = sec; ext->type = EXT_KCFG; ext->kcfg.sz = btf__resolve_size(obj->btf, t->type); @@ -3231,6 +3286,11 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return -ENOTSUP; } } else if (strcmp(sec_name, KSYMS_SEC) == 0) { + if (btf_is_func(t) && ext->is_weak) { + pr_warn("extern weak function %s is unsupported\n", + ext->name); + return -ENOTSUP; + } ksym_sec = sec; ext->type = EXT_KSYM; skip_mods_and_typedefs(obj->btf, t->type, @@ -3257,7 +3317,14 @@ static int bpf_object__collect_externs(struct bpf_object *obj) * extern variables in DATASEC */ int int_btf_id = find_int_btf_id(obj->btf); + /* For extern function, a dummy_var added earlier + * will be used to replace the vs->type and + * its name string will be used to refill + * the missing param's name. + */ + const struct btf_type *dummy_var; + dummy_var = btf__type_by_id(obj->btf, dummy_var_btf_id); for (i = 0; i < obj->nr_extern; i++) { ext = &obj->externs[i]; if (ext->type != EXT_KSYM) @@ -3276,12 +3343,32 @@ static int bpf_object__collect_externs(struct bpf_object *obj) ext_name = btf__name_by_offset(obj->btf, vt->name_off); ext = find_extern_by_name(obj, ext_name); if (!ext) { - pr_warn("failed to find extern definition for BTF var '%s'\n", - ext_name); + pr_warn("failed to find extern definition for BTF %s '%s'\n", + btf_kind_str(vt), ext_name); return -ESRCH; } - btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED; - vt->type = int_btf_id; + if (btf_is_func(vt)) { + const struct btf_type *func_proto; + struct btf_param *param; + int j; + + func_proto = btf__type_by_id(obj->btf, + vt->type); + param = btf_params(func_proto); + /* Reuse the dummy_var string if the + * func proto does not have param name. + */ + for (j = 0; j < btf_vlen(func_proto); j++) + if (param[j].type && !param[j].name_off) + param[j].name_off = + dummy_var->name_off; + vs->type = dummy_var_btf_id; + vt->info &= ~0xffff; + vt->info |= BTF_FUNC_GLOBAL; + } else { + btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + vt->type = int_btf_id; + } vs->offset = off; vs->size = sizeof(int); } @@ -3436,7 +3523,10 @@ static int bpf_program__record_reloc(struct bpf_program *prog, } pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n", prog->name, i, ext->name, ext->sym_idx, insn_idx); - reloc_desc->type = RELO_EXTERN_VAR; + if (insn->code == (BPF_JMP | BPF_CALL)) + reloc_desc->type = RELO_EXTERN_FUNC; + else + reloc_desc->type = RELO_EXTERN_VAR; reloc_desc->insn_idx = insn_idx; reloc_desc->sym_off = i; /* sym_off stores extern index */ return 0; @@ -6241,6 +6331,12 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) } relo->processed = true; break; + case RELO_EXTERN_FUNC: + ext = &obj->externs[relo->sym_off]; + insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; + insn[0].imm = ext->ksym.kernel_btf_id; + relo->processed = true; + break; case RELO_SUBPROG_ADDR: insn[0].src_reg = BPF_PSEUDO_FUNC; /* will be handled as a follow up pass */ @@ -7361,6 +7457,7 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj) { char sym_type, sym_name[500]; unsigned long long sym_addr; + const struct btf_type *t; struct extern_desc *ext; int ret, err = 0; FILE *f; @@ -7387,6 +7484,10 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj) if (!ext || ext->type != EXT_KSYM) continue; + t = btf__type_by_id(obj->btf, ext->btf_id); + if (!btf_is_var(t)) + continue; + if (ext->is_set && ext->ksym.addr != sym_addr) { pr_warn("extern (ksym) '%s' resolution is ambiguous: 0x%llx or 0x%llx\n", sym_name, ext->ksym.addr, sym_addr); @@ -7488,8 +7589,53 @@ static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, return 0; } +static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + int local_func_proto_id, kfunc_proto_id, kfunc_id; + const struct btf_type *kern_func; + struct btf *kern_btf = NULL; + int ret, kern_btf_fd = 0; + + local_func_proto_id = ext->ksym.type_id; + + kfunc_id = find_ksym_btf_id(obj, ext->name, BTF_KIND_FUNC, + &kern_btf, &kern_btf_fd); + if (kfunc_id < 0) { + pr_warn("extern (func ksym) '%s': not found in kernel BTF\n", + ext->name); + return kfunc_id; + } + + if (kern_btf != obj->btf_vmlinux) { + pr_warn("extern (func ksym) '%s': function in kernel module is not supported\n", + ext->name); + return -ENOTSUP; + } + + kern_func = btf__type_by_id(kern_btf, kfunc_id); + kfunc_proto_id = kern_func->type; + + ret = bpf_core_types_are_compat(obj->btf, local_func_proto_id, + kern_btf, kfunc_proto_id); + if (ret <= 0) { + pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with kernel [%d]\n", + ext->name, local_func_proto_id, kfunc_proto_id); + return -EINVAL; + } + + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = kern_btf_fd; + ext->ksym.kernel_btf_id = kfunc_id; + pr_debug("extern (func ksym) '%s': resolved to kernel [%d]\n", + ext->name, kfunc_id); + + return 0; +} + static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) { + const struct btf_type *t; struct extern_desc *ext; int i, err; @@ -7498,7 +7644,11 @@ static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) if (ext->type != EXT_KSYM || !ext->ksym.type_id) continue; - err = bpf_object__resolve_ksym_var_btf_id(obj, ext); + t = btf__type_by_id(obj->btf, ext->btf_id); + if (btf_is_var(t)) + err = bpf_object__resolve_ksym_var_btf_id(obj, ext); + else + err = bpf_object__resolve_ksym_func_btf_id(obj, ext); if (err) return err; } -- cgit v1.2.3 From 39cd9e0f6783fd2dd2b0e95500e34575b3707ed8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:52:40 -0700 Subject: bpf: selftests: Rename bictcp to bpf_cubic As a similar chanage in the kernel, this patch gives the proper name to the bpf cubic. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210325015240.1550074-1-kafai@fb.com --- tools/testing/selftests/bpf/progs/bpf_cubic.c | 30 +++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index 6939bfd8690f..33c4d2bded64 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -174,8 +174,8 @@ static __always_inline void bictcp_hystart_reset(struct sock *sk) * as long as it is used in one of the func ptr * under SEC(".struct_ops"). */ -SEC("struct_ops/bictcp_init") -void BPF_PROG(bictcp_init, struct sock *sk) +SEC("struct_ops/bpf_cubic_init") +void BPF_PROG(bpf_cubic_init, struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); @@ -192,7 +192,7 @@ void BPF_PROG(bictcp_init, struct sock *sk) * The remaining tcp-cubic functions have an easier way. */ SEC("no-sec-prefix-bictcp_cwnd_event") -void BPF_PROG(bictcp_cwnd_event, struct sock *sk, enum tcp_ca_event event) +void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); @@ -384,7 +384,7 @@ tcp_friendliness: } /* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */ -void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -403,7 +403,7 @@ void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) tcp_cong_avoid_ai(tp, ca->cnt, acked); } -__u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk) +__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -420,7 +420,7 @@ __u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -void BPF_STRUCT_OPS(bictcp_state, struct sock *sk, __u8 new_state) +void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); @@ -496,7 +496,7 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay) } } -void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk, +void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); @@ -525,7 +525,7 @@ void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk, hystart_update(sk, delay); } -__u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk) +__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -534,12 +534,12 @@ __u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk) SEC(".struct_ops") struct tcp_congestion_ops cubic = { - .init = (void *)bictcp_init, - .ssthresh = (void *)bictcp_recalc_ssthresh, - .cong_avoid = (void *)bictcp_cong_avoid, - .set_state = (void *)bictcp_state, - .undo_cwnd = (void *)tcp_reno_undo_cwnd, - .cwnd_event = (void *)bictcp_cwnd_event, - .pkts_acked = (void *)bictcp_acked, + .init = (void *)bpf_cubic_init, + .ssthresh = (void *)bpf_cubic_recalc_ssthresh, + .cong_avoid = (void *)bpf_cubic_cong_avoid, + .set_state = (void *)bpf_cubic_state, + .undo_cwnd = (void *)bpf_cubic_undo_cwnd, + .cwnd_event = (void *)bpf_cubic_cwnd_event, + .pkts_acked = (void *)bpf_cubic_acked, .name = "bpf_cubic", }; -- cgit v1.2.3 From 78e60bbbe8e8f614b6a453d8a780d9e6f77749a8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:52:46 -0700 Subject: bpf: selftests: Bpf_cubic and bpf_dctcp calling kernel functions This patch removes the bpf implementation of tcp_slow_start() and tcp_cong_avoid_ai(). Instead, it directly uses the kernel implementation. It also replaces the bpf_cubic_undo_cwnd implementation by directly calling tcp_reno_undo_cwnd(). bpf_dctcp also directly calls tcp_reno_cong_avoid() instead. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210325015246.1551062-1-kafai@fb.com --- tools/testing/selftests/bpf/bpf_tcp_helpers.h | 29 ++------------------------- tools/testing/selftests/bpf/progs/bpf_cubic.c | 6 +++--- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 22 ++++++-------------- 3 files changed, 11 insertions(+), 46 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index 91f0fac632f4..029589c008c9 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -187,16 +187,6 @@ struct tcp_congestion_ops { typeof(y) __y = (y); \ __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) -static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) -{ - __u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); - - acked -= cwnd - tp->snd_cwnd; - tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); - - return acked; -} - static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp) { return tp->snd_cwnd < tp->snd_ssthresh; @@ -213,22 +203,7 @@ static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk) return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); } -static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) -{ - /* If credits accumulated at a higher w, apply them gently now. */ - if (tp->snd_cwnd_cnt >= w) { - tp->snd_cwnd_cnt = 0; - tp->snd_cwnd++; - } - - tp->snd_cwnd_cnt += acked; - if (tp->snd_cwnd_cnt >= w) { - __u32 delta = tp->snd_cwnd_cnt / w; - - tp->snd_cwnd_cnt -= delta * w; - tp->snd_cwnd += delta; - } - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); -} +extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; +extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; #endif diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index 33c4d2bded64..f62df4d023f9 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -525,11 +525,11 @@ void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, hystart_update(sk, delay); } +extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; + __u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); - - return max(tp->snd_cwnd, tp->prior_cwnd); + return tcp_reno_undo_cwnd(sk); } SEC(".struct_ops") diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 4dc1a967776a..fd42247da8b4 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -194,22 +194,12 @@ __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); } -SEC("struct_ops/tcp_reno_cong_avoid") -void BPF_PROG(tcp_reno_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (!tcp_is_cwnd_limited(sk)) - return; +extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; - /* In "safe" area, increase. */ - if (tcp_in_slow_start(tp)) { - acked = tcp_slow_start(tp, acked); - if (!acked) - return; - } - /* In dangerous area, increase slowly. */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); +SEC("struct_ops/dctcp_reno_cong_avoid") +void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +{ + tcp_reno_cong_avoid(sk, ack, acked); } SEC(".struct_ops") @@ -226,7 +216,7 @@ struct tcp_congestion_ops dctcp = { .in_ack_event = (void *)dctcp_update_alpha, .cwnd_event = (void *)dctcp_cwnd_event, .ssthresh = (void *)dctcp_ssthresh, - .cong_avoid = (void *)tcp_reno_cong_avoid, + .cong_avoid = (void *)dctcp_cong_avoid, .undo_cwnd = (void *)dctcp_cwnd_undo, .set_state = (void *)dctcp_state, .flags = TCP_CONG_NEEDS_ECN, -- cgit v1.2.3 From 7bd1590d4eba1583f6ee85e8cfe556505f761e19 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:52:52 -0700 Subject: bpf: selftests: Add kfunc_call test This patch adds a few kernel function bpf_kfunc_call_test*() for the selftest's test_run purpose. They will be allowed for tc_cls prog. The selftest calling the kernel function bpf_kfunc_call_test*() is also added in this patch. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210325015252.1551395-1-kafai@fb.com --- include/linux/bpf.h | 6 +++ net/bpf/test_run.c | 28 ++++++++++ net/core/filter.c | 1 + .../testing/selftests/bpf/prog_tests/kfunc_call.c | 59 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/kfunc_call_test.c | 47 +++++++++++++++++ .../selftests/bpf/progs/kfunc_call_test_subprog.c | 42 +++++++++++++++ 6 files changed, 183 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/kfunc_call.c create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test.c create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b5b7967e3ff3..9fdd839b418c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1532,6 +1532,7 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +bool bpf_prog_test_check_kfunc_call(u32 kfunc_id); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1731,6 +1732,11 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, return -ENOTSUPP; } +static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id) +{ + return false; +} + static inline void bpf_map_put(struct bpf_map *map) { } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 4aabf71cd95d..a5d72c48fb66 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -2,6 +2,7 @@ /* Copyright (c) 2017 Facebook */ #include +#include #include #include #include @@ -213,10 +214,37 @@ int noinline bpf_modify_return_test(int a, int *b) *b += 1; return a + *b; } + +u64 noinline bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d) +{ + return a + b + c + d; +} + +int noinline bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b) +{ + return a + b; +} + +struct sock * noinline bpf_kfunc_call_test3(struct sock *sk) +{ + return sk; +} + __diag_pop(); ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); +BTF_SET_START(test_sk_kfunc_ids) +BTF_ID(func, bpf_kfunc_call_test1) +BTF_ID(func, bpf_kfunc_call_test2) +BTF_ID(func, bpf_kfunc_call_test3) +BTF_SET_END(test_sk_kfunc_ids) + +bool bpf_prog_test_check_kfunc_call(u32 kfunc_id) +{ + return btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id); +} + static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { diff --git a/net/core/filter.c b/net/core/filter.c index 17dc159ec40c..cae56d08a670 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9813,6 +9813,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, + .check_kfunc_call = bpf_prog_test_check_kfunc_call, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c new file mode 100644 index 000000000000..7fc0951ee75f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include +#include +#include "kfunc_call_test.skel.h" +#include "kfunc_call_test_subprog.skel.h" + +static void test_main(void) +{ + struct kfunc_call_test *skel; + int prog_fd, retval, err; + + skel = kfunc_call_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test1)"); + ASSERT_EQ(retval, 12, "test1-retval"); + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test2); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test2)"); + ASSERT_EQ(retval, 3, "test2-retval"); + + kfunc_call_test__destroy(skel); +} + +static void test_subprog(void) +{ + struct kfunc_call_test_subprog *skel; + int prog_fd, retval, err; + + skel = kfunc_call_test_subprog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test1)"); + ASSERT_EQ(retval, 10, "test1-retval"); + ASSERT_NEQ(skel->data->active_res, -1, "active_res"); + ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state"); + + kfunc_call_test_subprog__destroy(skel); +} + +void test_kfunc_call(void) +{ + if (test__start_subtest("main")) + test_main(); + + if (test__start_subtest("subprog")) + test_subprog(); +} diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c new file mode 100644 index 000000000000..470f8723e463 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include +#include +#include "bpf_tcp_helpers.h" + +extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym; +extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, + __u32 c, __u64 d) __ksym; + +SEC("classifier") +int kfunc_call_test2(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + return bpf_kfunc_call_test2((struct sock *)sk, 1, 2); +} + +SEC("classifier") +int kfunc_call_test1(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + __u64 a = 1ULL << 32; + __u32 ret; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + a = bpf_kfunc_call_test1((struct sock *)sk, 1, a | 2, 3, a | 4); + ret = a >> 32; /* ret should be 2 */ + ret += (__u32)a; /* ret should be 12 */ + + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c new file mode 100644 index 000000000000..b2dcb7d9cb03 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include +#include +#include "bpf_tcp_helpers.h" + +extern const int bpf_prog_active __ksym; +extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, + __u32 c, __u64 d) __ksym; +extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; +int active_res = -1; +int sk_state = -1; + +int __noinline f1(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + int *active; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, + bpf_get_smp_processor_id()); + if (active) + active_res = *active; + + sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state; + + return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4); +} + +SEC("classifier") +int kfunc_call_test1(struct __sk_buff *skb) +{ + return f1(skb); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From c127ffa23e41e6cb2251d5c30e54d60777034caa Mon Sep 17 00:00:00 2001 From: Baowen Zheng Date: Fri, 26 Mar 2021 14:09:37 +0100 Subject: selftests: tc-testing: add action police selftest for packets per second Add selftest cases in action police for packets per second. These tests depend on corresponding iproute2 command support. Signed-off-by: Baowen Zheng Signed-off-by: Simon Horman Reviewed-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- .../tc-testing/tc-tests/actions/police.json | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json index b8268da5adaa..8e45792703ed 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json @@ -764,5 +764,53 @@ "teardown": [ "$TC actions flush action police" ] + }, + { + "id": "cdd7", + "name": "Add valid police action with packets per second rate limit", + "category": [ + "actions", + "police" + ], + "setup": [ + [ + "$TC actions flush action police", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action police pkts_rate 1000 pkts_burst 200 index 1", + "expExitCode": "0", + "verifyCmd": "$TC actions ls action police", + "matchPattern": "action order [0-9]*: police 0x1 rate 0bit burst 0b mtu 4096Mb pkts_rate 1000 pkts_burst 200", + "matchCount": "1", + "teardown": [ + "$TC actions flush action police" + ] + }, + { + "id": "f5bc", + "name": "Add invalid police action with both bps and pps", + "category": [ + "actions", + "police" + ], + "setup": [ + [ + "$TC actions flush action police", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action police rate 1kbit burst 10k pkts_rate 1000 pkts_burst 200 index 1", + "expExitCode": "255", + "verifyCmd": "$TC actions ls action police", + "matchPattern": "action order [0-9]*: police 0x1 ", + "matchCount": "0", + "teardown": [ + "$TC actions flush action police" + ] } ] -- cgit v1.2.3 From 53b61f29367df398243b7298ad1e5793c289a493 Mon Sep 17 00:00:00 2001 From: Baowen Zheng Date: Fri, 26 Mar 2021 14:09:38 +0100 Subject: selftests: forwarding: Add tc-police tests for packets per second Test tc-police action for packets per second. The test is mainly in scenarios Rx policing and Tx policing. The test passes with veth pairs ports. Signed-off-by: Baowen Zheng Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/lib.sh | 9 ++++ .../testing/selftests/net/forwarding/tc_police.sh | 56 ++++++++++++++++++++++ 2 files changed, 65 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 05c05e02bade..42e28c983d41 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -772,6 +772,15 @@ rate() echo $((8 * (t1 - t0) / interval)) } +packets_rate() +{ + local t0=$1; shift + local t1=$1; shift + local interval=$1; shift + + echo $(((t1 - t0) / interval)) +} + mac_get() { local if_name=$1 diff --git a/tools/testing/selftests/net/forwarding/tc_police.sh b/tools/testing/selftests/net/forwarding/tc_police.sh index 160f9cccdfb7..4f9f17cb45d6 100755 --- a/tools/testing/selftests/net/forwarding/tc_police.sh +++ b/tools/testing/selftests/net/forwarding/tc_police.sh @@ -35,6 +35,8 @@ ALL_TESTS=" police_shared_test police_rx_mirror_test police_tx_mirror_test + police_pps_rx_test + police_pps_tx_test " NUM_NETIFS=6 source tc_common.sh @@ -290,6 +292,60 @@ police_tx_mirror_test() police_mirror_common_test $rp2 egress "police tx and mirror" } +police_pps_common_test() +{ + local test_name=$1; shift + + RET=0 + + # Rule to measure bandwidth on ingress of $h2 + tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \ + dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \ + action drop + + mausezahn $h1 -a own -b $(mac_get $rp1) -A 192.0.2.1 -B 198.51.100.1 \ + -t udp sp=12345,dp=54321 -p 1000 -c 0 -q & + + local t0=$(tc_rule_stats_get $h2 1 ingress .packets) + sleep 10 + local t1=$(tc_rule_stats_get $h2 1 ingress .packets) + + local er=$((2000)) + local nr=$(packets_rate $t0 $t1 10) + local nr_pct=$((100 * (nr - er) / er)) + ((-10 <= nr_pct && nr_pct <= 10)) + check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-10%." + + log_test "$test_name" + + { kill %% && wait %%; } 2>/dev/null + tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower +} + +police_pps_rx_test() +{ + # Rule to police traffic destined to $h2 on ingress of $rp1 + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \ + action police pkts_rate 2000 pkts_burst 400 conform-exceed drop/ok + + police_pps_common_test "police pps on rx" + + tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower +} + +police_pps_tx_test() +{ + # Rule to police traffic destined to $h2 on egress of $rp2 + tc filter add dev $rp2 egress protocol ip pref 1 handle 101 flower \ + dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \ + action police pkts_rate 2000 pkts_burst 400 conform-exceed drop/ok + + police_pps_common_test "police pps on tx" + + tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower +} + setup_prepare() { h1=${NETIFS[p1]} -- cgit v1.2.3 From dfc4ae3372182a168146745def03d877f31fcf2f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Mar 2021 13:08:20 -0700 Subject: selftests/powerpc: unmark non-kernel-doc comments Drop the 'beginning of kernel-doc' notation markers (/**) in places that are not in kernel-doc format. Signed-off-by: Randy Dunlap Reviewed-by: Daniel Axtens Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210325200820.16594-1-rdunlap@infradead.org --- tools/testing/selftests/powerpc/tm/tm-trap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/tm/tm-trap.c b/tools/testing/selftests/powerpc/tm/tm-trap.c index c75960af8018..11521077f915 100644 --- a/tools/testing/selftests/powerpc/tm/tm-trap.c +++ b/tools/testing/selftests/powerpc/tm/tm-trap.c @@ -66,7 +66,7 @@ void trap_signal_handler(int signo, siginfo_t *si, void *uc) /* Get thread endianness: extract bit LE from MSR */ thread_endianness = MSR_LE & ucp->uc_mcontext.gp_regs[PT_MSR]; - /*** + /* * Little-Endian Machine */ @@ -126,7 +126,7 @@ void trap_signal_handler(int signo, siginfo_t *si, void *uc) } } - /*** + /* * Big-Endian Machine */ -- cgit v1.2.3 From c3572a0b731fc0de13afef300f1f5afb929e4d4c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 29 Mar 2021 13:09:46 +0300 Subject: selftests: mlxsw: Test matchall failure with protocol match The driver can only offload matchall rules that do not match on a protocol. Test that matchall rules that match on a protocol are vetoed. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- .../selftests/drivers/net/mlxsw/tc_restrictions.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh index b4dbda706c4d..5ec3beb637c8 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh @@ -11,6 +11,7 @@ ALL_TESTS=" matchall_mirror_behind_flower_ingress_test matchall_sample_behind_flower_ingress_test matchall_mirror_behind_flower_egress_test + matchall_proto_match_test police_limits_test multi_police_test " @@ -291,6 +292,22 @@ matchall_mirror_behind_flower_egress_test() matchall_behind_flower_egress_test "mirror" "mirred egress mirror dev $swp2" } +matchall_proto_match_test() +{ + RET=0 + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \ + matchall skip_sw \ + action sample group 1 rate 100 + check_fail $? "Incorrect success to add matchall rule with protocol match" + + tc qdisc del dev $swp1 clsact + + log_test "matchall protocol match" +} + police_limits_test() { RET=0 -- cgit v1.2.3 From 7ede22e6583290c832306e23414cc2c1e336c4b7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 29 Mar 2021 13:09:48 +0300 Subject: selftests: mlxsw: Test vetoing of double sampling Test that two sampling rules cannot be configured on the same port with the same trigger. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- .../selftests/drivers/net/mlxsw/tc_sample.sh | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh index 57b05f042787..093bed088ad0 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh @@ -34,6 +34,7 @@ lib_dir=$(dirname $0)/../../../net/forwarding ALL_TESTS=" tc_sample_rate_test tc_sample_max_rate_test + tc_sample_conflict_test tc_sample_group_conflict_test tc_sample_md_iif_test tc_sample_md_lag_iif_test @@ -272,6 +273,35 @@ tc_sample_max_rate_test() log_test "tc sample maximum rate" } +tc_sample_conflict_test() +{ + RET=0 + + # Test that two sampling rules cannot be configured on the same port, + # even when they share the same parameters. + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + tc filter add dev $rp1 ingress protocol all pref 2 handle 102 matchall \ + skip_sw action sample rate 1024 group 1 &> /dev/null + check_fail $? "Managed to configure second sampling rule" + + # Delete the first rule and make sure the second rule can now be + # configured. + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall + + tc filter add dev $rp1 ingress protocol all pref 2 handle 102 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule after deletion" + + log_test "tc sample conflict test" + + tc filter del dev $rp1 ingress protocol all pref 2 handle 102 matchall +} + tc_sample_group_conflict_test() { RET=0 -- cgit v1.2.3 From 2ba4badca9977b64c966b0177920daadbd5501fe Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 29 Mar 2021 22:41:56 -0700 Subject: bpf: selftests: Update clang requirement in README.rst for testing kfunc call This patch updates the README.rst to specify the clang requirement to compile the bpf selftests that call kernel function. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210330054156.2933804-1-kafai@fb.com --- tools/testing/selftests/bpf/README.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 3464161c8eea..65fe318d1e71 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -179,3 +179,17 @@ types, which was introduced in `Clang 13`__. The older Clang versions will either crash when compiling these tests, or generate an incorrect BTF. __ https://reviews.llvm.org/D83289 + +Kernel function call test and Clang version +=========================================== + +Some selftests (e.g. kfunc_call and bpf_tcp_ca) require a LLVM support +to generate extern function in BTF. It was introduced in `Clang 13`__. + +Without it, the error from compiling bpf selftests looks like: + +.. code-block:: console + + libbpf: failed to find BTF for extern 'tcp_slow_start' [25] section: -2 + +__ https://reviews.llvm.org/D93563 -- cgit v1.2.3 From 05d817031ff9686a8206039b19e37616cf9e1d44 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 26 Mar 2021 21:25:02 -0700 Subject: libbpf: Fix memory leak when emitting final btf_ext Free temporary allocated memory used to construct finalized .BTF.ext data. Found by Coverity static analysis on libbpf's Github repo. Fixes: 8fd27bf69b86 ("libbpf: Add BPF static linker BTF and BTF.ext support") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210327042502.969745-1-andrii@kernel.org --- tools/lib/bpf/linker.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index a29d62ff8041..46b16cbdcda3 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -1906,8 +1906,10 @@ static int finalize_btf_ext(struct bpf_linker *linker) struct dst_sec *sec = &linker->secs[i]; sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->func_info); - if (sz < 0) - return sz; + if (sz < 0) { + err = sz; + goto out; + } cur += sz; } @@ -1921,8 +1923,10 @@ static int finalize_btf_ext(struct bpf_linker *linker) struct dst_sec *sec = &linker->secs[i]; sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->line_info); - if (sz < 0) - return sz; + if (sz < 0) { + err = sz; + goto out; + } cur += sz; } @@ -1936,8 +1940,10 @@ static int finalize_btf_ext(struct bpf_linker *linker) struct dst_sec *sec = &linker->secs[i]; sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->core_relo_info); - if (sz < 0) - return sz; + if (sz < 0) { + err = sz; + goto out; + } cur += sz; } @@ -1948,8 +1954,10 @@ static int finalize_btf_ext(struct bpf_linker *linker) if (err) { linker->btf_ext = NULL; pr_warn("failed to parse final .BTF.ext data: %d\n", err); - return err; + goto out; } - return 0; +out: + free(data); + return err; } -- cgit v1.2.3 From 292c5ed168597df85f53cb03ec3e831b18969b62 Mon Sep 17 00:00:00 2001 From: Fabian Hemmer Date: Fri, 26 Feb 2021 02:52:23 -0500 Subject: perf tools: Preserve identifier id in OCaml demangler Some OCaml developers reported that this bit of information is sometimes useful for disambiguating functions for which the OCaml compiler assigns the same name, e.g. nested or inlined functions. Signed-off-by: Fabian Hemmer Link: http://lore.kernel.org/lkml/20210226075223.p3s5oz4jbxwnqjtv@nyu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/demangle-ocaml-test.c | 6 +++--- tools/perf/util/demangle-ocaml.c | 12 ------------ 2 files changed, 3 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/demangle-ocaml-test.c b/tools/perf/tests/demangle-ocaml-test.c index 2fac7d762c0d..0043be812355 100644 --- a/tools/perf/tests/demangle-ocaml-test.c +++ b/tools/perf/tests/demangle-ocaml-test.c @@ -19,11 +19,11 @@ int test__demangle_ocaml(struct test *test __maybe_unused, int subtest __maybe_u { "main", NULL }, { "camlStdlib__array__map_154", - "Stdlib.array.map" }, + "Stdlib.array.map_154" }, { "camlStdlib__anon_fn$5bstdlib$2eml$3a334$2c0$2d$2d54$5d_1453", - "Stdlib.anon_fn[stdlib.ml:334,0--54]" }, + "Stdlib.anon_fn[stdlib.ml:334,0--54]_1453" }, { "camlStdlib__bytes__$2b$2b_2205", - "Stdlib.bytes.++" }, + "Stdlib.bytes.++_2205" }, }; for (i = 0; i < ARRAY_SIZE(test_cases); i++) { diff --git a/tools/perf/util/demangle-ocaml.c b/tools/perf/util/demangle-ocaml.c index 3df14e67c622..9d707bb60b4b 100644 --- a/tools/perf/util/demangle-ocaml.c +++ b/tools/perf/util/demangle-ocaml.c @@ -64,17 +64,5 @@ ocaml_demangle_sym(const char *sym) } result[j] = '\0'; - /* scan backwards to remove an "_" followed by decimal digits */ - if (j != 0 && isdigit(result[j - 1])) { - while (--j) { - if (!isdigit(result[j])) { - break; - } - } - if (result[j] == '_') { - result[j] = '\0'; - } - } - return result; } -- cgit v1.2.3 From 9f33df73a929ed91dddad036f518d690b3094eda Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:00 +0200 Subject: selftests: xsk: Don't call worker_pkt_dump() for stats test For TEST_TYPE_STATS, worker_pkt_validate() that places frames onto pkt_buf is not called. Therefore, when dump mode is set, don't call worker_pkt_dump() for mentioned test type, so that it won't crash on pkt_buf() access. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-2-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 1e21a3172687..09429ed2ddf6 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -999,7 +999,7 @@ static void testapp_validate(void) pthread_join(t1, NULL); pthread_join(t0, NULL); - if (debug_pkt_dump) { + if (debug_pkt_dump && test_type != TEST_TYPE_STATS) { worker_pkt_dump(); for (int iter = 0; iter < num_frames - 1; iter++) { free(pkt_buf[iter]->payload); -- cgit v1.2.3 From e623bfdef713ddda1b9f57d1759df3a1cd97255a Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:01 +0200 Subject: selftests: xsk: Remove struct ifaceconfigobj ifaceconfigobj is not really useful, it is possible to keep the functionality and simplify the code. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-3-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 65 +++++++++++++++----------------- tools/testing/selftests/bpf/xdpxceiver.h | 9 ----- 2 files changed, 30 insertions(+), 44 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 09429ed2ddf6..e2ff3cd48ccf 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -93,6 +93,13 @@ typedef __u16 __sum16; #include "xdpxceiver.h" #include "../kselftest.h" +static const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; +static const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; +static const char *IP1 = "192.168.100.162"; +static const char *IP2 = "192.168.100.161"; +static const u16 UDP_PORT1 = 2020; +static const u16 UDP_PORT2 = 2121; + static void __exit_with_error(int error, const char *file, const char *func, int line) { if (configured_mode == TEST_MODE_UNCONFIGURED) { @@ -1053,25 +1060,24 @@ static void testapp_stats(void) print_ksft_result(); } -static void init_iface_config(struct ifaceconfigobj *ifaceconfig) +static void init_iface(struct ifobject *ifobj, const char *dst_mac, + const char *src_mac, const char *dst_ip, + const char *src_ip, const u16 dst_port, + const u16 src_port) { - /*Init interface0 */ - ifdict[0]->fv.vector = tx; - memcpy(ifdict[0]->dst_mac, ifaceconfig->dst_mac, ETH_ALEN); - memcpy(ifdict[0]->src_mac, ifaceconfig->src_mac, ETH_ALEN); - ifdict[0]->dst_ip = ifaceconfig->dst_ip.s_addr; - ifdict[0]->src_ip = ifaceconfig->src_ip.s_addr; - ifdict[0]->dst_port = ifaceconfig->dst_port; - ifdict[0]->src_port = ifaceconfig->src_port; - - /*Init interface1 */ - ifdict[1]->fv.vector = rx; - memcpy(ifdict[1]->dst_mac, ifaceconfig->src_mac, ETH_ALEN); - memcpy(ifdict[1]->src_mac, ifaceconfig->dst_mac, ETH_ALEN); - ifdict[1]->dst_ip = ifaceconfig->src_ip.s_addr; - ifdict[1]->src_ip = ifaceconfig->dst_ip.s_addr; - ifdict[1]->dst_port = ifaceconfig->src_port; - ifdict[1]->src_port = ifaceconfig->dst_port; + struct in_addr ip; + + memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN); + memcpy(ifobj->src_mac, src_mac, ETH_ALEN); + + inet_aton(dst_ip, &ip); + ifobj->dst_ip = ip.s_addr; + + inet_aton(src_ip, &ip); + ifobj->src_ip = ip.s_addr; + + ifobj->dst_port = dst_port; + ifobj->src_port = src_port; } static void *nsdisablemodethread(void *args) @@ -1175,26 +1181,11 @@ static void run_pkt_test(int mode, int type) int main(int argc, char **argv) { struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY }; + int i, j; if (setrlimit(RLIMIT_MEMLOCK, &_rlim)) exit_with_error(errno); - const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; - const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; - const char *IP1 = "192.168.100.162"; - const char *IP2 = "192.168.100.161"; - u16 UDP_DST_PORT = 2020; - u16 UDP_SRC_PORT = 2121; - int i, j; - - ifaceconfig = malloc(sizeof(struct ifaceconfigobj)); - memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN); - memcpy(ifaceconfig->src_mac, MAC2, ETH_ALEN); - inet_aton(IP1, &ifaceconfig->dst_ip); - inet_aton(IP2, &ifaceconfig->src_ip); - ifaceconfig->dst_port = UDP_DST_PORT; - ifaceconfig->src_port = UDP_SRC_PORT; - for (int i = 0; i < MAX_INTERFACES; i++) { ifdict[i] = malloc(sizeof(struct ifobject)); if (!ifdict[i]) @@ -1209,7 +1200,11 @@ int main(int argc, char **argv) num_frames = ++opt_pkt_count; - init_iface_config(ifaceconfig); + ifdict[0]->fv.vector = tx; + init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2); + + ifdict[1]->fv.vector = rx; + init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1); disable_xdp_mode(XDP_FLAGS_DRV_MODE); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 30314ef305c2..8f9308099318 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -125,15 +125,6 @@ struct generic_data { u32 seqnum; }; -struct ifaceconfigobj { - u8 dst_mac[ETH_ALEN]; - u8 src_mac[ETH_ALEN]; - struct in_addr dst_ip; - struct in_addr src_ip; - u16 src_port; - u16 dst_port; -} *ifaceconfig; - struct ifobject { int ifindex; int ifdict_index; -- cgit v1.2.3 From 7519c387e69d367075bf493de8a9ea427c9d2a1b Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:02 +0200 Subject: selftests: xsk: Remove unused function Probably it was ported from xdpsock but is not used anywhere. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-4-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index e2ff3cd48ccf..593e8be41fce 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -153,19 +153,6 @@ static void *memset32_htonl(void *dest, u32 val, u32 size) return dest; } -/* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static inline unsigned short from32to16(unsigned int x) -{ - /* add up 16-bit and 16-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - /* * Fold a partial checksum * This function code has been taken from -- cgit v1.2.3 From 965d2cb0f675e5f60af441c9bd430c284d66179d Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:03 +0200 Subject: selftests: xsk: Remove inline keyword from source file Follow the kernel coding style guidelines and let compiler do the decision about inlining. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-5-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 593e8be41fce..77880abd818c 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -158,7 +158,7 @@ static void *memset32_htonl(void *dest, u32 val, u32 size) * This function code has been taken from * Linux kernel include/asm-generic/checksum.h */ -static inline __u16 csum_fold(__u32 csum) +static __u16 csum_fold(__u32 csum) { u32 sum = (__force u32)csum; @@ -171,7 +171,7 @@ static inline __u16 csum_fold(__u32 csum) * This function code has been taken from * Linux kernel lib/checksum.c */ -static inline u32 from64to32(u64 x) +static u32 from64to32(u64 x) { /* add up 32-bit and 32-bit for 32+c bit */ x = (x & 0xffffffff) + (x >> 32); @@ -180,13 +180,11 @@ static inline u32 from64to32(u64 x) return (u32)x; } -__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum); - /* * This function code has been taken from * Linux kernel lib/checksum.c */ -__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +static __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) { unsigned long long s = (__force u32)sum; @@ -204,13 +202,12 @@ __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u3 * This function has been taken from * Linux kernel include/asm-generic/checksum.h */ -static inline __u16 -csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +static __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } -static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) +static u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) { u32 csum = 0; u32 cnt = 0; @@ -500,7 +497,7 @@ static void kick_tx(struct xsk_socket_info *xsk) exit_with_error(errno); } -static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) +static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) { unsigned int rcvd; u32 idx; @@ -605,7 +602,7 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) complete_tx_only(xsk, batch_size); } -static inline int get_batch_size(int pkt_cnt) +static int get_batch_size(int pkt_cnt) { if (!opt_pkt_count) return BATCH_SIZE; -- cgit v1.2.3 From aa2d61c154f9387883664e3873bc0700419640a3 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:04 +0200 Subject: selftests: xsk: Simplify frame traversal in dumping thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Store offsets to each layer in a separate variables rather than compute them every single time. Signed-off-by: Björn Töpel Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-6-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 47 ++++++++++++++------------------ 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 77880abd818c..ae2f7d71f041 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -658,45 +658,40 @@ static void tx_only_all(struct ifobject *ifobject) static void worker_pkt_dump(void) { - struct in_addr ipaddr; + struct ethhdr *ethhdr; + struct iphdr *iphdr; + struct udphdr *udphdr; + char s[128]; + int payload; + void *ptr; fprintf(stdout, "---------------------------------------\n"); for (int iter = 0; iter < num_frames - 1; iter++) { + ptr = pkt_buf[iter]->payload; + ethhdr = ptr; + iphdr = ptr + sizeof(*ethhdr); + udphdr = ptr + sizeof(*ethhdr) + sizeof(*iphdr); + /*extract L2 frame */ fprintf(stdout, "DEBUG>> L2: dst mac: "); for (int i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ((struct ethhdr *) - pkt_buf[iter]->payload)->h_dest[i]); + fprintf(stdout, "%02X", ethhdr->h_dest[i]); fprintf(stdout, "\nDEBUG>> L2: src mac: "); for (int i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ((struct ethhdr *) - pkt_buf[iter]->payload)->h_source[i]); + fprintf(stdout, "%02X", ethhdr->h_source[i]); /*extract L3 frame */ - fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", - ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->ihl); - - ipaddr.s_addr = - ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->saddr; - fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", inet_ntoa(ipaddr)); - - ipaddr.s_addr = - ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->daddr; - fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", inet_ntoa(ipaddr)); - + fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl); + fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", + inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s))); + fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", + inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s))); /*extract L4 frame */ - fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", - ntohs(((struct udphdr *)(pkt_buf[iter]->payload + - sizeof(struct ethhdr) + - sizeof(struct iphdr)))->source)); - - fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", - ntohs(((struct udphdr *)(pkt_buf[iter]->payload + - sizeof(struct ethhdr) + - sizeof(struct iphdr)))->dest)); + fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source)); + fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest)); /*extract L5 frame */ - int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE)); + payload = *((uint32_t *)(ptr + PKT_HDR_SIZE)); if (payload == EOT) { print_verbose("End-of-transmission frame received\n"); -- cgit v1.2.3 From 10397994d30f2de51bfd9321ed9ddb789464f572 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:05 +0200 Subject: libbpf: xsk: Use bpf_link Currently, if there are multiple xdpsock instances running on a single interface and in case one of the instances is terminated, the rest of them are left in an inoperable state due to the fact of unloaded XDP prog from interface. Consider the scenario below: // load xdp prog and xskmap and add entry to xskmap at idx 10 $ sudo ./xdpsock -i ens801f0 -t -q 10 // add entry to xskmap at idx 11 $ sudo ./xdpsock -i ens801f0 -t -q 11 terminate one of the processes and another one is unable to work due to the fact that the XDP prog was unloaded from interface. To address that, step away from setting bpf prog in favour of bpf_link. This means that refcounting of BPF resources will be done automatically by bpf_link itself. Provide backward compatibility by checking if underlying system is bpf_link capable. Do this by looking up/creating bpf_link on loopback device. If it failed in any way, stick with netlink-based XDP prog. therwise, use bpf_link-based logic. When setting up BPF resources during xsk socket creation, check whether bpf_link for a given ifindex already exists via set of calls to bpf_link_get_next_id -> bpf_link_get_fd_by_id -> bpf_obj_get_info_by_fd and comparing the ifindexes from bpf_link and xsk socket. For case where resources exist but they are not AF_XDP related, bail out and ask user to remove existing prog and then retry. Lastly, do a bit of refactoring within __xsk_setup_xdp_prog and pull out existing code branches based on prog_id value onto separate functions that are responsible for resource initialization if prog_id was 0 and for lookup existing resources for non-zero prog_id as that implies that XDP program is present on the underlying net device. This in turn makes it easier to follow, especially the teardown part of both branches. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-7-maciej.fijalkowski@intel.com --- tools/lib/bpf/xsk.c | 258 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 213 insertions(+), 45 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 526fc35c0b23..95da0e19f4a5 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "bpf.h" #include "libbpf.h" @@ -70,8 +71,10 @@ struct xsk_ctx { int ifindex; struct list_head list; int prog_fd; + int link_fd; int xsks_map_fd; char ifname[IFNAMSIZ]; + bool has_bpf_link; }; struct xsk_socket { @@ -409,7 +412,7 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk) static const int log_buf_size = 16 * 1024; struct xsk_ctx *ctx = xsk->ctx; char log_buf[log_buf_size]; - int err, prog_fd; + int prog_fd; /* This is the fallback C-program: * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) @@ -499,14 +502,41 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk) return prog_fd; } - err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, prog_fd, - xsk->config.xdp_flags); + ctx->prog_fd = prog_fd; + return 0; +} + +static int xsk_create_bpf_link(struct xsk_socket *xsk) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + struct xsk_ctx *ctx = xsk->ctx; + __u32 prog_id = 0; + int link_fd; + int err; + + err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); if (err) { - close(prog_fd); + pr_warn("getting XDP prog id failed\n"); return err; } - ctx->prog_fd = prog_fd; + /* if there's a netlink-based XDP prog loaded on interface, bail out + * and ask user to do the removal by himself + */ + if (prog_id) { + pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n"); + return -EINVAL; + } + + opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE); + + link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts); + if (link_fd < 0) { + pr_warn("bpf_link_create failed: %s\n", strerror(errno)); + return link_fd; + } + + ctx->link_fd = link_fd; return 0; } @@ -625,7 +655,6 @@ static int xsk_lookup_bpf_maps(struct xsk_socket *xsk) close(fd); } - err = 0; if (ctx->xsks_map_fd == -1) err = -ENOENT; @@ -642,6 +671,98 @@ static int xsk_set_bpf_maps(struct xsk_socket *xsk) &xsk->fd, 0); } +static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd) +{ + struct bpf_link_info link_info; + __u32 link_len; + __u32 id = 0; + int err; + int fd; + + while (true) { + err = bpf_link_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) { + err = 0; + break; + } + pr_warn("can't get next link: %s\n", strerror(errno)); + break; + } + + fd = bpf_link_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + pr_warn("can't get link by id (%u): %s\n", id, strerror(errno)); + err = -errno; + break; + } + + link_len = sizeof(struct bpf_link_info); + memset(&link_info, 0, link_len); + err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len); + if (err) { + pr_warn("can't get link info: %s\n", strerror(errno)); + close(fd); + break; + } + if (link_info.type == BPF_LINK_TYPE_XDP) { + if (link_info.xdp.ifindex == ifindex) { + *link_fd = fd; + if (prog_id) + *prog_id = link_info.prog_id; + break; + } + } + close(fd); + } + + return err; +} + +static bool xsk_probe_bpf_link(void) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts, + .flags = XDP_FLAGS_SKB_MODE); + struct bpf_load_program_attr prog_attr; + struct bpf_insn insns[2] = { + BPF_MOV64_IMM(BPF_REG_0, XDP_PASS), + BPF_EXIT_INSN() + }; + int prog_fd, link_fd = -1; + int ifindex_lo = 1; + bool ret = false; + int err; + + err = xsk_link_lookup(ifindex_lo, NULL, &link_fd); + if (err) + return ret; + + if (link_fd >= 0) + return true; + + memset(&prog_attr, 0, sizeof(prog_attr)); + prog_attr.prog_type = BPF_PROG_TYPE_XDP; + prog_attr.insns = insns; + prog_attr.insns_cnt = ARRAY_SIZE(insns); + prog_attr.license = "GPL"; + + prog_fd = bpf_load_program_xattr(&prog_attr, NULL, 0); + if (prog_fd < 0) + return ret; + + link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts); + close(prog_fd); + + if (link_fd >= 0) { + ret = true; + close(link_fd); + } + + return ret; +} + static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) { char ifname[IFNAMSIZ]; @@ -663,64 +784,108 @@ static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) ctx->ifname[IFNAMSIZ - 1] = 0; xsk->ctx = ctx; + xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); return 0; } -static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, - int *xsks_map_fd) +static int xsk_init_xdp_res(struct xsk_socket *xsk, + int *xsks_map_fd) { - struct xsk_socket *xsk = _xdp; struct xsk_ctx *ctx = xsk->ctx; - __u32 prog_id = 0; int err; - err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, - xsk->config.xdp_flags); + err = xsk_create_bpf_maps(xsk); if (err) return err; - if (!prog_id) { - err = xsk_create_bpf_maps(xsk); - if (err) - return err; + err = xsk_load_xdp_prog(xsk); + if (err) + goto err_load_xdp_prog; - err = xsk_load_xdp_prog(xsk); - if (err) { - goto err_load_xdp_prog; - } - } else { - ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id); - if (ctx->prog_fd < 0) - return -errno; - err = xsk_lookup_bpf_maps(xsk); - if (err) { - close(ctx->prog_fd); - return err; - } - } + if (ctx->has_bpf_link) + err = xsk_create_bpf_link(xsk); + else + err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, ctx->prog_fd, + xsk->config.xdp_flags); - if (xsk->rx) { - err = xsk_set_bpf_maps(xsk); - if (err) { - if (!prog_id) { - goto err_set_bpf_maps; - } else { - close(ctx->prog_fd); - return err; - } - } - } - if (xsks_map_fd) - *xsks_map_fd = ctx->xsks_map_fd; + if (err) + goto err_attach_xdp_prog; - return 0; + if (!xsk->rx) + return err; + + err = xsk_set_bpf_maps(xsk); + if (err) + goto err_set_bpf_maps; + + return err; err_set_bpf_maps: + if (ctx->has_bpf_link) + close(ctx->link_fd); + else + bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); +err_attach_xdp_prog: close(ctx->prog_fd); - bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); err_load_xdp_prog: xsk_delete_bpf_maps(xsk); + return err; +} + +static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id) +{ + struct xsk_ctx *ctx = xsk->ctx; + int err; + + ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (ctx->prog_fd < 0) { + err = -errno; + goto err_prog_fd; + } + err = xsk_lookup_bpf_maps(xsk); + if (err) + goto err_lookup_maps; + + if (!xsk->rx) + return err; + + err = xsk_set_bpf_maps(xsk); + if (err) + goto err_set_maps; + + return err; + +err_set_maps: + close(ctx->xsks_map_fd); +err_lookup_maps: + close(ctx->prog_fd); +err_prog_fd: + if (ctx->has_bpf_link) + close(ctx->link_fd); + return err; +} + +static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd) +{ + struct xsk_socket *xsk = _xdp; + struct xsk_ctx *ctx = xsk->ctx; + __u32 prog_id = 0; + int err; + + if (ctx->has_bpf_link) + err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd); + else + err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); + + if (err) + return err; + + err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) : + xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id); + + if (!err && xsks_map_fd) + *xsks_map_fd = ctx->xsks_map_fd; return err; } @@ -898,6 +1063,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, } } xsk->ctx = ctx; + xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); if (rx) { err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, @@ -1054,6 +1220,8 @@ void xsk_socket__delete(struct xsk_socket *xsk) if (ctx->prog_fd != -1) { xsk_delete_bpf_maps(xsk); close(ctx->prog_fd); + if (ctx->has_bpf_link) + close(ctx->link_fd); } err = xsk_get_mmap_offsets(xsk->fd, &off); -- cgit v1.2.3 From ef9280789773c974b45f809d58b47b481f2cf9f5 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:07 +0200 Subject: selftests: xsk: Remove thread for netns switch Currently, there is a dedicated thread for following remote ns operations: - grabbing the ifindex of the interface moved to remote netns - removing xdp prog from that interface With bpf_link usage in place, this can be simply omitted, so remove mentioned thread, as BPF resources will be managed by bpf_link itself, so there's no further need for creating the thread that will switch to remote netns and do the cleanup. Keep most of the logic for switching the ns, though, but make switch_namespace() return the fd so that it will be possible to close it at the process termination time. Get rid of logic around making sure that it's possible to switch ns in validate_interfaces(). Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-9-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 123 +++---------------------------- tools/testing/selftests/bpf/xdpxceiver.h | 10 +-- 2 files changed, 14 insertions(+), 119 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index ae2f7d71f041..a856b2935d66 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -355,12 +355,15 @@ static void usage(const char *prog) ksft_print_msg(str, prog); } -static bool switch_namespace(int idx) +static int switch_namespace(const char *nsname) { char fqns[26] = "/var/run/netns/"; int nsfd; - strncat(fqns, ifdict[idx]->nsname, sizeof(fqns) - strlen(fqns) - 1); + if (!nsname || strlen(nsname) == 0) + return -1; + + strncat(fqns, nsname, sizeof(fqns) - strlen(fqns) - 1); nsfd = open(fqns, O_RDONLY); if (nsfd == -1) @@ -369,26 +372,9 @@ static bool switch_namespace(int idx) if (setns(nsfd, 0) == -1) exit_with_error(errno); - return true; -} - -static void *nsswitchthread(void *args) -{ - struct targs *targs = args; + print_verbose("NS switched: %s\n", nsname); - targs->retptr = false; - - if (switch_namespace(targs->idx)) { - ifdict[targs->idx]->ifindex = if_nametoindex(ifdict[targs->idx]->ifname); - if (!ifdict[targs->idx]->ifindex) { - ksft_test_result_fail("ERROR: [%s] interface \"%s\" does not exist\n", - __func__, ifdict[targs->idx]->ifname); - } else { - print_verbose("Interface found: %s\n", ifdict[targs->idx]->ifname); - targs->retptr = true; - } - } - pthread_exit(NULL); + return nsfd; } static int validate_interfaces(void) @@ -400,33 +386,6 @@ static int validate_interfaces(void) ret = false; ksft_test_result_fail("ERROR: interfaces: -i , -i ,."); } - if (strcmp(ifdict[i]->nsname, "")) { - struct targs *targs; - - targs = malloc(sizeof(*targs)); - if (!targs) - exit_with_error(errno); - - targs->idx = i; - if (pthread_create(&ns_thread, NULL, nsswitchthread, targs)) - exit_with_error(errno); - - pthread_join(ns_thread, NULL); - - if (targs->retptr) - print_verbose("NS switched: %s\n", ifdict[i]->nsname); - - free(targs); - } else { - ifdict[i]->ifindex = if_nametoindex(ifdict[i]->ifname); - if (!ifdict[i]->ifindex) { - ksft_test_result_fail - ("ERROR: interface \"%s\" does not exist\n", ifdict[i]->ifname); - ret = false; - } else { - print_verbose("Interface found: %s\n", ifdict[i]->ifname); - } - } } return ret; } @@ -843,8 +802,7 @@ static void *worker_testapp_validate(void *arg) if (bufs == MAP_FAILED) exit_with_error(errno); - if (strcmp(ifobject->nsname, "")) - switch_namespace(ifobject->ifdict_index); + ifobject->ns_fd = switch_namespace(ifobject->nsname); } if (ifobject->fv.vector == tx) { @@ -1059,60 +1017,6 @@ static void init_iface(struct ifobject *ifobj, const char *dst_mac, ifobj->src_port = src_port; } -static void *nsdisablemodethread(void *args) -{ - struct targs *targs = args; - - targs->retptr = false; - - if (switch_namespace(targs->idx)) { - targs->retptr = bpf_set_link_xdp_fd(ifdict[targs->idx]->ifindex, -1, targs->flags); - } else { - targs->retptr = errno; - print_verbose("Failed to switch namespace to %s\n", ifdict[targs->idx]->nsname); - } - - pthread_exit(NULL); -} - -static void disable_xdp_mode(int mode) -{ - int err = 0; - __u32 flags = XDP_FLAGS_UPDATE_IF_NOEXIST | mode; - char *mode_str = mode & XDP_FLAGS_SKB_MODE ? "skb" : "drv"; - - for (int i = 0; i < MAX_INTERFACES; i++) { - if (strcmp(ifdict[i]->nsname, "")) { - struct targs *targs; - - targs = malloc(sizeof(*targs)); - memset(targs, 0, sizeof(*targs)); - if (!targs) - exit_with_error(errno); - - targs->idx = i; - targs->flags = flags; - if (pthread_create(&ns_thread, NULL, nsdisablemodethread, targs)) - exit_with_error(errno); - - pthread_join(ns_thread, NULL); - err = targs->retptr; - free(targs); - } else { - err = bpf_set_link_xdp_fd(ifdict[i]->ifindex, -1, flags); - } - - if (err) { - print_verbose("Failed to disable %s mode on interface %s\n", - mode_str, ifdict[i]->ifname); - exit_with_error(err); - } - - print_verbose("Disabled %s mode for interface: %s\n", mode_str, ifdict[i]->ifname); - configured_mode = mode & XDP_FLAGS_SKB_MODE ? TEST_MODE_DRV : TEST_MODE_SKB; - } -} - static void run_pkt_test(int mode, int type) { test_type = type; @@ -1132,13 +1036,9 @@ static void run_pkt_test(int mode, int type) switch (mode) { case (TEST_MODE_SKB): - if (configured_mode == TEST_MODE_DRV) - disable_xdp_mode(XDP_FLAGS_DRV_MODE); xdp_flags |= XDP_FLAGS_SKB_MODE; break; case (TEST_MODE_DRV): - if (configured_mode == TEST_MODE_SKB) - disable_xdp_mode(XDP_FLAGS_SKB_MODE); xdp_flags |= XDP_FLAGS_DRV_MODE; break; default: @@ -1185,8 +1085,6 @@ int main(int argc, char **argv) ifdict[1]->fv.vector = rx; init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1); - disable_xdp_mode(XDP_FLAGS_DRV_MODE); - ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); for (i = 0; i < TEST_MODE_MAX; i++) { @@ -1194,8 +1092,11 @@ int main(int argc, char **argv) run_pkt_test(i, j); } - for (int i = 0; i < MAX_INTERFACES; i++) + for (int i = 0; i < MAX_INTERFACES; i++) { + if (ifdict[i]->ns_fd != -1) + close(ifdict[i]->ns_fd); free(ifdict[i]); + } ksft_exit_pass(); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 8f9308099318..493f7498d40e 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -126,7 +126,7 @@ struct generic_data { }; struct ifobject { - int ifindex; + int ns_fd; int ifdict_index; char ifname[MAX_INTERFACE_NAME_CHARS]; char nsname[MAX_INTERFACES_NAMESPACE_CHARS]; @@ -150,15 +150,9 @@ pthread_mutex_t sync_mutex; pthread_mutex_t sync_mutex_tx; pthread_cond_t signal_rx_condition; pthread_cond_t signal_tx_condition; -pthread_t t0, t1, ns_thread; +pthread_t t0, t1; pthread_attr_t attr; -struct targs { - u8 retptr; - int idx; - u32 flags; -}; - TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); struct head_s *head_p; struct pkt { -- cgit v1.2.3 From 9866bcd6635c264aaf3de1d89e44773a269048eb Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:08 +0200 Subject: selftests: xsk: Split worker thread Let's a have a separate Tx/Rx worker threads instead of a one common thread packed with Tx/Rx specific checks. Move mmap for umem buffer space and a switch_namespace() call to thread_common_ops. This also allows for a bunch of simplifactions that are the subject of the next commits. The final result will be a code base that is much easier to follow. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-10-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 156 +++++++++++++++---------------- 1 file changed, 77 insertions(+), 79 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index a856b2935d66..cb6583b8ad67 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -760,6 +760,15 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mut int ctr = 0; int ret; + pthread_attr_setstacksize(&attr, THREAD_STACK); + + bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (bufs == MAP_FAILED) + exit_with_error(errno); + + ifobject->ns_fd = switch_namespace(ifobject->nsname); + xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); ret = xsk_configure_socket(ifobject); @@ -782,9 +791,12 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mut if (ctr >= SOCK_RECONF_CTR) exit_with_error(ret); + + print_verbose("Interface [%s] vector [%s]\n", + ifobject->ifname, ifobject->fv.vector == tx ? "Tx" : "Rx"); } -static void *worker_testapp_validate(void *arg) +static void *worker_testapp_validate_tx(void *arg) { struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr)); @@ -792,97 +804,83 @@ static void *worker_testapp_validate(void *arg) struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; struct ifobject *ifobject = (struct ifobject *)arg; struct generic_data data; + int spinningrxctr = 0; void *bufs = NULL; - pthread_attr_setstacksize(&attr, THREAD_STACK); - - if (!bidi_pass) { - bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (bufs == MAP_FAILED) - exit_with_error(errno); + if (!bidi_pass) + thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_tx); - ifobject->ns_fd = switch_namespace(ifobject->nsname); + while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) { + spinningrxctr++; + usleep(USLEEP_MAX); } - if (ifobject->fv.vector == tx) { - int spinningrxctr = 0; + for (int i = 0; i < num_frames; i++) { + /*send EOT frame */ + if (i == (num_frames - 1)) + data.seqnum = -1; + else + data.seqnum = i; + gen_udp_hdr(&data, ifobject, udp_hdr); + gen_ip_hdr(ifobject, ip_hdr); + gen_udp_csum(udp_hdr, ip_hdr); + gen_eth_hdr(ifobject, eth_hdr); + gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); + } - if (!bidi_pass) - thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_tx); + print_verbose("Sending %d packets on interface %s\n", + (opt_pkt_count - 1), ifobject->ifname); + tx_only_all(ifobject); - while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) { - spinningrxctr++; - usleep(USLEEP_MAX); - } + if (test_type != TEST_TYPE_BIDI || bidi_pass) { + xsk_socket__delete(ifobject->xsk->xsk); + (void)xsk_umem__delete(ifobject->umem->umem); + } + pthread_exit(NULL); +} - print_verbose("Interface [%s] vector [Tx]\n", ifobject->ifname); - for (int i = 0; i < num_frames; i++) { - /*send EOT frame */ - if (i == (num_frames - 1)) - data.seqnum = -1; - else - data.seqnum = i; - gen_udp_hdr(&data, ifobject, udp_hdr); - gen_ip_hdr(ifobject, ip_hdr); - gen_udp_csum(udp_hdr, ip_hdr); - gen_eth_hdr(ifobject, eth_hdr); - gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); - } +static void *worker_testapp_validate_rx(void *arg) +{ + struct ifobject *ifobject = (struct ifobject *)arg; + struct pollfd fds[MAX_SOCKS] = { }; + void *bufs = NULL; - print_verbose("Sending %d packets on interface %s\n", - (opt_pkt_count - 1), ifobject->ifname); - tx_only_all(ifobject); - } else if (ifobject->fv.vector == rx) { - struct pollfd fds[MAX_SOCKS] = { }; - int ret; - - if (!bidi_pass) - thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); - - print_verbose("Interface [%s] vector [Rx]\n", ifobject->ifname); - if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) - xsk_populate_fill_ring(ifobject->umem); - - TAILQ_INIT(&head); - if (debug_pkt_dump) { - pkt_buf = calloc(num_frames, sizeof(*pkt_buf)); - if (!pkt_buf) - exit_with_error(errno); - } + if (!bidi_pass) + thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); - fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); - fds[0].events = POLLIN; + if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) + xsk_populate_fill_ring(ifobject->umem); - pthread_mutex_lock(&sync_mutex); - pthread_cond_signal(&signal_rx_condition); - pthread_mutex_unlock(&sync_mutex); + TAILQ_INIT(&head); + if (debug_pkt_dump) { + pkt_buf = calloc(num_frames, sizeof(*pkt_buf)); + if (!pkt_buf) + exit_with_error(errno); + } - while (1) { - if (test_type == TEST_TYPE_POLL) { - ret = poll(fds, 1, POLL_TMOUT); - if (ret <= 0) - continue; - } + fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); + fds[0].events = POLLIN; - if (test_type != TEST_TYPE_STATS) { - rx_pkt(ifobject->xsk, fds); - worker_pkt_validate(); - } else { - worker_stats_validate(ifobject); - } + pthread_mutex_lock(&sync_mutex); + pthread_cond_signal(&signal_rx_condition); + pthread_mutex_unlock(&sync_mutex); - if (sigvar) - break; + while (1) { + if (test_type != TEST_TYPE_STATS) { + rx_pkt(ifobject->xsk, fds); + worker_pkt_validate(); + } else { + worker_stats_validate(ifobject); } + if (sigvar) + break; + } - if (test_type != TEST_TYPE_STATS) - print_verbose("Received %d packets on interface %s\n", - pkt_counter, ifobject->ifname); + print_verbose("Received %d packets on interface %s\n", + pkt_counter, ifobject->ifname); - if (test_type == TEST_TYPE_TEARDOWN) - print_verbose("Destroying socket\n"); - } + if (test_type == TEST_TYPE_TEARDOWN) + print_verbose("Destroying socket\n"); if ((test_type != TEST_TYPE_BIDI) || bidi_pass) { xsk_socket__delete(ifobject->xsk->xsk); @@ -911,12 +909,12 @@ static void testapp_validate(void) /*Spawn RX thread */ if (!bidi || !bidi_pass) { - if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[1])) + if (pthread_create(&t0, &attr, worker_testapp_validate_rx, ifdict[1])) exit_with_error(errno); } else if (bidi && bidi_pass) { /*switch Tx/Rx vectors */ ifdict[0]->fv.vector = rx; - if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[0])) + if (pthread_create(&t0, &attr, worker_testapp_validate_rx, ifdict[0])) exit_with_error(errno); } @@ -931,12 +929,12 @@ static void testapp_validate(void) /*Spawn TX thread */ if (!bidi || !bidi_pass) { - if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[0])) + if (pthread_create(&t1, &attr, worker_testapp_validate_tx, ifdict[0])) exit_with_error(errno); } else if (bidi && bidi_pass) { /*switch Tx/Rx vectors */ ifdict[1]->fv.vector = tx; - if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[1])) + if (pthread_create(&t1, &attr, worker_testapp_validate_tx, ifdict[1])) exit_with_error(errno); } -- cgit v1.2.3 From 99f9bcb65705dda07288b759c569d30d8a4f297c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:09 +0200 Subject: selftests: xsk: Remove Tx synchronization resources Tx thread needs to be started after the Rx side is fully initialized so that packets are not xmitted until xsk Rx socket is ready to be used. It can be observed that atomic variable spinning_tx is not checked from Rx side in any way, so thread_common_ops can be modified to only address the spinning_rx. This means that spinning_tx can be removed altogheter. signal_tx_condition is never utilized, so simply remove it. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-11-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 15 +++++++-------- tools/testing/selftests/bpf/xdpxceiver.h | 2 -- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index cb6583b8ad67..9f7ee9a38752 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -126,7 +126,6 @@ static void pthread_init_mutex(void) pthread_mutex_init(&sync_mutex, NULL); pthread_mutex_init(&sync_mutex_tx, NULL); pthread_cond_init(&signal_rx_condition, NULL); - pthread_cond_init(&signal_tx_condition, NULL); } static void pthread_destroy_mutex(void) @@ -134,7 +133,6 @@ static void pthread_destroy_mutex(void) pthread_mutex_destroy(&sync_mutex); pthread_mutex_destroy(&sync_mutex_tx); pthread_cond_destroy(&signal_rx_condition); - pthread_cond_destroy(&signal_tx_condition); } static void *memset32_htonl(void *dest, u32 val, u32 size) @@ -754,8 +752,7 @@ static void worker_pkt_validate(void) } } -static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mutex_t *mutexptr, - atomic_int *spinningptr) +static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mutex_t *mutexptr) { int ctr = 0; int ret; @@ -780,13 +777,15 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mut */ pthread_mutex_lock(mutexptr); while (ret && ctr < SOCK_RECONF_CTR) { - atomic_store(spinningptr, 1); + if (ifobject->fv.vector == rx) + atomic_store(&spinning_rx, 1); xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); ret = xsk_configure_socket(ifobject); usleep(USLEEP_MAX); ctr++; } - atomic_store(spinningptr, 0); + if (ifobject->fv.vector == rx) + atomic_store(&spinning_rx, 0); pthread_mutex_unlock(mutexptr); if (ctr >= SOCK_RECONF_CTR) @@ -808,7 +807,7 @@ static void *worker_testapp_validate_tx(void *arg) void *bufs = NULL; if (!bidi_pass) - thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_tx); + thread_common_ops(ifobject, bufs, &sync_mutex_tx); while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) { spinningrxctr++; @@ -846,7 +845,7 @@ static void *worker_testapp_validate_rx(void *arg) void *bufs = NULL; if (!bidi_pass) - thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); + thread_common_ops(ifobject, bufs, &sync_mutex_tx); if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) xsk_populate_fill_ring(ifobject->umem); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 493f7498d40e..483be41229c6 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -144,12 +144,10 @@ struct ifobject { static struct ifobject *ifdict[MAX_INTERFACES]; /*threads*/ -atomic_int spinning_tx; atomic_int spinning_rx; pthread_mutex_t sync_mutex; pthread_mutex_t sync_mutex_tx; pthread_cond_t signal_rx_condition; -pthread_cond_t signal_tx_condition; pthread_t t0, t1; pthread_attr_t attr; -- cgit v1.2.3 From 9445f8c765838edf84dd0d3910ff309bdab8f95f Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:10 +0200 Subject: selftests: xsk: Refactor teardown/bidi test cases and testapp_validate Currently, there is a testapp_sockets() that acts like a wrapper around testapp_validate() and it is called for bidi and teardown test types. Other test types call testapp_validate() directly. Split testapp_sockets() onto two separate functions so a bunch of bidi specific logic can be moved there and out of testapp_validate() itself. Introduce function pointer to ifobject struct which will be used for assigning the Rx/Tx function that is assigned to worker thread. Let's also have a global ifobject Rx/Tx pointers so it's easier to swap the vectors on a second run of a bi-directional test. Thread creation now is easey to follow. switching_notify variable is useless, info about vector switch can be printed based on bidi_pass state. Last but not least, init/destroy synchronization variables only once, not per each test. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-12-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 117 ++++++++++++++++++------------- tools/testing/selftests/bpf/xdpxceiver.h | 14 ++-- 2 files changed, 78 insertions(+), 53 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 9f7ee9a38752..c0ebca8f55f8 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -896,26 +896,10 @@ static void testapp_validate(void) pthread_attr_init(&attr); pthread_attr_setstacksize(&attr, THREAD_STACK); - if ((test_type == TEST_TYPE_BIDI) && bidi_pass) { - pthread_init_mutex(); - if (!switching_notify) { - print_verbose("Switching Tx/Rx vectors\n"); - switching_notify++; - } - } - pthread_mutex_lock(&sync_mutex); /*Spawn RX thread */ - if (!bidi || !bidi_pass) { - if (pthread_create(&t0, &attr, worker_testapp_validate_rx, ifdict[1])) - exit_with_error(errno); - } else if (bidi && bidi_pass) { - /*switch Tx/Rx vectors */ - ifdict[0]->fv.vector = rx; - if (pthread_create(&t0, &attr, worker_testapp_validate_rx, ifdict[0])) - exit_with_error(errno); - } + pthread_create(&t0, &attr, ifdict_rx->func_ptr, ifdict_rx); if (clock_gettime(CLOCK_REALTIME, &max_wait)) exit_with_error(errno); @@ -927,15 +911,7 @@ static void testapp_validate(void) pthread_mutex_unlock(&sync_mutex); /*Spawn TX thread */ - if (!bidi || !bidi_pass) { - if (pthread_create(&t1, &attr, worker_testapp_validate_tx, ifdict[0])) - exit_with_error(errno); - } else if (bidi && bidi_pass) { - /*switch Tx/Rx vectors */ - ifdict[1]->fv.vector = tx; - if (pthread_create(&t1, &attr, worker_testapp_validate_tx, ifdict[1])) - exit_with_error(errno); - } + pthread_create(&t1, &attr, ifdict_tx->func_ptr, ifdict_tx); pthread_join(t1, NULL); pthread_join(t0, NULL); @@ -953,18 +929,53 @@ static void testapp_validate(void) print_ksft_result(); } -static void testapp_sockets(void) +static void testapp_teardown(void) +{ + int i; + + for (i = 0; i < MAX_TEARDOWN_ITER; i++) { + pkt_counter = 0; + prev_pkt = -1; + sigvar = 0; + print_verbose("Creating socket\n"); + testapp_validate(); + } + + print_ksft_result(); +} + +static void swap_vectors(struct ifobject *ifobj1, struct ifobject *ifobj2) +{ + void *(*tmp_func_ptr)(void *) = ifobj1->func_ptr; + enum fvector tmp_vector = ifobj1->fv.vector; + + ifobj1->func_ptr = ifobj2->func_ptr; + ifobj1->fv.vector = ifobj2->fv.vector; + + ifobj2->func_ptr = tmp_func_ptr; + ifobj2->fv.vector = tmp_vector; + + ifdict_tx = ifobj1; + ifdict_rx = ifobj2; +} + +static void testapp_bidi(void) { - for (int i = 0; i < ((test_type == TEST_TYPE_TEARDOWN) ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); - i++) { + for (int i = 0; i < MAX_BIDI_ITER; i++) { pkt_counter = 0; prev_pkt = -1; sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); - test_type == TEST_TYPE_BIDI ? bidi_pass++ : bidi_pass; + if (!bidi_pass) { + print_verbose("Switching Tx/Rx vectors\n"); + swap_vectors(ifdict[1], ifdict[0]); + } + bidi_pass++; } + swap_vectors(ifdict[0], ifdict[1]); + print_ksft_result(); } @@ -997,7 +1008,7 @@ static void testapp_stats(void) static void init_iface(struct ifobject *ifobj, const char *dst_mac, const char *src_mac, const char *dst_ip, const char *src_ip, const u16 dst_port, - const u16 src_port) + const u16 src_port, enum fvector vector) { struct in_addr ip; @@ -1012,6 +1023,16 @@ static void init_iface(struct ifobject *ifobj, const char *dst_mac, ifobj->dst_port = dst_port; ifobj->src_port = src_port; + + if (vector == tx) { + ifobj->fv.vector = tx; + ifobj->func_ptr = worker_testapp_validate_tx; + ifdict_tx = ifobj; + } else { + ifobj->fv.vector = rx; + ifobj->func_ptr = worker_testapp_validate_rx; + ifdict_rx = ifobj; + } } static void run_pkt_test(int mode, int type) @@ -1021,11 +1042,8 @@ static void run_pkt_test(int mode, int type) /* reset defaults after potential previous test */ xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; pkt_counter = 0; - switching_notify = 0; bidi_pass = 0; prev_pkt = -1; - ifdict[0]->fv.vector = tx; - ifdict[1]->fv.vector = rx; sigvar = 0; stat_test_type = -1; rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; @@ -1042,16 +1060,20 @@ static void run_pkt_test(int mode, int type) break; } - pthread_init_mutex(); - - if (test_type == TEST_TYPE_STATS) + switch (test_type) { + case TEST_TYPE_STATS: testapp_stats(); - else if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI)) + break; + case TEST_TYPE_TEARDOWN: + testapp_teardown(); + break; + case TEST_TYPE_BIDI: + testapp_bidi(); + break; + default: testapp_validate(); - else - testapp_sockets(); - - pthread_destroy_mutex(); + break; + } } int main(int argc, char **argv) @@ -1076,14 +1098,13 @@ int main(int argc, char **argv) num_frames = ++opt_pkt_count; - ifdict[0]->fv.vector = tx; - init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2); - - ifdict[1]->fv.vector = rx; - init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1); + init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx); + init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx); ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); + pthread_init_mutex(); + for (i = 0; i < TEST_MODE_MAX; i++) { for (j = 0; j < TEST_TYPE_MAX; j++) run_pkt_test(i, j); @@ -1095,6 +1116,8 @@ int main(int argc, char **argv) free(ifdict[i]); } + pthread_destroy_mutex(); + ksft_exit_pass(); return 0; diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 483be41229c6..3945746900af 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -77,7 +77,6 @@ enum STAT_TEST_TYPES { static int configured_mode = TEST_MODE_UNCONFIGURED; static u8 debug_pkt_dump; static u32 num_frames; -static u8 switching_notify; static u8 bidi_pass; static int test_type; @@ -126,22 +125,25 @@ struct generic_data { }; struct ifobject { - int ns_fd; - int ifdict_index; char ifname[MAX_INTERFACE_NAME_CHARS]; char nsname[MAX_INTERFACES_NAMESPACE_CHARS]; - struct flow_vector fv; struct xsk_socket_info *xsk; struct xsk_umem_info *umem; - u8 dst_mac[ETH_ALEN]; - u8 src_mac[ETH_ALEN]; + void *(*func_ptr)(void *arg); + struct flow_vector fv; + int ns_fd; + int ifdict_index; u32 dst_ip; u32 src_ip; u16 src_port; u16 dst_port; + u8 dst_mac[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; }; static struct ifobject *ifdict[MAX_INTERFACES]; +static struct ifobject *ifdict_rx; +static struct ifobject *ifdict_tx; /*threads*/ atomic_int spinning_rx; -- cgit v1.2.3 From 0464b1ed07677f869005bde3fade1580b48de67e Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:11 +0200 Subject: selftests: xsk: Remove sync_mutex_tx and atomic var Although thread_common_ops() are called in both Tx and Rx threads, testapp_validate() will not spawn Tx thread until Rx thread signals that it has finished its initialization via condition variable. Therefore, locking in thread_common_ops is not needed and furthermore Tx thread does not have to spin on atomic variable. Note that this simplification wouldn't be possible if there would still be a common worker thread. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-13-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 31 +++++++------------------------ tools/testing/selftests/bpf/xdpxceiver.h | 2 -- 2 files changed, 7 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index c0ebca8f55f8..35e14ad8097e 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -121,17 +121,15 @@ static void __exit_with_error(int error, const char *file, const char *func, int test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\ test_type == TEST_TYPE_STATS ? "Stats" : "")) -static void pthread_init_mutex(void) +static void init_sync_resources(void) { pthread_mutex_init(&sync_mutex, NULL); - pthread_mutex_init(&sync_mutex_tx, NULL); pthread_cond_init(&signal_rx_condition, NULL); } -static void pthread_destroy_mutex(void) +static void destroy_sync_resources(void) { pthread_mutex_destroy(&sync_mutex); - pthread_mutex_destroy(&sync_mutex_tx); pthread_cond_destroy(&signal_rx_condition); } @@ -752,7 +750,7 @@ static void worker_pkt_validate(void) } } -static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mutex_t *mutexptr) +static void thread_common_ops(struct ifobject *ifobject, void *bufs) { int ctr = 0; int ret; @@ -771,22 +769,13 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mut /* Retry Create Socket if it fails as xsk_socket__create() * is asynchronous - * - * Essential to lock Mutex here to prevent Tx thread from - * entering before Rx and causing a deadlock */ - pthread_mutex_lock(mutexptr); while (ret && ctr < SOCK_RECONF_CTR) { - if (ifobject->fv.vector == rx) - atomic_store(&spinning_rx, 1); xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); ret = xsk_configure_socket(ifobject); usleep(USLEEP_MAX); ctr++; } - if (ifobject->fv.vector == rx) - atomic_store(&spinning_rx, 0); - pthread_mutex_unlock(mutexptr); if (ctr >= SOCK_RECONF_CTR) exit_with_error(ret); @@ -803,16 +792,10 @@ static void *worker_testapp_validate_tx(void *arg) struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; struct ifobject *ifobject = (struct ifobject *)arg; struct generic_data data; - int spinningrxctr = 0; void *bufs = NULL; if (!bidi_pass) - thread_common_ops(ifobject, bufs, &sync_mutex_tx); - - while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) { - spinningrxctr++; - usleep(USLEEP_MAX); - } + thread_common_ops(ifobject, bufs); for (int i = 0; i < num_frames; i++) { /*send EOT frame */ @@ -845,7 +828,7 @@ static void *worker_testapp_validate_rx(void *arg) void *bufs = NULL; if (!bidi_pass) - thread_common_ops(ifobject, bufs, &sync_mutex_tx); + thread_common_ops(ifobject, bufs); if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) xsk_populate_fill_ring(ifobject->umem); @@ -1103,7 +1086,7 @@ int main(int argc, char **argv) ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); - pthread_init_mutex(); + init_sync_resources(); for (i = 0; i < TEST_MODE_MAX; i++) { for (j = 0; j < TEST_TYPE_MAX; j++) @@ -1116,7 +1099,7 @@ int main(int argc, char **argv) free(ifdict[i]); } - pthread_destroy_mutex(); + destroy_sync_resources(); ksft_exit_pass(); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 3945746900af..e304229d8a4c 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -146,9 +146,7 @@ static struct ifobject *ifdict_rx; static struct ifobject *ifdict_tx; /*threads*/ -atomic_int spinning_rx; pthread_mutex_t sync_mutex; -pthread_mutex_t sync_mutex_tx; pthread_cond_t signal_rx_condition; pthread_t t0, t1; pthread_attr_t attr; -- cgit v1.2.3 From 27e1ca2525de264901b5c2d9d0c4403c3fe8608c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 30 Mar 2021 00:43:13 +0200 Subject: selftests: xsk: Implement bpf_link test Introduce a test that is supposed to verify the persistence of BPF resources based on underlying bpf_link usage. Test will: 1) create and bind two sockets on queue ids 0 and 1 2) run a traffic on queue ids 0 3) remove xsk sockets from queue 0 on both veth interfaces 4) run a traffic on queues ids 1 Running traffic successfully on qids 1 means that BPF resources were not removed on step 3). In order to make it work, change the command that creates veth pair to have the 4 queue pairs by default. Introduce the arrays of xsks and umems to ifobject struct but keep a pointers to single entities, so rest of the logic around Rx/Tx can be kept as-is. For umem handling, double the size of mmapped space and split that between the two sockets. Rename also bidi_pass to a variable 'second_step' of a boolean type as it's now used also for the test that is introduced here and it doesn't have anything in common with bi-directional testing. Drop opt_queue command line argument as it wasn't working before anyway. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-15-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/test_xsk.sh | 3 +- tools/testing/selftests/bpf/xdpxceiver.c | 179 +++++++++++++++++++++++-------- tools/testing/selftests/bpf/xdpxceiver.h | 7 +- 3 files changed, 139 insertions(+), 50 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 56d4474e2c83..46633a3bfb0b 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -107,7 +107,7 @@ setup_vethPairs() { echo "setting up ${VETH0}: namespace: ${NS0}" fi ip netns add ${NS1} - ip link add ${VETH0} type veth peer name ${VETH1} + ip link add ${VETH0} numtxqueues 4 numrxqueues 4 type veth peer name ${VETH1} numtxqueues 4 numrxqueues 4 if [ -f /proc/net/if_inet6 ]; then echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6 fi @@ -118,6 +118,7 @@ setup_vethPairs() { ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU} ip link set ${VETH0} mtu ${MTU} ip netns exec ${NS1} ip link set ${VETH1} up + ip netns exec ${NS1} ip link set dev lo up ip link set ${VETH0} up } diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 35e14ad8097e..5c0d6dbd076f 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -41,8 +41,12 @@ * Reduce the size of the RX ring to a fraction of the fill ring size. * iv. fill queue empty * Do not populate the fill queue and then try to receive pkts. + * f. bpf_link resource persistence + * Configure sockets at indexes 0 and 1, run a traffic on queue ids 0, + * then remove xsk sockets from queue 0 on both veth interfaces and + * finally run a traffic on queues ids 1 * - * Total tests: 10 + * Total tests: 12 * * Flow: * ----- @@ -115,11 +119,12 @@ static void __exit_with_error(int error, const char *file, const char *func, int #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) #define print_ksft_result(void)\ - (ksft_test_result_pass("PASS: %s %s %s%s%s\n", configured_mode ? "DRV" : "SKB",\ + (ksft_test_result_pass("PASS: %s %s %s%s%s%s\n", configured_mode ? "DRV" : "SKB",\ test_type == TEST_TYPE_POLL ? "POLL" : "NOPOLL",\ test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\ test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\ - test_type == TEST_TYPE_STATS ? "Stats" : "")) + test_type == TEST_TYPE_STATS ? "Stats" : "",\ + test_type == TEST_TYPE_BPF_RES ? "BPF RES" : "")) static void init_sync_resources(void) { @@ -258,9 +263,8 @@ static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, PKT_SIZE); } -static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) +static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx) { - int ret; struct xsk_umem_config cfg = { .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, @@ -268,17 +272,22 @@ static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) .frame_headroom = frame_headroom, .flags = XSK_UMEM__DEFAULT_FLAGS }; + int size = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; + struct xsk_umem_info *umem; + int ret; - data->umem = calloc(1, sizeof(struct xsk_umem_info)); - if (!data->umem) + umem = calloc(1, sizeof(struct xsk_umem_info)); + if (!umem) exit_with_error(errno); - ret = xsk_umem__create(&data->umem->umem, buffer, size, - &data->umem->fq, &data->umem->cq, &cfg); + ret = xsk_umem__create(&umem->umem, buffer, size, + &umem->fq, &umem->cq, &cfg); if (ret) exit_with_error(ret); - data->umem->buffer = buffer; + umem->buffer = buffer; + + data->umem_arr[idx] = umem; } static void xsk_populate_fill_ring(struct xsk_umem_info *umem) @@ -294,18 +303,19 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem) xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); } -static int xsk_configure_socket(struct ifobject *ifobject) +static int xsk_configure_socket(struct ifobject *ifobject, int idx) { struct xsk_socket_config cfg; + struct xsk_socket_info *xsk; struct xsk_ring_cons *rxr; struct xsk_ring_prod *txr; int ret; - ifobject->xsk = calloc(1, sizeof(struct xsk_socket_info)); - if (!ifobject->xsk) + xsk = calloc(1, sizeof(struct xsk_socket_info)); + if (!xsk) exit_with_error(errno); - ifobject->xsk->umem = ifobject->umem; + xsk->umem = ifobject->umem; cfg.rx_size = rxqsize; cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; cfg.libbpf_flags = 0; @@ -313,19 +323,20 @@ static int xsk_configure_socket(struct ifobject *ifobject) cfg.bind_flags = xdp_bind_flags; if (test_type != TEST_TYPE_BIDI) { - rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL; - txr = (ifobject->fv.vector == tx) ? &ifobject->xsk->tx : NULL; + rxr = (ifobject->fv.vector == rx) ? &xsk->rx : NULL; + txr = (ifobject->fv.vector == tx) ? &xsk->tx : NULL; } else { - rxr = &ifobject->xsk->rx; - txr = &ifobject->xsk->tx; + rxr = &xsk->rx; + txr = &xsk->tx; } - ret = xsk_socket__create(&ifobject->xsk->xsk, ifobject->ifname, - opt_queue, ifobject->umem->umem, rxr, txr, &cfg); - + ret = xsk_socket__create(&xsk->xsk, ifobject->ifname, idx, + ifobject->umem->umem, rxr, txr, &cfg); if (ret) return 1; + ifobject->xsk_arr[idx] = xsk; + return 0; } @@ -393,7 +404,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "i:q:DC:v", long_options, &option_index); + c = getopt_long(argc, argv, "i:DC:v", long_options, &option_index); if (c == -1) break; @@ -413,9 +424,6 @@ static void parse_command_line(int argc, char **argv) MAX_INTERFACES_NAMESPACE_CHARS); interface_index++; break; - case 'q': - opt_queue = atoi(optarg); - break; case 'D': debug_pkt_dump = 1; break; @@ -752,27 +760,33 @@ static void worker_pkt_validate(void) static void thread_common_ops(struct ifobject *ifobject, void *bufs) { + int umem_sz = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; int ctr = 0; int ret; pthread_attr_setstacksize(&attr, THREAD_STACK); - bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE, + ifobject->ns_fd = switch_namespace(ifobject->nsname); + + if (test_type == TEST_TYPE_BPF_RES) + umem_sz *= 2; + + bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (bufs == MAP_FAILED) exit_with_error(errno); - ifobject->ns_fd = switch_namespace(ifobject->nsname); - - xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); - ret = xsk_configure_socket(ifobject); + xsk_configure_umem(ifobject, bufs, 0); + ifobject->umem = ifobject->umem_arr[0]; + ret = xsk_configure_socket(ifobject, 0); /* Retry Create Socket if it fails as xsk_socket__create() * is asynchronous */ while (ret && ctr < SOCK_RECONF_CTR) { - xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); - ret = xsk_configure_socket(ifobject); + xsk_configure_umem(ifobject, bufs, 0); + ifobject->umem = ifobject->umem_arr[0]; + ret = xsk_configure_socket(ifobject, 0); usleep(USLEEP_MAX); ctr++; } @@ -780,10 +794,34 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs) if (ctr >= SOCK_RECONF_CTR) exit_with_error(ret); + ifobject->umem = ifobject->umem_arr[0]; + ifobject->xsk = ifobject->xsk_arr[0]; + + if (test_type == TEST_TYPE_BPF_RES) { + xsk_configure_umem(ifobject, (u8 *)bufs + (umem_sz / 2), 1); + ifobject->umem = ifobject->umem_arr[1]; + ret = xsk_configure_socket(ifobject, 1); + } + + ifobject->umem = ifobject->umem_arr[0]; + ifobject->xsk = ifobject->xsk_arr[0]; print_verbose("Interface [%s] vector [%s]\n", ifobject->ifname, ifobject->fv.vector == tx ? "Tx" : "Rx"); } +static bool testapp_is_test_two_stepped(void) +{ + return (test_type != TEST_TYPE_BIDI && test_type != TEST_TYPE_BPF_RES) || second_step; +} + +static void testapp_cleanup_xsk_res(struct ifobject *ifobj) +{ + if (testapp_is_test_two_stepped()) { + xsk_socket__delete(ifobj->xsk->xsk); + (void)xsk_umem__delete(ifobj->umem->umem); + } +} + static void *worker_testapp_validate_tx(void *arg) { struct udphdr *udp_hdr = @@ -794,7 +832,7 @@ static void *worker_testapp_validate_tx(void *arg) struct generic_data data; void *bufs = NULL; - if (!bidi_pass) + if (!second_step) thread_common_ops(ifobject, bufs); for (int i = 0; i < num_frames; i++) { @@ -814,10 +852,7 @@ static void *worker_testapp_validate_tx(void *arg) (opt_pkt_count - 1), ifobject->ifname); tx_only_all(ifobject); - if (test_type != TEST_TYPE_BIDI || bidi_pass) { - xsk_socket__delete(ifobject->xsk->xsk); - (void)xsk_umem__delete(ifobject->umem->umem); - } + testapp_cleanup_xsk_res(ifobject); pthread_exit(NULL); } @@ -827,7 +862,7 @@ static void *worker_testapp_validate_rx(void *arg) struct pollfd fds[MAX_SOCKS] = { }; void *bufs = NULL; - if (!bidi_pass) + if (!second_step) thread_common_ops(ifobject, bufs); if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) @@ -864,10 +899,7 @@ static void *worker_testapp_validate_rx(void *arg) if (test_type == TEST_TYPE_TEARDOWN) print_verbose("Destroying socket\n"); - if ((test_type != TEST_TYPE_BIDI) || bidi_pass) { - xsk_socket__delete(ifobject->xsk->xsk); - (void)xsk_umem__delete(ifobject->umem->umem); - } + testapp_cleanup_xsk_res(ifobject); pthread_exit(NULL); } @@ -875,6 +907,7 @@ static void testapp_validate(void) { struct timespec max_wait = { 0, 0 }; bool bidi = test_type == TEST_TYPE_BIDI; + bool bpf = test_type == TEST_TYPE_BPF_RES; pthread_attr_init(&attr); pthread_attr_setstacksize(&attr, THREAD_STACK); @@ -908,7 +941,7 @@ static void testapp_validate(void) free(pkt_buf); } - if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !(test_type == TEST_TYPE_STATS)) + if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !bpf && !(test_type == TEST_TYPE_STATS)) print_ksft_result(); } @@ -950,11 +983,11 @@ static void testapp_bidi(void) sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); - if (!bidi_pass) { + if (!second_step) { print_verbose("Switching Tx/Rx vectors\n"); swap_vectors(ifdict[1], ifdict[0]); } - bidi_pass++; + second_step = true; } swap_vectors(ifdict[0], ifdict[1]); @@ -962,6 +995,36 @@ static void testapp_bidi(void) print_ksft_result(); } +static void swap_xsk_res(void) +{ + xsk_socket__delete(ifdict_tx->xsk->xsk); + xsk_umem__delete(ifdict_tx->umem->umem); + xsk_socket__delete(ifdict_rx->xsk->xsk); + xsk_umem__delete(ifdict_rx->umem->umem); + ifdict_tx->umem = ifdict_tx->umem_arr[1]; + ifdict_tx->xsk = ifdict_tx->xsk_arr[1]; + ifdict_rx->umem = ifdict_rx->umem_arr[1]; + ifdict_rx->xsk = ifdict_rx->xsk_arr[1]; +} + +static void testapp_bpf_res(void) +{ + int i; + + for (i = 0; i < MAX_BPF_ITER; i++) { + pkt_counter = 0; + prev_pkt = -1; + sigvar = 0; + print_verbose("Creating socket\n"); + testapp_validate(); + if (!second_step) + swap_xsk_res(); + second_step = true; + } + + print_ksft_result(); +} + static void testapp_stats(void) { for (int i = 0; i < STAT_TEST_TYPE_MAX; i++) { @@ -1025,13 +1088,15 @@ static void run_pkt_test(int mode, int type) /* reset defaults after potential previous test */ xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; pkt_counter = 0; - bidi_pass = 0; + second_step = 0; prev_pkt = -1; sigvar = 0; stat_test_type = -1; rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + configured_mode = mode; + switch (mode) { case (TEST_MODE_SKB): xdp_flags |= XDP_FLAGS_SKB_MODE; @@ -1053,6 +1118,9 @@ static void run_pkt_test(int mode, int type) case TEST_TYPE_BIDI: testapp_bidi(); break; + case TEST_TYPE_BPF_RES: + testapp_bpf_res(); + break; default: testapp_validate(); break; @@ -1062,6 +1130,7 @@ static void run_pkt_test(int mode, int type) int main(int argc, char **argv) { struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY }; + bool failure = false; int i, j; if (setrlimit(RLIMIT_MEMLOCK, &_rlim)) @@ -1073,6 +1142,16 @@ int main(int argc, char **argv) exit_with_error(errno); ifdict[i]->ifdict_index = i; + ifdict[i]->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *)); + if (!ifdict[i]->xsk_arr) { + failure = true; + goto cleanup; + } + ifdict[i]->umem_arr = calloc(2, sizeof(struct xsk_umem_info *)); + if (!ifdict[i]->umem_arr) { + failure = true; + goto cleanup; + } } setlocale(LC_ALL, ""); @@ -1093,13 +1172,19 @@ int main(int argc, char **argv) run_pkt_test(i, j); } + destroy_sync_resources(); + +cleanup: for (int i = 0; i < MAX_INTERFACES; i++) { if (ifdict[i]->ns_fd != -1) close(ifdict[i]->ns_fd); + free(ifdict[i]->xsk_arr); + free(ifdict[i]->umem_arr); free(ifdict[i]); } - destroy_sync_resources(); + if (failure) + exit_with_error(errno); ksft_exit_pass(); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index e304229d8a4c..e431ecb9bb95 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -23,6 +23,7 @@ #define MAX_SOCKS 1 #define MAX_TEARDOWN_ITER 10 #define MAX_BIDI_ITER 2 +#define MAX_BPF_ITER 2 #define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ sizeof(struct udphdr)) #define MIN_PKT_SIZE 64 @@ -63,6 +64,7 @@ enum TEST_TYPES { TEST_TYPE_TEARDOWN, TEST_TYPE_BIDI, TEST_TYPE_STATS, + TEST_TYPE_BPF_RES, TEST_TYPE_MAX }; @@ -77,10 +79,9 @@ enum STAT_TEST_TYPES { static int configured_mode = TEST_MODE_UNCONFIGURED; static u8 debug_pkt_dump; static u32 num_frames; -static u8 bidi_pass; +static bool second_step; static int test_type; -static int opt_queue; static int opt_pkt_count; static u8 opt_verbose; @@ -128,6 +129,8 @@ struct ifobject { char ifname[MAX_INTERFACE_NAME_CHARS]; char nsname[MAX_INTERFACES_NAMESPACE_CHARS]; struct xsk_socket_info *xsk; + struct xsk_socket_info **xsk_arr; + struct xsk_umem_info **umem_arr; struct xsk_umem_info *umem; void *(*func_ptr)(void *arg); struct flow_vector fv; -- cgit v1.2.3 From 7651910257c8fb1ec76b50bef0330fcf739105c7 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 30 Mar 2021 00:43:14 +0200 Subject: selftests: xsk: Remove thread attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is really no reason to have a non-default thread stack size. Remove that. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-16-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 9 ++------- tools/testing/selftests/bpf/xdpxceiver.h | 2 -- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 5c0d6dbd076f..ec09b8fe1178 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -764,8 +764,6 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs) int ctr = 0; int ret; - pthread_attr_setstacksize(&attr, THREAD_STACK); - ifobject->ns_fd = switch_namespace(ifobject->nsname); if (test_type == TEST_TYPE_BPF_RES) @@ -909,13 +907,10 @@ static void testapp_validate(void) bool bidi = test_type == TEST_TYPE_BIDI; bool bpf = test_type == TEST_TYPE_BPF_RES; - pthread_attr_init(&attr); - pthread_attr_setstacksize(&attr, THREAD_STACK); - pthread_mutex_lock(&sync_mutex); /*Spawn RX thread */ - pthread_create(&t0, &attr, ifdict_rx->func_ptr, ifdict_rx); + pthread_create(&t0, NULL, ifdict_rx->func_ptr, ifdict_rx); if (clock_gettime(CLOCK_REALTIME, &max_wait)) exit_with_error(errno); @@ -927,7 +922,7 @@ static void testapp_validate(void) pthread_mutex_unlock(&sync_mutex); /*Spawn TX thread */ - pthread_create(&t1, &attr, ifdict_tx->func_ptr, ifdict_tx); + pthread_create(&t1, NULL, ifdict_tx->func_ptr, ifdict_tx); pthread_join(t1, NULL); pthread_join(t0, NULL); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index e431ecb9bb95..78863820fb81 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -37,7 +37,6 @@ #define TMOUT_SEC (3) #define EOT (-1) #define USLEEP_MAX 200000 -#define THREAD_STACK 60000000 #define SOCK_RECONF_CTR 10 #define BATCH_SIZE 64 #define POLL_TMOUT 1000 @@ -152,7 +151,6 @@ static struct ifobject *ifdict_tx; pthread_mutex_t sync_mutex; pthread_cond_t signal_rx_condition; pthread_t t0, t1; -pthread_attr_t attr; TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); struct head_s *head_p; -- cgit v1.2.3 From 96539f1c5efb0022b94412e8623722aad23dee6b Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 30 Mar 2021 00:43:15 +0200 Subject: selftests: xsk: Remove mutex and condition variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The usage of the condition variable is broken, and overkill. Replace it with a pthread barrier. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-17-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 33 +++++--------------------------- tools/testing/selftests/bpf/xdpxceiver.h | 3 +-- 2 files changed, 6 insertions(+), 30 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index ec09b8fe1178..6d87bdf3574a 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -126,18 +126,6 @@ static void __exit_with_error(int error, const char *file, const char *func, int test_type == TEST_TYPE_STATS ? "Stats" : "",\ test_type == TEST_TYPE_BPF_RES ? "BPF RES" : "")) -static void init_sync_resources(void) -{ - pthread_mutex_init(&sync_mutex, NULL); - pthread_cond_init(&signal_rx_condition, NULL); -} - -static void destroy_sync_resources(void) -{ - pthread_mutex_destroy(&sync_mutex); - pthread_cond_destroy(&signal_rx_condition); -} - static void *memset32_htonl(void *dest, u32 val, u32 size) { u32 *ptr = (u32 *)dest; @@ -876,9 +864,7 @@ static void *worker_testapp_validate_rx(void *arg) fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); fds[0].events = POLLIN; - pthread_mutex_lock(&sync_mutex); - pthread_cond_signal(&signal_rx_condition); - pthread_mutex_unlock(&sync_mutex); + pthread_barrier_wait(&barr); while (1) { if (test_type != TEST_TYPE_STATS) { @@ -903,24 +889,19 @@ static void *worker_testapp_validate_rx(void *arg) static void testapp_validate(void) { - struct timespec max_wait = { 0, 0 }; bool bidi = test_type == TEST_TYPE_BIDI; bool bpf = test_type == TEST_TYPE_BPF_RES; - pthread_mutex_lock(&sync_mutex); + if (pthread_barrier_init(&barr, NULL, 2)) + exit_with_error(errno); /*Spawn RX thread */ pthread_create(&t0, NULL, ifdict_rx->func_ptr, ifdict_rx); - if (clock_gettime(CLOCK_REALTIME, &max_wait)) - exit_with_error(errno); - max_wait.tv_sec += TMOUT_SEC; - - if (pthread_cond_timedwait(&signal_rx_condition, &sync_mutex, &max_wait) == ETIMEDOUT) + pthread_barrier_wait(&barr); + if (pthread_barrier_destroy(&barr)) exit_with_error(errno); - pthread_mutex_unlock(&sync_mutex); - /*Spawn TX thread */ pthread_create(&t1, NULL, ifdict_tx->func_ptr, ifdict_tx); @@ -1160,15 +1141,11 @@ int main(int argc, char **argv) ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); - init_sync_resources(); - for (i = 0; i < TEST_MODE_MAX; i++) { for (j = 0; j < TEST_TYPE_MAX; j++) run_pkt_test(i, j); } - destroy_sync_resources(); - cleanup: for (int i = 0; i < MAX_INTERFACES; i++) { if (ifdict[i]->ns_fd != -1) diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 78863820fb81..ef219c0785eb 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -148,8 +148,7 @@ static struct ifobject *ifdict_rx; static struct ifobject *ifdict_tx; /*threads*/ -pthread_mutex_t sync_mutex; -pthread_cond_t signal_rx_condition; +pthread_barrier_t barr; pthread_t t0, t1; TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); -- cgit v1.2.3 From ae6b6a17800f34dd5215286b44a4e99a0a1cf862 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 30 Mar 2021 00:43:16 +0200 Subject: selftests: xsk: Remove unused defines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove two unused defines. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210329224316.17793-18-maciej.fijalkowski@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 7 +++---- tools/testing/selftests/bpf/xdpxceiver.h | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 6d87bdf3574a..1135fb980814 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -456,7 +456,7 @@ static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) if (!xsk->outstanding_tx) return; - if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) kick_tx(xsk); rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); @@ -544,9 +544,8 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) xsk_ring_prod__submit(&xsk->tx, batch_size); if (!tx_invalid_test) { xsk->outstanding_tx += batch_size; - } else { - if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) - kick_tx(xsk); + } else if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { + kick_tx(xsk); } *frameptr += batch_size; *frameptr %= num_frames; diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index ef219c0785eb..6c428b276ab6 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -34,13 +34,11 @@ #define IP_PKT_TOS 0x9 #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) -#define TMOUT_SEC (3) #define EOT (-1) #define USLEEP_MAX 200000 #define SOCK_RECONF_CTR 10 #define BATCH_SIZE 64 #define POLL_TMOUT 1000 -#define NEED_WAKEUP true #define DEFAULT_PKT_CNT 10000 #define RX_FULL_RXQSIZE 32 -- cgit v1.2.3 From a062260a9d5fce22977744ed6dab64b7b1508ab3 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Mar 2021 12:28:56 +0200 Subject: selftests: net: add UDP GRO forwarding self-tests Create a bunch of virtual topologies and verify that NETIF_F_GRO_FRAGLIST or NETIF_F_GRO_UDP_FWD-enabled devices aggregate the ingress packets as expected. Additionally check that the aggregate packets are segmented correctly when landing on a socket Also test SKB_GSO_FRAGLIST and SKB_GSO_UDP_L4 aggregation on top of UDP tunnel (vxlan) v1 -> v2: - hopefully clarify the commit message - moved the overlay network ipv6 range into the 'documentation' reserved range (Willem) Signed-off-by: Paolo Abeni Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/udpgro_fwd.sh | 251 ++++++++++++++++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100755 tools/testing/selftests/net/udpgro_fwd.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 25f198bec0b2..2d71b283dde3 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -23,6 +23,7 @@ TEST_PROGS += drop_monitor_tests.sh TEST_PROGS += vrf_route_leaking.sh TEST_PROGS += bareudp.sh TEST_PROGS += unicast_extensions.sh +TEST_PROGS += udpgro_fwd.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh new file mode 100755 index 000000000000..a8fa64136282 --- /dev/null +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -0,0 +1,251 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +readonly BASE="ns-$(mktemp -u XXXXXX)" +readonly SRC=2 +readonly DST=1 +readonly DST_NAT=100 +readonly NS_SRC=$BASE$SRC +readonly NS_DST=$BASE$DST + +# "baremetal" network used for raw UDP traffic +readonly BM_NET_V4=192.168.1. +readonly BM_NET_V6=2001:db8:: + +# "overlay" network used for UDP over UDP tunnel traffic +readonly OL_NET_V4=172.16.1. +readonly OL_NET_V6=2001:db8:1:: +readonly NPROCS=`nproc` + +cleanup() { + local ns + local -r jobs="$(jobs -p)" + [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null + + for ns in $NS_SRC $NS_DST; do + ip netns del $ns 2>/dev/null + done +} + +trap cleanup EXIT + +create_ns() { + local net + local ns + + for ns in $NS_SRC $NS_DST; do + ip netns add $ns + ip -n $ns link set dev lo up + done + + ip link add name veth$SRC type veth peer name veth$DST + + for ns in $SRC $DST; do + ip link set dev veth$ns netns $BASE$ns + ip -n $BASE$ns link set dev veth$ns up + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24 + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad + done + ip -n $NS_DST link set veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null +} + +create_vxlan_endpoint() { + local -r netns=$1 + local -r bm_dev=$2 + local -r bm_rem_addr=$3 + local -r vxlan_dev=$4 + local -r vxlan_id=$5 + local -r vxlan_port=4789 + + ip -n $netns link set dev $bm_dev up + ip -n $netns link add dev $vxlan_dev type vxlan id $vxlan_id \ + dstport $vxlan_port remote $bm_rem_addr + ip -n $netns link set dev $vxlan_dev up +} + +create_vxlan_pair() { + local ns + + create_ns + + for ns in $SRC $DST; do + # note that 3 - $SRC == $DST and 3 - $DST == $SRC + create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V4$((3 - $ns)) vxlan$ns 4 + ip -n $BASE$ns addr add dev vxlan$ns $OL_NET_V4$ns/24 + done + for ns in $SRC $DST; do + create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V6$((3 - $ns)) vxlan6$ns 6 + ip -n $BASE$ns addr add dev vxlan6$ns $OL_NET_V6$ns/24 nodad + done +} + +is_ipv6() { + if [[ $1 =~ .*:.* ]]; then + return 0 + fi + return 1 +} + +run_test() { + local -r msg=$1 + local -r dst=$2 + local -r pkts=$3 + local -r vxpkts=$4 + local bind=$5 + local rx_args="" + local rx_family="-4" + local family=-4 + local filter=IpInReceives + local ipt=iptables + + printf "%-40s" "$msg" + + if is_ipv6 $dst; then + # rx program does not support '-6' and implies ipv6 usage by default + rx_family="" + family=-6 + filter=Ip6InReceives + ipt=ip6tables + fi + + rx_args="$rx_family" + [ -n "$bind" ] && rx_args="$rx_args -b $bind" + + # send a single GSO packet, segmented in 10 UDP frames. + # Always expect 10 UDP frames on RX side as rx socket does + # not enable GRO + ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 4789 + ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 8000 + ip netns exec $NS_DST ./udpgso_bench_rx -C 1000 -R 10 -n 10 -l 1300 $rx_args & + local spid=$! + sleep 0.1 + ip netns exec $NS_SRC ./udpgso_bench_tx $family -M 1 -s 13000 -S 1300 -D $dst + local retc=$? + wait $spid + local rets=$? + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " fail client exit code $retc, server $rets" + ret=1 + return + fi + + local rcv=`ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 8000' | \ + sed -e 's/\[//' -e 's/:.*//'` + if [ $rcv != $pkts ]; then + echo " fail - received $rvs packets, expected $pkts" + ret=1 + return + fi + + local vxrcv=`ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 4789' | \ + sed -e 's/\[//' -e 's/:.*//'` + + # upper net can generate a little noise, allow some tolerance + if [ $vxrcv -lt $vxpkts -o $vxrcv -gt $((vxpkts + 3)) ]; then + echo " fail - received $vxrcv vxlan packets, expected $vxpkts" + ret=1 + return + fi + echo " ok" +} + +run_bench() { + local -r msg=$1 + local -r dst=$2 + local family=-4 + + printf "%-40s" "$msg" + if [ $NPROCS -lt 2 ]; then + echo " skip - needed 2 CPUs found $NPROCS" + return + fi + + is_ipv6 $dst && family=-6 + + # bind the sender and the receiver to different CPUs to try + # get reproducible results + ip netns exec $NS_DST bash -c "echo 2 > /sys/class/net/veth$DST/queues/rx-0/rps_cpus" + ip netns exec $NS_DST taskset 0x2 ./udpgso_bench_rx -C 1000 -R 10 & + local spid=$! + sleep 0.1 + ip netns exec $NS_SRC taskset 0x1 ./udpgso_bench_tx $family -l 3 -S 1300 -D $dst + local retc=$? + wait $spid + local rets=$? + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " fail client exit code $retc, server $rets" + ret=1 + return + fi +} + +for family in 4 6; do + BM_NET=$BM_NET_V4 + OL_NET=$OL_NET_V4 + IPT=iptables + SUFFIX=24 + VXDEV=vxlan + + if [ $family = 6 ]; then + BM_NET=$BM_NET_V6 + OL_NET=$OL_NET_V6 + SUFFIX="64 nodad" + VXDEV=vxlan6 + IPT=ip6tables + fi + + echo "IPv$family" + + create_ns + run_test "No GRO" $BM_NET$DST 10 0 + cleanup + + create_ns + ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on + run_test "GRO frag list" $BM_NET$DST 1 0 + cleanup + + # UDP GRO fwd skips aggregation when find an udp socket with the GRO option + # if there is an UDP tunnel in the running system, such lookup happen + # take place. + # use NAT to circumvent GRO FWD check + create_ns + ip -n $NS_DST addr add dev veth$DST $BM_NET$DST_NAT/$SUFFIX + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $BM_NET$DST_NAT \ + -j DNAT --to-destination $BM_NET$DST + run_test "GRO fwd" $BM_NET$DST_NAT 1 0 $BM_NET$DST + cleanup + + create_ns + run_bench "UDP fwd perf" $BM_NET$DST + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + run_bench "UDP GRO fwd perf" $BM_NET$DST + cleanup + + create_vxlan_pair + ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on + run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1 + cleanup + + # use NAT to circumvent GRO FWD check + create_vxlan_pair + ip -n $NS_DST addr add dev $VXDEV$DST $OL_NET$DST_NAT/$SUFFIX + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $OL_NET$DST_NAT \ + -j DNAT --to-destination $OL_NET$DST + + # load arp cache before running the test to reduce the amount of + # stray traffic on top of the UDP tunnel + ip netns exec $NS_SRC ping -q -c 1 $OL_NET$DST_NAT >/dev/null + run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST + cleanup + + create_vxlan_pair + run_bench "UDP tunnel fwd perf" $OL_NET$DST + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST + cleanup +done + +exit $ret -- cgit v1.2.3 From e48792a9ec7898444fec6577611a6205e36dfdb9 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 30 Mar 2021 13:41:10 +0300 Subject: tc-testing: add simple action change test Use act_simple to verify that action created with 'tc actions change' command exists after command returns. The goal is to verify internal action API reference counting to ensure that the case when netlink message has NLM_F_REPLACE flag set but action with specified index doesn't exist is handled correctly. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- .../tc-testing/tc-tests/actions/simple.json | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json index 8e8c1ae12260..e15f708b0fa4 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json @@ -23,6 +23,30 @@ "$TC actions flush action simple" ] }, + { + "id": "4297", + "name": "Add simple action with change command", + "category": [ + "actions", + "simple" + ], + "setup": [ + [ + "$TC actions flush action simple", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions change action simple sdata \"Not changed\" index 60", + "expExitCode": "0", + "verifyCmd": "$TC actions list action simple", + "matchPattern": "action order [0-9]*: Simple .*index 60 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action simple" + ] + }, { "id": "6d4c", "name": "Add simple action with duplicate index", -- cgit v1.2.3 From 6254ad408820ca84283798c8fdbda3b01259a9a8 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Tue, 30 Mar 2021 17:08:54 -0700 Subject: selftests: mptcp: avoid calling pm_nl_ctl with bad IDs IDs are supposed to be between 0 and 255. In pm_nl_ctl, for both the 'add' and 'get' instruction, the ID is casted in a u_int8_t. So if we give 256, we will delete ID 0. Obviously, the goal is not to delete this ID by giving 256. We could modify pm_nl_ctl and stop if the ID is negative or higher than 255 but probably better not to increase the number of lines for such things in this tool which is only used in selftests. Instead, we use it within the limits. This modification also means that we will no longer add a new ID for the 2nd entry. That's why we removed an expected entry from the dump and introduced with commit dc8eb10e95a8 ("selftests: mptcp: add testcases for setting the address ID"). So now we delete ID 9 like before and we add entries for IDs 10 to 255 that are deleted just after. Note that this could be seen as a fix but it was not really an issue so far: we were simply playing with ID 0/1 once again. With the following commit ("selftests: mptcp: add addr argument for del_addr"), it will be different because ID 0 is going to required an address. We don't want errors when trying to delete ID 0 without the address argument. Acked-and-tested-by: Geliang Tang Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index a617e293734c..3c741abe034e 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -100,12 +100,12 @@ done check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit" check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit" -for i in `seq 9 256`; do +ip netns exec $ns1 ./pm_nl_ctl del 9 +for i in `seq 10 255`; do + ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $i ip netns exec $ns1 ./pm_nl_ctl del $i - ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $((i+1)) done check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 -id 2 flags 10.0.0.9 id 3 flags signal,backup 10.0.1.3 id 4 flags signal 10.0.1.4 id 5 flags signal 10.0.1.5 -- cgit v1.2.3 From 2d121c9a882a93d5b229fc13deb10036a1b35967 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 30 Mar 2021 17:08:55 -0700 Subject: selftests: mptcp: add addr argument for del_addr For the id 0 address, different MPTCP connections could be using different IP addresses for id 0. This patch added an extra argument IP address for del_addr when using id 0. Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 34 ++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 7b4167f3f9a2..115decfdc1ef 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -26,7 +26,7 @@ static void syntax(char *argv[]) { fprintf(stderr, "%s add|get|set|del|flush|dump|accept []\n", argv[0]); fprintf(stderr, "\tadd [flags signal|subflow|backup] [id ] [dev ] \n"); - fprintf(stderr, "\tdel \n"); + fprintf(stderr, "\tdel []\n"); fprintf(stderr, "\tget \n"); fprintf(stderr, "\tset [flags backup|nobackup]\n"); fprintf(stderr, "\tflush\n"); @@ -301,6 +301,7 @@ int del_addr(int fd, int pm_family, int argc, char *argv[]) 1024]; struct rtattr *rta, *nest; struct nlmsghdr *nh; + u_int16_t family; int nest_start; u_int8_t id; int off = 0; @@ -310,11 +311,14 @@ int del_addr(int fd, int pm_family, int argc, char *argv[]) off = init_genl_req(data, pm_family, MPTCP_PM_CMD_DEL_ADDR, MPTCP_PM_VER); - /* the only argument is the address id */ - if (argc != 3) + /* the only argument is the address id (nonzero) */ + if (argc != 3 && argc != 4) syntax(argv); id = atoi(argv[2]); + /* zero id with the IP address */ + if (!id && argc != 4) + syntax(argv); nest_start = off; nest = (void *)(data + off); @@ -328,6 +332,30 @@ int del_addr(int fd, int pm_family, int argc, char *argv[]) rta->rta_len = RTA_LENGTH(1); memcpy(RTA_DATA(rta), &id, 1); off += NLMSG_ALIGN(rta->rta_len); + + if (!id) { + /* addr data */ + rta = (void *)(data + off); + if (inet_pton(AF_INET, argv[3], RTA_DATA(rta))) { + family = AF_INET; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4; + rta->rta_len = RTA_LENGTH(4); + } else if (inet_pton(AF_INET6, argv[3], RTA_DATA(rta))) { + family = AF_INET6; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6; + rta->rta_len = RTA_LENGTH(16); + } else { + error(1, errno, "can't parse ip %s", argv[3]); + } + off += NLMSG_ALIGN(rta->rta_len); + + /* family */ + rta = (void *)(data + off); + rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY; + rta->rta_len = RTA_LENGTH(2); + memcpy(RTA_DATA(rta), &family, 2); + off += NLMSG_ALIGN(rta->rta_len); + } nest->rta_len = off - nest_start; do_nl_req(fd, nh, off, 0); -- cgit v1.2.3 From 5e287fe761495eb669b5e2543919bd5124edecf1 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 30 Mar 2021 17:08:56 -0700 Subject: selftests: mptcp: remove id 0 address testcases This patch added the testcases for removing the id 0 subflow and the id 0 address. In do_transfer, use the removing addresses number '9' for deleting the id 0 address. Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 35 +++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 679de3abaf34..d2273b88e72c 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -294,9 +294,12 @@ do_transfer() let id+=1 done fi - else + elif [ $rm_nr_ns1 -eq 8 ]; then sleep 1 ip netns exec ${listener_ns} ./pm_nl_ctl flush + elif [ $rm_nr_ns1 -eq 9 ]; then + sleep 1 + ip netns exec ${listener_ns} ./pm_nl_ctl del 0 ${connect_addr} fi fi @@ -333,9 +336,18 @@ do_transfer() let id+=1 done fi - else + elif [ $rm_nr_ns2 -eq 8 ]; then sleep 1 ip netns exec ${connector_ns} ./pm_nl_ctl flush + elif [ $rm_nr_ns2 -eq 9 ]; then + local addr + if is_v6 "${connect_addr}"; then + addr="dead:beef:1::2" + else + addr="10.0.1.2" + fi + sleep 1 + ip netns exec ${connector_ns} ./pm_nl_ctl del 0 $addr fi fi @@ -988,6 +1000,25 @@ remove_tests() chk_join_nr "flush invalid addresses" 1 1 1 chk_add_nr 3 3 chk_rm_nr 3 1 invert + + # remove id 0 subflow + reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 0 0 -9 slow + chk_join_nr "remove id 0 subflow" 1 1 1 + chk_rm_nr 1 1 + + # remove id 0 address + reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 1 1 + run_tests $ns1 $ns2 10.0.1.1 0 -9 0 slow + chk_join_nr "remove id 0 address" 1 1 1 + chk_add_nr 1 1 + chk_rm_nr 1 1 invert } add_tests() -- cgit v1.2.3 From 63f8af0fc34197a276674fa0d4d865aeff1f0172 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 23 Mar 2021 01:47:52 +0000 Subject: selftests/bpf: Add an option for a debug shell in vmtest.sh The newly introduced -s command line option starts an interactive shell. If a command is specified, the shell is started after the command finishes executing. It's useful to have a shell especially when debugging failing tests or developing new tests. Since the user may terminate the VM forcefully, an extra "sync" is added after the execution of the command to persist any logs from the command into the log file. Signed-off-by: KP Singh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210323014752.3198283-1-kpsingh@kernel.org --- tools/testing/selftests/bpf/vmtest.sh | 39 +++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 22554894db99..8889b3f55236 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -24,15 +24,15 @@ EXIT_STATUS_FILE="${LOG_FILE_BASE}.exit_status" usage() { cat <] -- [] +Usage: $0 [-i] [-s] [-d ] -- [] is the command you would normally run when you are in tools/testing/selftests/bpf. e.g: $0 -- ./test_progs -t test_lsm -If no command is specified, "${DEFAULT_COMMAND}" will be run by -default. +If no command is specified and a debug shell (-s) is not requested, +"${DEFAULT_COMMAND}" will be run by default. If you build your kernel using KBUILD_OUTPUT= or O= options, these can be passed as environment variables to the script: @@ -49,6 +49,9 @@ Options: -d) Update the output directory (default: ${OUTPUT_DIR}) -j) Number of jobs for compilation, similar to -j in make (default: ${NUM_COMPILE_JOBS}) + -s) Instead of powering off the VM, start an interactive + shell. If is specified, the shell runs after + the command finishes executing EOF } @@ -149,6 +152,7 @@ update_init_script() local init_script_dir="${OUTPUT_DIR}/${MOUNT_DIR}/etc/rcS.d" local init_script="${init_script_dir}/S50-startup" local command="$1" + local exit_command="$2" mount_image @@ -162,9 +166,10 @@ EOF fi - sudo bash -c "cat >${init_script}" < ${init_script}" + if [[ "${command}" != "" ]]; then + sudo bash -c "cat >>${init_script}" < "/root/${EXIT_STATUS_FILE}" @@ -175,9 +180,12 @@ echo "130" > "/root/${EXIT_STATUS_FILE}" stdbuf -oL -eL ${command} echo "\$?" > "/root/${EXIT_STATUS_FILE}" } 2>&1 | tee "/root/${LOG_FILE}" -poweroff -f +# Ensure that the logs are written to disk +sync EOF + fi + sudo bash -c "echo ${exit_command} >> ${init_script}" sudo chmod a+x "${init_script}" unmount_image } @@ -277,8 +285,10 @@ main() local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}" local command="${DEFAULT_COMMAND}" local update_image="no" + local exit_command="poweroff -f" + local debug_shell="no" - while getopts 'hkid:j:' opt; do + while getopts 'hskid:j:' opt; do case ${opt} in i) update_image="yes" @@ -289,6 +299,11 @@ main() j) NUM_COMPILE_JOBS="$OPTARG" ;; + s) + command="" + debug_shell="yes" + exit_command="bash" + ;; h) usage exit 0 @@ -307,7 +322,7 @@ main() done shift $((OPTIND -1)) - if [[ $# -eq 0 ]]; then + if [[ $# -eq 0 && "${debug_shell}" == "no" ]]; then echo "No command specified, will run ${DEFAULT_COMMAND} in the vm" else command="$@" @@ -355,10 +370,12 @@ main() fi update_selftests "${kernel_checkout}" "${make_command}" - update_init_script "${command}" + update_init_script "${command}" "${exit_command}" run_vm "${kernel_bzimage}" - copy_logs - echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}" + if [[ "${command}" != "" ]]; then + copy_logs + echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}" + fi } catch() -- cgit v1.2.3 From 3406ac5347dbf64ab9f7b137ed25a18493f5ea2d Mon Sep 17 00:00:00 2001 From: Martin Liška Date: Tue, 30 Mar 2021 20:33:55 +0200 Subject: perf annotate: Add --demangle and --demangle-kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'perf annotate' supports --symbol but it's impossible to filter a C++ symbol. With --no-demangle one can filter easily by mangled function name. Signed-off-by: Martin Liška Link: http://lore.kernel.org/lkml/c3c7e959-9f7f-18e2-e795-f604275cbac3@suse.cz Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-annotate.txt | 7 +++++++ tools/perf/builtin-annotate.c | 4 ++++ 2 files changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index 1b5042f134a8..80c1be5d566c 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -124,6 +124,13 @@ OPTIONS --group:: Show event group information together +--demangle:: + Demangle symbol names to human readable form. It's enabled by default, + disable with --no-demangle. + +--demangle-kernel:: + Demangle kernel symbol names to human readable form (for C++ kernels). + --percent-type:: Set annotation percent type from following choices: global-period, local-period, global-hits, local-hits diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 0f3a196e5d6e..021d974c978e 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -538,6 +538,10 @@ int cmd_annotate(int argc, const char **argv) "Strip first N entries of source file path name in programs (with --prefix)"), OPT_STRING(0, "objdump", &annotate.opts.objdump_path, "path", "objdump binary to use for disassembly and annotations"), + OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle, + "Enable symbol demangling"), + OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel, + "Enable kernel symbol demangling"), OPT_BOOLEAN(0, "group", &symbol_conf.event_group, "Show event group information together"), OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period, -- cgit v1.2.3 From 52fa82c21f64e900a72437269a5cc9e0034b424e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:00 +0100 Subject: x86: Add insn_decode_kernel() Add a helper to decode kernel instructions; there's no point in endlessly repeating those last two arguments. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210326151259.379242587@infradead.org --- arch/x86/include/asm/insn.h | 2 ++ arch/x86/kernel/alternative.c | 2 +- arch/x86/kernel/cpu/mce/severity.c | 2 +- arch/x86/kernel/kprobes/core.c | 4 ++-- arch/x86/kernel/kprobes/opt.c | 2 +- arch/x86/kernel/traps.c | 2 +- tools/arch/x86/include/asm/insn.h | 2 ++ 7 files changed, 10 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index f03b6ca7dec6..05a6ab940f45 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -150,6 +150,8 @@ enum insn_mode { extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m); +#define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN) + /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) { diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ce28c5c1deba..ff359b3a30e7 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1280,7 +1280,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, if (!emulate) emulate = opcode; - ret = insn_decode(&insn, emulate, MAX_INSN_SIZE, INSN_MODE_KERN); + ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); BUG_ON(len != insn.length); diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index a2136ced9d73..abdd2e40d7c4 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -225,7 +225,7 @@ static bool is_copy_from_user(struct pt_regs *regs) if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return false; - ret = insn_decode(&insn, insn_buf, MAX_INSN_SIZE, INSN_MODE_KERN); + ret = insn_decode_kernel(&insn, insn_buf); if (ret < 0) return false; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index dd0902184708..1319ff47c85c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -285,7 +285,7 @@ static int can_probe(unsigned long paddr) if (!__addr) return 0; - ret = insn_decode(&insn, (void *)__addr, MAX_INSN_SIZE, INSN_MODE_KERN); + ret = insn_decode_kernel(&insn, (void *)__addr); if (ret < 0) return 0; @@ -322,7 +322,7 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) MAX_INSN_SIZE)) return 0; - ret = insn_decode(insn, dest, MAX_INSN_SIZE, INSN_MODE_KERN); + ret = insn_decode_kernel(insn, dest); if (ret < 0) return 0; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 4299fc865732..71425ebba98a 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -324,7 +324,7 @@ static int can_optimize(unsigned long paddr) if (!recovered_insn) return 0; - ret = insn_decode(&insn, (void *)recovered_insn, MAX_INSN_SIZE, INSN_MODE_KERN); + ret = insn_decode_kernel(&insn, (void *)recovered_insn); if (ret < 0) return 0; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a5d254057b88..034f27fc230a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -504,7 +504,7 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, MAX_INSN_SIZE)) return GP_NO_HINT; - ret = insn_decode(&insn, insn_buf, MAX_INSN_SIZE, INSN_MODE_KERN); + ret = insn_decode_kernel(&insn, insn_buf); if (ret < 0) return GP_NO_HINT; diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index c9f3eeebb53b..dc632b41f135 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -150,6 +150,8 @@ enum insn_mode { extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m); +#define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN) + /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) { -- cgit v1.2.3 From e27bfefb21f28d5295432f042b5d9d7871100c35 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 29 Mar 2021 15:31:43 -0700 Subject: tools/resolve_btfids: Fix warnings * make eprintf static, used only in main.c * initialize ret in eprintf * remove unused *tmp v3: * remove another err (Song Liu) v2: * remove unused 'int err = -1' Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210329223143.3659983-1-sdf@google.com --- tools/bpf/resolve_btfids/main.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 80d966cfcaa1..7550fd9c3188 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -115,10 +115,10 @@ struct object { static int verbose; -int eprintf(int level, int var, const char *fmt, ...) +static int eprintf(int level, int var, const char *fmt, ...) { va_list args; - int ret; + int ret = 0; if (var >= level) { va_start(args, fmt); @@ -385,7 +385,7 @@ static int elf_collect(struct object *obj) static int symbols_collect(struct object *obj) { Elf_Scn *scn = NULL; - int n, i, err = 0; + int n, i; GElf_Shdr sh; char *name; @@ -402,11 +402,10 @@ static int symbols_collect(struct object *obj) * Scan symbols and look for the ones starting with * __BTF_ID__* over .BTF_ids section. */ - for (i = 0; !err && i < n; i++) { - char *tmp, *prefix; + for (i = 0; i < n; i++) { + char *prefix; struct btf_id *id; GElf_Sym sym; - int err = -1; if (!gelf_getsym(obj->efile.symbols, i, &sym)) return -1; -- cgit v1.2.3 From 1da07e5db3564789eb598bc772ea50b547194691 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Mar 2021 20:59:54 -0700 Subject: selftests: ethtool: add a netdevsim FEC test Test FEC settings, iterate over configs. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- .../drivers/net/netdevsim/ethtool-common.sh | 5 +- .../selftests/drivers/net/netdevsim/ethtool-fec.sh | 110 +++++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh index 9f64d5c7107b..7ca1f030d209 100644 --- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh @@ -24,8 +24,11 @@ function check { local code=$1 local str=$2 local exp_str=$3 + local exp_fail=$4 - if [ $code -ne 0 ]; then + [ -z "$exp_fail" ] && cop="-ne" || cop="-eq" + + if [ $code $cop 0 ]; then ((num_errors++)) return fi diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh new file mode 100755 index 000000000000..0c56746e9ce0 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only + +source ethtool-common.sh + +NSIM_NETDEV=$(make_netdev) +[ a$ETHTOOL == a ] && ETHTOOL=ethtool + +set -o pipefail + +# netdevsim starts out with None/None +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: None +Active FEC encoding: None" + +# Test Auto +$ETHTOOL --set-fec $NSIM_NETDEV encoding auto +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: Auto +Active FEC encoding: Off" + +# Test case in-sensitivity +for o in off Off OFF; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? "$s" "Configured FEC encodings: Off +Active FEC encoding: Off" +done + +for o in BaseR baser BAser; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? "$s" "Configured FEC encodings: BaseR +Active FEC encoding: BaseR" +done + +for o in llrs rs; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? "$s" "Configured FEC encodings: ${o^^} +Active FEC encoding: ${o^^}" +done + +# Test mutliple bits +$ETHTOOL --set-fec $NSIM_NETDEV encoding rs llrs +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: RS LLRS +Active FEC encoding: LLRS" + +$ETHTOOL --set-fec $NSIM_NETDEV encoding rs off auto +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: Auto Off RS +Active FEC encoding: RS" + +# Make sure other link modes are rejected +$ETHTOOL --set-fec $NSIM_NETDEV encoding FIBRE 2>/dev/null +check $? '' '' 1 + +$ETHTOOL --set-fec $NSIM_NETDEV encoding bla-bla-bla 2>/dev/null +check $? '' '' 1 + +# Try JSON +$ETHTOOL --json --show-fec $NSIM_NETDEV | jq empty >>/dev/null 2>&1 +if [ $? -eq 0 ]; then + $ETHTOOL --set-fec $NSIM_NETDEV encoding auto + check $? + + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].config[]') + check $? "$s" '"Auto"' + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].active[]') + check $? "$s" '"Off"' + + $ETHTOOL --set-fec $NSIM_NETDEV encoding auto RS + check $? + + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].config[]') + check $? "$s" '"Auto" +"RS"' + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].active[]') + check $? "$s" '"RS"' +fi + +# Test error injection +echo 11 > $NSIM_DEV_DFS/ethtool/get_err + +$ETHTOOL --show-fec $NSIM_NETDEV >>/dev/null 2>&1 +check $? '' '' 1 + +echo 0 > $NSIM_DEV_DFS/ethtool/get_err +echo 11 > $NSIM_DEV_DFS/ethtool/set_err + +$ETHTOOL --show-fec $NSIM_NETDEV >>/dev/null 2>&1 +check $? + +$ETHTOOL --set-fec $NSIM_NETDEV encoding RS 2>/dev/null +check $? '' '' 1 + +if [ $num_errors -eq 0 ]; then + echo "PASSED all $((num_passes)) checks" + exit 0 +else + echo "FAILED $num_errors/$((num_errors+num_passes)) checks" + exit 1 +fi -- cgit v1.2.3 From 040806343bb4ef6365166eae666ced8a91b95321 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Thu, 1 Apr 2021 00:40:20 +0000 Subject: selftests/net: so_txtime multi-host support SO_TXTIME hardware offload requires testing across devices, either between machines or separate network namespaces. Split up SO_TXTIME test into tx and rx modes, so traffic can be sent from one process to another. Create a veth-pair on different namespaces and bind each process to an end point via [-S]ource and [-D]estination parameters. Optional start [-t]ime parameter can be passed to synchronize the test across the hosts (with synchorinzed clocks). Signed-off-by: Carlos Llamas Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/so_txtime.c | 247 +++++++++++++++++++++++-------- tools/testing/selftests/net/so_txtime.sh | 97 +++++++++--- 2 files changed, 259 insertions(+), 85 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/so_txtime.c b/tools/testing/selftests/net/so_txtime.c index b4cca382d125..59067f64b775 100644 --- a/tools/testing/selftests/net/so_txtime.c +++ b/tools/testing/selftests/net/so_txtime.c @@ -2,9 +2,12 @@ /* * Test the SO_TXTIME API * - * Takes two streams of { payload, delivery time }[], one input and one output. - * Sends the input stream and verifies arrival matches the output stream. - * The two streams can differ due to out-of-order delivery and drops. + * Takes a stream of { payload, delivery time }[], to be sent across two + * processes. Start this program on two separate network namespaces or + * connected hosts, one instance in transmit mode and the other in receive + * mode using the '-r' option. Receiver will compare arrival timestamps to + * the expected stream. Sender will read transmit timestamps from the error + * queue. The streams can differ due to out-of-order delivery and drops. */ #define _GNU_SOURCE @@ -28,14 +31,17 @@ #include #include #include +#include static int cfg_clockid = CLOCK_TAI; -static bool cfg_do_ipv4; -static bool cfg_do_ipv6; static uint16_t cfg_port = 8000; static int cfg_variance_us = 4000; +static uint64_t cfg_start_time_ns; +static int cfg_mark; +static bool cfg_rx; static uint64_t glob_tstart; +static uint64_t tdeliver_max; /* encode one timed transmission (of a 1B payload) */ struct timed_send { @@ -44,18 +50,21 @@ struct timed_send { }; #define MAX_NUM_PKT 8 -static struct timed_send cfg_in[MAX_NUM_PKT]; -static struct timed_send cfg_out[MAX_NUM_PKT]; +static struct timed_send cfg_buf[MAX_NUM_PKT]; static int cfg_num_pkt; static int cfg_errq_level; static int cfg_errq_type; -static uint64_t gettime_ns(void) +static struct sockaddr_storage cfg_dst_addr; +static struct sockaddr_storage cfg_src_addr; +static socklen_t cfg_alen; + +static uint64_t gettime_ns(clockid_t clock) { struct timespec ts; - if (clock_gettime(cfg_clockid, &ts)) + if (clock_gettime(clock, &ts)) error(1, errno, "gettime"); return ts.tv_sec * (1000ULL * 1000 * 1000) + ts.tv_nsec; @@ -75,6 +84,8 @@ static void do_send_one(int fdt, struct timed_send *ts) msg.msg_iov = &iov; msg.msg_iovlen = 1; + msg.msg_name = (struct sockaddr *)&cfg_dst_addr; + msg.msg_namelen = cfg_alen; if (ts->delay_us >= 0) { memset(control, 0, sizeof(control)); @@ -82,6 +93,8 @@ static void do_send_one(int fdt, struct timed_send *ts) msg.msg_controllen = sizeof(control); tdeliver = glob_tstart + ts->delay_us * 1000; + tdeliver_max = tdeliver_max > tdeliver ? + tdeliver_max : tdeliver; cm = CMSG_FIRSTHDR(&msg); cm->cmsg_level = SOL_SOCKET; @@ -98,7 +111,7 @@ static void do_send_one(int fdt, struct timed_send *ts) } -static bool do_recv_one(int fdr, struct timed_send *ts) +static void do_recv_one(int fdr, struct timed_send *ts) { int64_t tstop, texpect; char rbuf[2]; @@ -106,13 +119,13 @@ static bool do_recv_one(int fdr, struct timed_send *ts) ret = recv(fdr, rbuf, sizeof(rbuf), 0); if (ret == -1 && errno == EAGAIN) - return true; + error(1, EAGAIN, "recv: timeout"); if (ret == -1) error(1, errno, "read"); if (ret != 1) error(1, 0, "read: %dB", ret); - tstop = (gettime_ns() - glob_tstart) / 1000; + tstop = (gettime_ns(cfg_clockid) - glob_tstart) / 1000; texpect = ts->delay_us >= 0 ? ts->delay_us : 0; fprintf(stderr, "payload:%c delay:%lld expected:%lld (us)\n", @@ -123,8 +136,6 @@ static bool do_recv_one(int fdr, struct timed_send *ts) if (llabs(tstop - texpect) > cfg_variance_us) error(1, 0, "exceeds variance (%d us)", cfg_variance_us); - - return false; } static void do_recv_verify_empty(int fdr) @@ -137,18 +148,18 @@ static void do_recv_verify_empty(int fdr) error(1, 0, "recv: not empty as expected (%d, %d)", ret, errno); } -static void do_recv_errqueue_timeout(int fdt) +static int do_recv_errqueue_timeout(int fdt) { char control[CMSG_SPACE(sizeof(struct sock_extended_err)) + CMSG_SPACE(sizeof(struct sockaddr_in6))] = {0}; char data[sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + sizeof(struct udphdr) + 1]; struct sock_extended_err *err; + int ret, num_tstamp = 0; struct msghdr msg = {0}; struct iovec iov = {0}; struct cmsghdr *cm; int64_t tstamp = 0; - int ret; iov.iov_base = data; iov.iov_len = sizeof(data); @@ -206,9 +217,47 @@ static void do_recv_errqueue_timeout(int fdt) msg.msg_flags = 0; msg.msg_controllen = sizeof(control); + num_tstamp++; } - error(1, 0, "recv: timeout"); + return num_tstamp; +} + +static void recv_errqueue_msgs(int fdt) +{ + struct pollfd pfd = { .fd = fdt, .events = POLLERR }; + const int timeout_ms = 10; + int ret, num_tstamp = 0; + + do { + ret = poll(&pfd, 1, timeout_ms); + if (ret == -1) + error(1, errno, "poll"); + + if (ret && (pfd.revents & POLLERR)) + num_tstamp += do_recv_errqueue_timeout(fdt); + + if (num_tstamp == cfg_num_pkt) + break; + + } while (gettime_ns(cfg_clockid) < tdeliver_max); +} + +static void start_time_wait(void) +{ + uint64_t now; + int err; + + if (!cfg_start_time_ns) + return; + + now = gettime_ns(CLOCK_REALTIME); + if (cfg_start_time_ns < now) + return; + + err = usleep((cfg_start_time_ns - now) / 1000); + if (err) + error(1, errno, "usleep"); } static void setsockopt_txtime(int fd) @@ -245,6 +294,10 @@ static int setup_tx(struct sockaddr *addr, socklen_t alen) setsockopt_txtime(fd); + if (cfg_mark && + setsockopt(fd, SOL_SOCKET, SO_MARK, &cfg_mark, sizeof(cfg_mark))) + error(1, errno, "setsockopt mark"); + return fd; } @@ -266,31 +319,70 @@ static int setup_rx(struct sockaddr *addr, socklen_t alen) return fd; } -static void do_test(struct sockaddr *addr, socklen_t alen) +static void do_test_tx(struct sockaddr *addr, socklen_t alen) { - int fdt, fdr, i; + int fdt, i; fprintf(stderr, "\nSO_TXTIME ipv%c clock %s\n", addr->sa_family == PF_INET ? '4' : '6', cfg_clockid == CLOCK_TAI ? "tai" : "monotonic"); fdt = setup_tx(addr, alen); - fdr = setup_rx(addr, alen); - glob_tstart = gettime_ns(); + start_time_wait(); + glob_tstart = gettime_ns(cfg_clockid); for (i = 0; i < cfg_num_pkt; i++) - do_send_one(fdt, &cfg_in[i]); + do_send_one(fdt, &cfg_buf[i]); + + recv_errqueue_msgs(fdt); + + if (close(fdt)) + error(1, errno, "close t"); +} + +static void do_test_rx(struct sockaddr *addr, socklen_t alen) +{ + int fdr, i; + + fdr = setup_rx(addr, alen); + + start_time_wait(); + glob_tstart = gettime_ns(cfg_clockid); + for (i = 0; i < cfg_num_pkt; i++) - if (do_recv_one(fdr, &cfg_out[i])) - do_recv_errqueue_timeout(fdt); + do_recv_one(fdr, &cfg_buf[i]); do_recv_verify_empty(fdr); if (close(fdr)) error(1, errno, "close r"); - if (close(fdt)) - error(1, errno, "close t"); +} + +static void setup_sockaddr(int domain, const char *str_addr, + struct sockaddr_storage *sockaddr) +{ + struct sockaddr_in6 *addr6 = (void *) sockaddr; + struct sockaddr_in *addr4 = (void *) sockaddr; + + switch (domain) { + case PF_INET: + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = AF_INET; + addr4->sin_port = htons(cfg_port); + if (str_addr && + inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) + error(1, 0, "ipv4 parse error: %s", str_addr); + break; + case PF_INET6: + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(cfg_port); + if (str_addr && + inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) + error(1, 0, "ipv6 parse error: %s", str_addr); + break; + } } static int parse_io(const char *optarg, struct timed_send *array) @@ -323,17 +415,46 @@ static int parse_io(const char *optarg, struct timed_send *array) return aoff / 2; } +static void usage(const char *progname) +{ + fprintf(stderr, "\nUsage: %s [options] \n" + "Options:\n" + " -4 only IPv4\n" + " -6 only IPv6\n" + " -c monotonic (default) or tai\n" + " -D destination IP address (server)\n" + " -S source IP address (client)\n" + " -r run rx mode\n" + " -t start time (UTC nanoseconds)\n" + " -m socket mark\n" + "\n", + progname); + exit(1); +} + static void parse_opts(int argc, char **argv) { - int c, ilen, olen; + char *daddr = NULL, *saddr = NULL; + int domain = PF_UNSPEC; + int c; - while ((c = getopt(argc, argv, "46c:")) != -1) { + while ((c = getopt(argc, argv, "46c:S:D:rt:m:")) != -1) { switch (c) { case '4': - cfg_do_ipv4 = true; + if (domain != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + domain = PF_INET; + cfg_alen = sizeof(struct sockaddr_in); + cfg_errq_level = SOL_IP; + cfg_errq_type = IP_RECVERR; break; case '6': - cfg_do_ipv6 = true; + if (domain != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + domain = PF_INET6; + cfg_alen = sizeof(struct sockaddr_in6); + cfg_errq_level = SOL_IPV6; + cfg_errq_type = IPV6_RECVERR; break; case 'c': if (!strcmp(optarg, "tai")) @@ -344,50 +465,50 @@ static void parse_opts(int argc, char **argv) else error(1, 0, "unknown clock id %s", optarg); break; + case 'S': + saddr = optarg; + break; + case 'D': + daddr = optarg; + break; + case 'r': + cfg_rx = true; + break; + case 't': + cfg_start_time_ns = strtol(optarg, NULL, 0); + break; + case 'm': + cfg_mark = strtol(optarg, NULL, 0); + break; default: - error(1, 0, "parse error at %d", optind); + usage(argv[0]); } } - if (argc - optind != 2) - error(1, 0, "Usage: %s [-46] -c ", argv[0]); + if (argc - optind != 1) + usage(argv[0]); + + if (domain == PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + if (!daddr) + error(1, 0, "-D required\n"); + if (!cfg_rx && !saddr) + error(1, 0, "-S required\n"); - ilen = parse_io(argv[optind], cfg_in); - olen = parse_io(argv[optind + 1], cfg_out); - if (ilen != olen) - error(1, 0, "i/o streams len mismatch (%d, %d)\n", ilen, olen); - cfg_num_pkt = ilen; + setup_sockaddr(domain, daddr, &cfg_dst_addr); + setup_sockaddr(domain, saddr, &cfg_src_addr); + + cfg_num_pkt = parse_io(argv[optind], cfg_buf); } int main(int argc, char **argv) { parse_opts(argc, argv); - if (cfg_do_ipv6) { - struct sockaddr_in6 addr6 = {0}; - - addr6.sin6_family = AF_INET6; - addr6.sin6_port = htons(cfg_port); - addr6.sin6_addr = in6addr_loopback; - - cfg_errq_level = SOL_IPV6; - cfg_errq_type = IPV6_RECVERR; - - do_test((void *)&addr6, sizeof(addr6)); - } - - if (cfg_do_ipv4) { - struct sockaddr_in addr4 = {0}; - - addr4.sin_family = AF_INET; - addr4.sin_port = htons(cfg_port); - addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - - cfg_errq_level = SOL_IP; - cfg_errq_type = IP_RECVERR; - - do_test((void *)&addr4, sizeof(addr4)); - } + if (cfg_rx) + do_test_rx((void *)&cfg_dst_addr, cfg_alen); + else + do_test_tx((void *)&cfg_src_addr, cfg_alen); return 0; } diff --git a/tools/testing/selftests/net/so_txtime.sh b/tools/testing/selftests/net/so_txtime.sh index 3f7800eaecb1..3f06f4d286a9 100755 --- a/tools/testing/selftests/net/so_txtime.sh +++ b/tools/testing/selftests/net/so_txtime.sh @@ -3,32 +3,85 @@ # # Regression tests for the SO_TXTIME interface -# Run in network namespace -if [[ $# -eq 0 ]]; then - if ! ./in_netns.sh $0 __subprocess; then - # test is time sensitive, can be flaky - echo "test failed: retry once" - ./in_netns.sh $0 __subprocess +set -e + +readonly DEV="veth0" +readonly BIN="./so_txtime" + +readonly RAND="$(mktemp -u XXXXXX)" +readonly NSPREFIX="ns-${RAND}" +readonly NS1="${NSPREFIX}1" +readonly NS2="${NSPREFIX}2" + +readonly SADDR4='192.168.1.1' +readonly DADDR4='192.168.1.2' +readonly SADDR6='fd::1' +readonly DADDR6='fd::2' + +cleanup() { + ip netns del "${NS2}" + ip netns del "${NS1}" +} + +trap cleanup EXIT + +# Create virtual ethernet pair between network namespaces +ip netns add "${NS1}" +ip netns add "${NS2}" + +ip link add "${DEV}" netns "${NS1}" type veth \ + peer name "${DEV}" netns "${NS2}" + +# Bring the devices up +ip -netns "${NS1}" link set "${DEV}" up +ip -netns "${NS2}" link set "${DEV}" up + +# Set fixed MAC addresses on the devices +ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02 +ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06 + +# Add fixed IP addresses to the devices +ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}" +ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}" +ip -netns "${NS1}" addr add fd::1/64 dev "${DEV}" nodad +ip -netns "${NS2}" addr add fd::2/64 dev "${DEV}" nodad + +do_test() { + local readonly IP="$1" + local readonly CLOCK="$2" + local readonly TXARGS="$3" + local readonly RXARGS="$4" + + if [[ "${IP}" == "4" ]]; then + local readonly SADDR="${SADDR4}" + local readonly DADDR="${DADDR4}" + elif [[ "${IP}" == "6" ]]; then + local readonly SADDR="${SADDR6}" + local readonly DADDR="${DADDR6}" + else + echo "Invalid IP version ${IP}" + exit 1 fi - exit $? -fi + local readonly START="$(date +%s%N --date="+ 0.1 seconds")" + ip netns exec "${NS2}" "${BIN}" -"${IP}" -c "${CLOCK}" -t "${START}" -S "${SADDR}" -D "${DADDR}" "${RXARGS}" -r & + ip netns exec "${NS1}" "${BIN}" -"${IP}" -c "${CLOCK}" -t "${START}" -S "${SADDR}" -D "${DADDR}" "${TXARGS}" + wait "$!" +} -set -e +ip netns exec "${NS1}" tc qdisc add dev "${DEV}" root fq +do_test 4 mono a,-1 a,-1 +do_test 6 mono a,0 a,0 +do_test 6 mono a,10 a,10 +do_test 4 mono a,10,b,20 a,10,b,20 +do_test 6 mono a,20,b,10 b,20,a,20 -tc qdisc add dev lo root fq -./so_txtime -4 -6 -c mono a,-1 a,-1 -./so_txtime -4 -6 -c mono a,0 a,0 -./so_txtime -4 -6 -c mono a,10 a,10 -./so_txtime -4 -6 -c mono a,10,b,20 a,10,b,20 -./so_txtime -4 -6 -c mono a,20,b,10 b,20,a,20 - -if tc qdisc replace dev lo root etf clockid CLOCK_TAI delta 400000; then - ! ./so_txtime -4 -6 -c tai a,-1 a,-1 - ! ./so_txtime -4 -6 -c tai a,0 a,0 - ./so_txtime -4 -6 -c tai a,10 a,10 - ./so_txtime -4 -6 -c tai a,10,b,20 a,10,b,20 - ./so_txtime -4 -6 -c tai a,20,b,10 b,10,a,20 +if ip netns exec "${NS1}" tc qdisc replace dev "${DEV}" root etf clockid CLOCK_TAI delta 400000; then + ! do_test 4 tai a,-1 a,-1 + ! do_test 6 tai a,0 a,0 + do_test 6 tai a,10 a,10 + do_test 4 tai a,10,b,20 a,10,b,20 + do_test 6 tai a,20,b,10 b,10,a,20 else echo "tc ($(tc -V)) does not support qdisc etf. skipping" fi -- cgit v1.2.3 From a7ba4558e69a3c2ae4ca521f015832ef44799538 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:30 -0700 Subject: sock_map: Introduce BPF_SK_SKB_VERDICT Reusing BPF_SK_SKB_STREAM_VERDICT is possible but its name is confusing and more importantly we still want to distinguish them from user-space. So we can just reuse the stream verdict code but introduce a new type of eBPF program, skb_verdict. Users are not allowed to attach stream_verdict and skb_verdict programs to the same map. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-10-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 1 + net/core/skmsg.c | 4 +++- net/core/sock_map.c | 28 ++++++++++++++++++++++++++++ tools/bpf/bpftool/common.c | 1 + tools/bpf/bpftool/prog.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 8 files changed, 38 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e7aba150539d..c83dbc2d81d9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -58,6 +58,7 @@ struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; + struct bpf_prog *skb_verdict; }; enum sk_psock_state_bits { @@ -487,6 +488,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); + psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 598716742593..49371eba98ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9603de81811a..6428634da57e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2948,6 +2948,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_SKB_VERDICT: return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: return BPF_PROG_TYPE_LIRC_MODE2; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 656eceab73bc..a045812d7c78 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.stream_verdict) + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); @@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, } skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.stream_verdict); + if (!prog) + prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 42d797291d34..c2a0411e08a8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -156,6 +156,8 @@ static void sock_map_del_link(struct sock *sk, strp_stop = true; if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; + if (psock->saved_data_ready && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } @@ -232,6 +234,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; + struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; @@ -268,6 +271,15 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } } + skb_verdict = READ_ONCE(progs->skb_verdict); + if (skb_verdict) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) { + ret = PTR_ERR(skb_verdict); + goto out_put_msg_parser; + } + } + no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { @@ -278,6 +290,9 @@ no_progs: if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; @@ -309,6 +324,9 @@ no_progs: } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; @@ -317,6 +335,9 @@ out_unlock_drop: out_drop: sk_psock_put(sk, psock); out_progs: + if (skb_verdict) + bpf_prog_put(skb_verdict); +out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: @@ -1442,8 +1463,15 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #endif case BPF_SK_SKB_STREAM_VERDICT: + if (progs->skb_verdict) + return -EBUSY; pprog = &progs->stream_verdict; break; + case BPF_SK_SKB_VERDICT: + if (progs->stream_verdict) + return -EBUSY; + pprog = &progs->skb_verdict; + break; default: return -EOPNOTSUPP; } diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 65303664417e..1828bba19020 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", [BPF_LIRC_MODE2] = "lirc_mode2", [BPF_FLOW_DISSECTOR] = "flow_dissector", diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f2b915b20546..3f067d2d7584 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -76,6 +76,7 @@ enum dump_mode { static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_SKB_VERDICT] = "skb_verdict", [BPF_SK_MSG_VERDICT] = "msg_verdict", [BPF_FLOW_DISSECTOR] = "flow_dissector", [__MAX_BPF_ATTACH_TYPE] = NULL, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ab9f2233607c..69902603012c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From d6378af615275435ce6e390a538c980ac19b6659 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:36 -0700 Subject: selftests/bpf: Add a test case for udp sockmap Add a test case to ensure redirection between two UDP sockets work. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-16-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/sockmap_listen.c | 136 +++++++++++++++++++++ .../selftests/bpf/progs/test_sockmap_listen.c | 22 ++++ 2 files changed, 158 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index c26e6bf05e49..648d9ae898d2 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1603,6 +1603,141 @@ static void test_reuseport(struct test_sockmap_listen *skel, } } +static void udp_redir_to_connected(int family, int sotype, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + struct sockaddr_storage addr; + int c0, c1, p0, p1; + unsigned int pass; + socklen_t len; + int err, n; + u64 value; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + p0 = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (p0 < 0) + return; + len = sizeof(addr); + err = xgetsockname(p0, sockaddr(&addr), &len); + if (err) + goto close_peer0; + + c0 = xsocket(family, sotype | SOCK_NONBLOCK, 0); + if (c0 < 0) + goto close_peer0; + err = xconnect(c0, sockaddr(&addr), len); + if (err) + goto close_cli0; + err = xgetsockname(c0, sockaddr(&addr), &len); + if (err) + goto close_cli0; + err = xconnect(p0, sockaddr(&addr), len); + if (err) + goto close_cli0; + + p1 = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (p1 < 0) + goto close_cli0; + err = xgetsockname(p1, sockaddr(&addr), &len); + if (err) + goto close_cli0; + + c1 = xsocket(family, sotype | SOCK_NONBLOCK, 0); + if (c1 < 0) + goto close_peer1; + err = xconnect(c1, sockaddr(&addr), len); + if (err) + goto close_cli1; + err = xgetsockname(c1, sockaddr(&addr), &len); + if (err) + goto close_cli1; + err = xconnect(p1, sockaddr(&addr), len); + if (err) + goto close_cli1; + + key = 0; + value = p0; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_cli1; + + key = 1; + value = p1; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_cli1; + + n = write(c1, "a", 1); + if (n < 0) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) + goto close_cli1; + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); + if (err) + goto close_cli1; + if (pass != 1) + FAIL("%s: want pass count 1, have %d", log_prefix, pass); + + n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); + if (n < 0) + FAIL_ERRNO("%s: read", log_prefix); + if (n == 0) + FAIL("%s: incomplete read", log_prefix); + +close_cli1: + xclose(c1); +close_peer1: + xclose(p1); +close_cli0: + xclose(c0); +close_peer0: + xclose(p0); +} + +static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0); + if (err) + return; + + skel->bss->test_ingress = false; + udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_EGRESS); + skel->bss->test_ingress = true; + udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); +} + +static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map, + int family) +{ + const char *family_name, *map_name; + char s[MAX_TEST_NAME]; + + family_name = family_str(family); + map_name = map_type_str(map); + snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); + if (!test__start_subtest(s)) + return; + udp_skb_redir_to_connected(skel, map, family); +} + static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, int family) { @@ -1611,6 +1746,7 @@ static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, test_redir(skel, map, family, SOCK_STREAM); test_reuseport(skel, map, family, SOCK_STREAM); test_reuseport(skel, map, family, SOCK_DGRAM); + test_udp_redir(skel, map, family); } void test_sockmap_listen(void) diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c index fa221141e9c1..a39eba9f5201 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -29,6 +29,7 @@ struct { } verdict_map SEC(".maps"); static volatile bool test_sockmap; /* toggled by user-space */ +static volatile bool test_ingress; /* toggled by user-space */ SEC("sk_skb/stream_parser") int prog_stream_parser(struct __sk_buff *skb) @@ -55,6 +56,27 @@ int prog_stream_verdict(struct __sk_buff *skb) return verdict; } +SEC("sk_skb/skb_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + unsigned int *count; + __u32 zero = 0; + int verdict; + + if (test_sockmap) + verdict = bpf_sk_redirect_map(skb, &sock_map, zero, + test_ingress ? BPF_F_INGRESS : 0); + else + verdict = bpf_sk_redirect_hash(skb, &sock_hash, &zero, + test_ingress ? BPF_F_INGRESS : 0); + + count = bpf_map_lookup_elem(&verdict_map, &verdict); + if (count) + (*count)++; + + return verdict; +} + SEC("sk_msg") int prog_msg_verdict(struct sk_msg_md *msg) { -- cgit v1.2.3 From 8d7cb74f2ccb5486ab8c631a8fcdc7621bbbc42c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:37 -0700 Subject: selftests/bpf: Add a test case for loading BPF_SK_SKB_VERDICT This adds a test case to ensure BPF_SK_SKB_VERDICT and BPF_SK_STREAM_VERDICT will never be attached at the same time. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-17-xiyou.wangcong@gmail.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 40 ++++++++++++++++++++++ .../bpf/progs/test_sockmap_skb_verdict_attach.c | 18 ++++++++++ 2 files changed, 58 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index b8b48cac2ac3..ab77596b64e3 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -7,6 +7,7 @@ #include "test_skmsg_load_helpers.skel.h" #include "test_sockmap_update.skel.h" #include "test_sockmap_invalid_update.skel.h" +#include "test_sockmap_skb_verdict_attach.skel.h" #include "bpf_iter_sockmap.skel.h" #define TCP_REPAIR 19 /* TCP sock is under repair right now */ @@ -281,6 +282,39 @@ out: bpf_iter_sockmap__destroy(skel); } +static void test_sockmap_skb_verdict_attach(enum bpf_attach_type first, + enum bpf_attach_type second) +{ + struct test_sockmap_skb_verdict_attach *skel; + int err, map, verdict; + + skel = test_sockmap_skb_verdict_attach__open_and_load(); + if (CHECK_FAIL(!skel)) { + perror("test_sockmap_skb_verdict_attach__open_and_load"); + return; + } + + verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + map = bpf_map__fd(skel->maps.sock_map); + + err = bpf_prog_attach(verdict, map, first, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach"); + goto out; + } + + err = bpf_prog_attach(verdict, map, second, 0); + assert(err == -1 && errno == EBUSY); + + err = bpf_prog_detach2(verdict, map, first); + if (CHECK_FAIL(err)) { + perror("bpf_prog_detach2"); + goto out; + } +out: + test_sockmap_skb_verdict_attach__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -301,4 +335,10 @@ void test_sockmap_basic(void) test_sockmap_copy(BPF_MAP_TYPE_SOCKMAP); if (test__start_subtest("sockhash copy")) test_sockmap_copy(BPF_MAP_TYPE_SOCKHASH); + if (test__start_subtest("sockmap skb_verdict attach")) { + test_sockmap_skb_verdict_attach(BPF_SK_SKB_VERDICT, + BPF_SK_SKB_STREAM_VERDICT); + test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT, + BPF_SK_SKB_VERDICT); + } } diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c new file mode 100644 index 000000000000..2d31f66e4f23 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_map SEC(".maps"); + +SEC("sk_skb/skb_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + return SK_DROP; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 23c1ad538f4f371bdb67d8a112314842d5db7e5a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:01 +0100 Subject: x86/alternatives: Optimize optimize_nops() Currently, optimize_nops() scans to see if the alternative starts with NOPs. However, the emit pattern is: 141: \oldinstr 142: .skip (len-(142b-141b)), 0x90 That is, when 'oldinstr' is short, the tail is padded with NOPs. This case never gets optimized. Rewrite optimize_nops() to replace any trailing string of NOPs inside the alternative to larger NOPs. Also run it irrespective of patching, replacing NOPs in both the original and replaced code. A direct consequence is that 'padlen' becomes superfluous, so remove it. [ bp: - Adjust commit message - remove a stale comment about needing to pad - add a comment in optimize_nops() - exit early if the NOP verif. loop catches a mismatch - function should not not add NOPs in that case - fix the "optimized NOPs" offsets output ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210326151259.442992235@infradead.org --- arch/x86/include/asm/alternative.h | 17 +++------- arch/x86/kernel/alternative.c | 49 +++++++++++++++++---------- tools/objtool/arch/x86/include/arch/special.h | 2 +- 3 files changed, 37 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 17b36090d448..a3c2315aca12 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -65,7 +65,6 @@ struct alt_instr { u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction */ - u8 padlen; /* length of build-time padding */ } __packed; /* @@ -104,7 +103,6 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alt_end_marker "663" #define alt_slen "662b-661b" -#define alt_pad_len alt_end_marker"b-662b" #define alt_total_slen alt_end_marker"b-661b" #define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" @@ -151,8 +149,7 @@ static inline int alternatives_text_reserved(void *start, void *end) " .long " b_replacement(num)"f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ " .byte " alt_total_slen "\n" /* source len */ \ - " .byte " alt_rlen(num) "\n" /* replacement len */ \ - " .byte " alt_pad_len "\n" /* pad len */ + " .byte " alt_rlen(num) "\n" /* replacement len */ #define ALTINSTR_REPLACEMENT(newinstr, num) /* replacement */ \ "# ALT: replacement " #num "\n" \ @@ -224,9 +221,6 @@ static inline int alternatives_text_reserved(void *start, void *end) * Peculiarities: * No memory clobber here. * Argument numbers start with 1. - * Best is to use constraints that are fixed size (like (%1) ... "r") - * If you use variable sized constraints like "m" or "g" in the - * replacement make sure to pad to the worst case length. * Leaving an unused argument 0 to keep API compatibility. */ #define alternative_input(oldinstr, newinstr, feature, input...) \ @@ -315,13 +309,12 @@ static inline int alternatives_text_reserved(void *start, void *end) * enough information for the alternatives patching code to patch an * instruction. See apply_alternatives(). */ -.macro altinstruction_entry orig alt feature orig_len alt_len pad_len +.macro altinstruction_entry orig alt feature orig_len alt_len .long \orig - . .long \alt - . .word \feature .byte \orig_len .byte \alt_len - .byte \pad_len .endm /* @@ -338,7 +331,7 @@ static inline int alternatives_text_reserved(void *start, void *end) 142: .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f .popsection .pushsection .altinstr_replacement,"ax" @@ -375,8 +368,8 @@ static inline int alternatives_text_reserved(void *start, void *end) 142: .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b - altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f .popsection .pushsection .altinstr_replacement,"ax" diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 80adf5a81a79..84ec0ba491e4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -189,19 +189,35 @@ done: static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; - int i; + struct insn insn; + int nop, i = 0; + + /* + * Jump over the non-NOP insns, the remaining bytes must be single-byte + * NOPs, optimize them. + */ + for (;;) { + if (insn_decode_kernel(&insn, &instr[i])) + return; + + if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) + break; + + if ((i += insn.length) >= a->instrlen) + return; + } - for (i = 0; i < a->padlen; i++) { - if (instr[i] != 0x90) + for (nop = i; i < a->instrlen; i++) { + if (WARN_ONCE(instr[i] != 0x90, "Not a NOP at 0x%px\n", &instr[i])) return; } local_irq_save(flags); - add_nops(instr + (a->instrlen - a->padlen), a->padlen); + add_nops(instr + nop, i - nop); local_irq_restore(flags); DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", - instr, a->instrlen - a->padlen, a->padlen); + instr, nop, a->instrlen); } /* @@ -247,19 +263,15 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * - feature not present but ALTINSTR_FLAG_INV is set to mean, * patch if feature is *NOT* present. */ - if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) { - if (a->padlen > 1) - optimize_nops(a, instr); - - continue; - } + if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) + goto next; - DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d", + DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "", feature >> 5, feature & 0x1f, instr, instr, a->instrlen, - replacement, a->replacementlen, a->padlen); + replacement, a->replacementlen); DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); @@ -283,14 +295,15 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, if (a->replacementlen && is_jmp(replacement[0])) recompute_jump(a, instr, replacement, insn_buff); - if (a->instrlen > a->replacementlen) { - add_nops(insn_buff + a->replacementlen, - a->instrlen - a->replacementlen); - insn_buff_sz += a->instrlen - a->replacementlen; - } + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) + insn_buff[insn_buff_sz] = 0x90; + DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); text_poke_early(instr, insn_buff, insn_buff_sz); + +next: + optimize_nops(a, instr); } } diff --git a/tools/objtool/arch/x86/include/arch/special.h b/tools/objtool/arch/x86/include/arch/special.h index d818b2bffa02..14271cca0c74 100644 --- a/tools/objtool/arch/x86/include/arch/special.h +++ b/tools/objtool/arch/x86/include/arch/special.h @@ -10,7 +10,7 @@ #define JUMP_ORIG_OFFSET 0 #define JUMP_NEW_OFFSET 4 -#define ALT_ENTRY_SIZE 13 +#define ALT_ENTRY_SIZE 12 #define ALT_ORIG_OFFSET 0 #define ALT_NEW_OFFSET 4 #define ALT_FEATURE_OFFSET 8 -- cgit v1.2.3 From 119251855f9adf9421cb5eb409933092141ab2c7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:02 +0100 Subject: x86/retpoline: Simplify retpolines Due to: c9c324dc22aa ("objtool: Support stack layout changes in alternatives") it is now possible to simplify the retpolines. Currently our retpolines consist of 2 symbols: - __x86_indirect_thunk_\reg: the compiler target - __x86_retpoline_\reg: the actual retpoline. Both are consecutive in code and aligned such that for any one register they both live in the same cacheline: 0000000000000000 <__x86_indirect_thunk_rax>: 0: ff e0 jmpq *%rax 2: 90 nop 3: 90 nop 4: 90 nop 0000000000000005 <__x86_retpoline_rax>: 5: e8 07 00 00 00 callq 11 <__x86_retpoline_rax+0xc> a: f3 90 pause c: 0f ae e8 lfence f: eb f9 jmp a <__x86_retpoline_rax+0x5> 11: 48 89 04 24 mov %rax,(%rsp) 15: c3 retq 16: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:0x0(%rax,%rax,1) The thunk is an alternative_2, where one option is a JMP to the retpoline. This was done so that objtool didn't need to deal with alternatives with stack ops. But that problem has been solved, so now it is possible to fold the entire retpoline into the alternative to simplify and consolidate unused bytes: 0000000000000000 <__x86_indirect_thunk_rax>: 0: ff e0 jmpq *%rax 2: 90 nop 3: 90 nop 4: 90 nop 5: 90 nop 6: 90 nop 7: 90 nop 8: 90 nop 9: 90 nop a: 90 nop b: 90 nop c: 90 nop d: 90 nop e: 90 nop f: 90 nop 10: 90 nop 11: 66 66 2e 0f 1f 84 00 00 00 00 00 data16 nopw %cs:0x0(%rax,%rax,1) 1c: 0f 1f 40 00 nopl 0x0(%rax) Notice that since the longest alternative sequence is now: 0: e8 07 00 00 00 callq c <.altinstr_replacement+0xc> 5: f3 90 pause 7: 0f ae e8 lfence a: eb f9 jmp 5 <.altinstr_replacement+0x5> c: 48 89 04 24 mov %rax,(%rsp) 10: c3 retq 17 bytes, we have 15 bytes NOP at the end of our 32 byte slot. (IOW, if we can shrink the retpoline by 1 byte we can pack it more densely). [ bp: Massage commit message. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210326151259.506071949@infradead.org --- arch/x86/include/asm/asm-prototypes.h | 7 ------- arch/x86/include/asm/nospec-branch.h | 6 +++--- arch/x86/lib/retpoline.S | 34 +++++++++++++++++----------------- tools/objtool/check.c | 3 +-- 4 files changed, 21 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 51e2bf27cc9b..0545b07895a5 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -22,15 +22,8 @@ extern void cmpxchg8b_emu(void); #define DECL_INDIRECT_THUNK(reg) \ extern asmlinkage void __x86_indirect_thunk_ ## reg (void); -#define DECL_RETPOLINE(reg) \ - extern asmlinkage void __x86_retpoline_ ## reg (void); - #undef GEN #define GEN(reg) DECL_INDIRECT_THUNK(reg) #include -#undef GEN -#define GEN(reg) DECL_RETPOLINE(reg) -#include - #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 529f8e9380d8..664be73daa8c 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -80,7 +80,7 @@ .macro JMP_NOSPEC reg:req #ifdef CONFIG_RETPOLINE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ - __stringify(jmp __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD #else jmp *%\reg @@ -90,7 +90,7 @@ .macro CALL_NOSPEC reg:req #ifdef CONFIG_RETPOLINE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \ - __stringify(call __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD #else call *%\reg @@ -128,7 +128,7 @@ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - "call __x86_retpoline_%V[thunk_target]\n", \ + "call __x86_indirect_thunk_%V[thunk_target]\n", \ X86_FEATURE_RETPOLINE, \ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 6bb74b5c238c..d2c0d14c1e22 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -10,27 +10,31 @@ #include #include -.macro THUNK reg - .section .text.__x86.indirect_thunk - - .align 32 -SYM_FUNC_START(__x86_indirect_thunk_\reg) - JMP_NOSPEC \reg -SYM_FUNC_END(__x86_indirect_thunk_\reg) - -SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg) +.macro RETPOLINE reg ANNOTATE_INTRA_FUNCTION_CALL - call .Ldo_rop_\@ + call .Ldo_rop_\@ .Lspec_trap_\@: UNWIND_HINT_EMPTY pause lfence - jmp .Lspec_trap_\@ + jmp .Lspec_trap_\@ .Ldo_rop_\@: - mov %\reg, (%_ASM_SP) + mov %\reg, (%_ASM_SP) UNWIND_HINT_FUNC ret -SYM_FUNC_END(__x86_retpoline_\reg) +.endm + +.macro THUNK reg + .section .text.__x86.indirect_thunk + + .align 32 +SYM_FUNC_START(__x86_indirect_thunk_\reg) + + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ + __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD + +SYM_FUNC_END(__x86_indirect_thunk_\reg) .endm @@ -48,7 +52,6 @@ SYM_FUNC_END(__x86_retpoline_\reg) #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) #define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) -#define EXPORT_RETPOLINE(reg) __EXPORT_THUNK(__x86_retpoline_ ## reg) #undef GEN #define GEN(reg) THUNK reg @@ -58,6 +61,3 @@ SYM_FUNC_END(__x86_retpoline_\reg) #define GEN(reg) EXPORT_THUNK(reg) #include -#undef GEN -#define GEN(reg) EXPORT_RETPOLINE(reg) -#include diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 5e5388a38e2a..d45f0181f133 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -872,8 +872,7 @@ static int add_jump_destinations(struct objtool_file *file) } else if (reloc->sym->type == STT_SECTION) { dest_sec = reloc->sym->sec; dest_off = arch_dest_reloc_offset(reloc->addend); - } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21) || - !strncmp(reloc->sym->name, "__x86_retpoline_", 16)) { + } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21)) { /* * Retpoline jumps are really dynamic jumps in * disguise, so convert them accordingly. -- cgit v1.2.3 From bcb1b6ff39da7e8a6a986eb08126fba2b5e13c32 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:03 +0100 Subject: objtool: Correctly handle retpoline thunk calls Just like JMP handling, convert a direct CALL to a retpoline thunk into a retpoline safe indirect CALL. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151259.567568238@infradead.org --- tools/objtool/check.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d45f0181f133..519af4be7018 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1025,6 +1025,18 @@ static int add_call_destinations(struct objtool_file *file) dest_off); return -1; } + + } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21)) { + /* + * Retpoline calls are really dynamic calls in + * disguise, so convert them accordingly. + */ + insn->type = INSN_CALL_DYNAMIC; + insn->retpoline_safe = true; + + remove_insn_ops(insn); + continue; + } else insn->call_dest = reloc->sym; -- cgit v1.2.3 From 530b4ddd9dd92b263081f5c7786d39a8129c8b2d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:04 +0100 Subject: objtool: Handle per arch retpoline naming The __x86_indirect_ naming is obviously not generic. Shorten to allow matching some additional magic names later. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151259.630296706@infradead.org --- tools/objtool/arch/x86/decode.c | 5 +++++ tools/objtool/check.c | 9 +++++++-- tools/objtool/include/objtool/arch.h | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index ba9ebff7e8f7..782894eb351a 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -648,3 +648,8 @@ int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg) return 0; } + +bool arch_is_retpoline(struct symbol *sym) +{ + return !strncmp(sym->name, "__x86_indirect_", 15); +} diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 519af4be7018..6fbc00122d27 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -850,6 +850,11 @@ static int add_ignore_alternatives(struct objtool_file *file) return 0; } +__weak bool arch_is_retpoline(struct symbol *sym) +{ + return false; +} + /* * Find the destination instructions for all jumps. */ @@ -872,7 +877,7 @@ static int add_jump_destinations(struct objtool_file *file) } else if (reloc->sym->type == STT_SECTION) { dest_sec = reloc->sym->sec; dest_off = arch_dest_reloc_offset(reloc->addend); - } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21)) { + } else if (arch_is_retpoline(reloc->sym)) { /* * Retpoline jumps are really dynamic jumps in * disguise, so convert them accordingly. @@ -1026,7 +1031,7 @@ static int add_call_destinations(struct objtool_file *file) return -1; } - } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21)) { + } else if (arch_is_retpoline(reloc->sym)) { /* * Retpoline calls are really dynamic calls in * disguise, so convert them accordingly. diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 6ff0685f5cc5..bb30993a1ced 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -86,4 +86,6 @@ const char *arch_nop_insn(int len); int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg); +bool arch_is_retpoline(struct symbol *sym); + #endif /* _ARCH_H */ -- cgit v1.2.3 From a958c4fea768d2c378c89032ab41d38da2a24422 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:05 +0100 Subject: objtool: Fix static_call list generation Currently, objtool generates tail call entries in add_jump_destination() but waits until validate_branch() to generate the regular call entries. Move these to add_call_destination() for consistency. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151259.691529901@infradead.org --- tools/objtool/check.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 6fbc00122d27..8618d03f61ec 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1045,6 +1045,11 @@ static int add_call_destinations(struct objtool_file *file) } else insn->call_dest = reloc->sym; + if (insn->call_dest && insn->call_dest->static_call_tramp) { + list_add_tail(&insn->static_call_node, + &file->static_call_list); + } + /* * Many compilers cannot disable KCOV with a function attribute * so they need a little help, NOP out any KCOV calls from noinstr @@ -1788,6 +1793,9 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + /* + * Must be before add_{jump_call}_destination. + */ ret = read_static_call_tramps(file); if (ret) return ret; @@ -1800,6 +1808,10 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + /* + * Must be before add_call_destination(); it changes INSN_CALL to + * INSN_JUMP. + */ ret = read_intra_function_calls(file); if (ret) return ret; @@ -2762,11 +2774,6 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (dead_end_function(file, insn->call_dest)) return 0; - if (insn->type == INSN_CALL && insn->call_dest->static_call_tramp) { - list_add_tail(&insn->static_call_node, - &file->static_call_list); - } - break; case INSN_JUMP_CONDITIONAL: -- cgit v1.2.3 From 3a647607b57ad8346e659ddd3b951ac292c83690 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:06 +0100 Subject: objtool: Rework the elf_rebuild_reloc_section() logic Instead of manually calling elf_rebuild_reloc_section() on sections we've called elf_add_reloc() on, have elf_write() DTRT. This makes it easier to add random relocations in places without carefully tracking when we're done and need to flush what section. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151259.754213408@infradead.org --- tools/objtool/check.c | 6 ------ tools/objtool/elf.c | 20 ++++++++++++++------ tools/objtool/include/objtool/elf.h | 1 - tools/objtool/orc_gen.c | 3 --- 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 8618d03f61ec..1d0415b3391a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -542,9 +542,6 @@ static int create_static_call_sections(struct objtool_file *file) idx++; } - if (elf_rebuild_reloc_section(file->elf, reloc_sec)) - return -1; - return 0; } @@ -614,9 +611,6 @@ static int create_mcount_loc_sections(struct objtool_file *file) idx++; } - if (elf_rebuild_reloc_section(file->elf, reloc_sec)) - return -1; - return 0; } diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 93fa833a49a5..374813eafa52 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -479,6 +479,8 @@ void elf_add_reloc(struct elf *elf, struct reloc *reloc) list_add_tail(&reloc->list, &sec->reloc_list); elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc)); + + sec->changed = true; } static int read_rel_reloc(struct section *sec, int i, struct reloc *reloc, unsigned int *symndx) @@ -558,7 +560,9 @@ static int read_relocs(struct elf *elf) return -1; } - elf_add_reloc(elf, reloc); + list_add_tail(&reloc->list, &sec->reloc_list); + elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc)); + nr_reloc++; } max_reloc = max(max_reloc, nr_reloc); @@ -873,14 +877,11 @@ static int elf_rebuild_rela_reloc_section(struct section *sec, int nr) return 0; } -int elf_rebuild_reloc_section(struct elf *elf, struct section *sec) +static int elf_rebuild_reloc_section(struct elf *elf, struct section *sec) { struct reloc *reloc; int nr; - sec->changed = true; - elf->changed = true; - nr = 0; list_for_each_entry(reloc, &sec->reloc_list, list) nr++; @@ -944,9 +945,15 @@ int elf_write(struct elf *elf) struct section *sec; Elf_Scn *s; - /* Update section headers for changed sections: */ + /* Update changed relocation sections and section headers: */ list_for_each_entry(sec, &elf->sections, list) { if (sec->changed) { + if (sec->base && + elf_rebuild_reloc_section(elf, sec)) { + WARN("elf_rebuild_reloc_section"); + return -1; + } + s = elf_getscn(elf->elf, sec->idx); if (!s) { WARN_ELF("elf_getscn"); @@ -958,6 +965,7 @@ int elf_write(struct elf *elf) } sec->changed = false; + elf->changed = true; } } diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index e6890cc70a25..fc576edacfc4 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -142,7 +142,6 @@ struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *se struct symbol *find_func_containing(struct section *sec, unsigned long offset); void insn_to_reloc_sym_addend(struct section *sec, unsigned long offset, struct reloc *reloc); -int elf_rebuild_reloc_section(struct elf *elf, struct section *sec); #define for_each_sec(file, sec) \ list_for_each_entry(sec, &file->elf->sections, list) diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index 738aa5021bc4..f5347089550e 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -254,8 +254,5 @@ int orc_create(struct objtool_file *file) return -1; } - if (elf_rebuild_reloc_section(file->elf, ip_rsec)) - return -1; - return 0; } -- cgit v1.2.3 From ef47cc01cb4abcd760d8ac66b9361d6ade4d0846 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:07 +0100 Subject: objtool: Add elf_create_reloc() helper We have 4 instances of adding a relocation. Create a common helper to avoid growing even more. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151259.817438847@infradead.org --- tools/objtool/check.c | 78 +++++++-------------------------- tools/objtool/elf.c | 86 ++++++++++++++++++++++++------------- tools/objtool/include/objtool/elf.h | 10 +++-- tools/objtool/orc_gen.c | 30 +++---------- 4 files changed, 85 insertions(+), 119 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 1d0415b3391a..61fe29ae4cd4 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -433,8 +433,7 @@ reachable: static int create_static_call_sections(struct objtool_file *file) { - struct section *sec, *reloc_sec; - struct reloc *reloc; + struct section *sec; struct static_call_site *site; struct instruction *insn; struct symbol *key_sym; @@ -460,8 +459,7 @@ static int create_static_call_sections(struct objtool_file *file) if (!sec) return -1; - reloc_sec = elf_create_reloc_section(file->elf, sec, SHT_RELA); - if (!reloc_sec) + if (!elf_create_reloc_section(file->elf, sec, SHT_RELA)) return -1; idx = 0; @@ -471,25 +469,11 @@ static int create_static_call_sections(struct objtool_file *file) memset(site, 0, sizeof(struct static_call_site)); /* populate reloc for 'addr' */ - reloc = malloc(sizeof(*reloc)); - - if (!reloc) { - perror("malloc"); - return -1; - } - memset(reloc, 0, sizeof(*reloc)); - - insn_to_reloc_sym_addend(insn->sec, insn->offset, reloc); - if (!reloc->sym) { - WARN_FUNC("static call tramp: missing containing symbol", - insn->sec, insn->offset); + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(struct static_call_site), + R_X86_64_PC32, + insn->sec, insn->offset)) return -1; - } - - reloc->type = R_X86_64_PC32; - reloc->offset = idx * sizeof(struct static_call_site); - reloc->sec = reloc_sec; - elf_add_reloc(file->elf, reloc); /* find key symbol */ key_name = strdup(insn->call_dest->name); @@ -526,18 +510,11 @@ static int create_static_call_sections(struct objtool_file *file) free(key_name); /* populate reloc for 'key' */ - reloc = malloc(sizeof(*reloc)); - if (!reloc) { - perror("malloc"); + if (elf_add_reloc(file->elf, sec, + idx * sizeof(struct static_call_site) + 4, + R_X86_64_PC32, key_sym, + is_sibling_call(insn) * STATIC_CALL_SITE_TAIL)) return -1; - } - memset(reloc, 0, sizeof(*reloc)); - reloc->sym = key_sym; - reloc->addend = is_sibling_call(insn) ? STATIC_CALL_SITE_TAIL : 0; - reloc->type = R_X86_64_PC32; - reloc->offset = idx * sizeof(struct static_call_site) + 4; - reloc->sec = reloc_sec; - elf_add_reloc(file->elf, reloc); idx++; } @@ -547,8 +524,7 @@ static int create_static_call_sections(struct objtool_file *file) static int create_mcount_loc_sections(struct objtool_file *file) { - struct section *sec, *reloc_sec; - struct reloc *reloc; + struct section *sec; unsigned long *loc; struct instruction *insn; int idx; @@ -571,8 +547,7 @@ static int create_mcount_loc_sections(struct objtool_file *file) if (!sec) return -1; - reloc_sec = elf_create_reloc_section(file->elf, sec, SHT_RELA); - if (!reloc_sec) + if (!elf_create_reloc_section(file->elf, sec, SHT_RELA)) return -1; idx = 0; @@ -581,32 +556,11 @@ static int create_mcount_loc_sections(struct objtool_file *file) loc = (unsigned long *)sec->data->d_buf + idx; memset(loc, 0, sizeof(unsigned long)); - reloc = malloc(sizeof(*reloc)); - if (!reloc) { - perror("malloc"); + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(unsigned long), + R_X86_64_64, + insn->sec, insn->offset)) return -1; - } - memset(reloc, 0, sizeof(*reloc)); - - if (insn->sec->sym) { - reloc->sym = insn->sec->sym; - reloc->addend = insn->offset; - } else { - reloc->sym = find_symbol_containing(insn->sec, insn->offset); - - if (!reloc->sym) { - WARN("missing symbol for insn at offset 0x%lx\n", - insn->offset); - return -1; - } - - reloc->addend = insn->offset - reloc->sym->offset; - } - - reloc->type = R_X86_64_64; - reloc->offset = idx * sizeof(unsigned long); - reloc->sec = reloc_sec; - elf_add_reloc(file->elf, reloc); idx++; } diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 374813eafa52..0ab52ac3f861 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -211,32 +211,6 @@ struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, uns return find_reloc_by_dest_range(elf, sec, offset, 1); } -void insn_to_reloc_sym_addend(struct section *sec, unsigned long offset, - struct reloc *reloc) -{ - if (sec->sym) { - reloc->sym = sec->sym; - reloc->addend = offset; - return; - } - - /* - * The Clang assembler strips section symbols, so we have to reference - * the function symbol instead: - */ - reloc->sym = find_symbol_containing(sec, offset); - if (!reloc->sym) { - /* - * Hack alert. This happens when we need to reference the NOP - * pad insn immediately after the function. - */ - reloc->sym = find_symbol_containing(sec, offset - 1); - } - - if (reloc->sym) - reloc->addend = offset - reloc->sym->offset; -} - static int read_sections(struct elf *elf) { Elf_Scn *s = NULL; @@ -473,14 +447,66 @@ err: return -1; } -void elf_add_reloc(struct elf *elf, struct reloc *reloc) +int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, + unsigned int type, struct symbol *sym, int addend) { - struct section *sec = reloc->sec; + struct reloc *reloc; + + reloc = malloc(sizeof(*reloc)); + if (!reloc) { + perror("malloc"); + return -1; + } + memset(reloc, 0, sizeof(*reloc)); - list_add_tail(&reloc->list, &sec->reloc_list); + reloc->sec = sec->reloc; + reloc->offset = offset; + reloc->type = type; + reloc->sym = sym; + reloc->addend = addend; + + list_add_tail(&reloc->list, &sec->reloc->reloc_list); elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc)); - sec->changed = true; + sec->reloc->changed = true; + + return 0; +} + +int elf_add_reloc_to_insn(struct elf *elf, struct section *sec, + unsigned long offset, unsigned int type, + struct section *insn_sec, unsigned long insn_off) +{ + struct symbol *sym; + int addend; + + if (insn_sec->sym) { + sym = insn_sec->sym; + addend = insn_off; + + } else { + /* + * The Clang assembler strips section symbols, so we have to + * reference the function symbol instead: + */ + sym = find_symbol_containing(insn_sec, insn_off); + if (!sym) { + /* + * Hack alert. This happens when we need to reference + * the NOP pad insn immediately after the function. + */ + sym = find_symbol_containing(insn_sec, insn_off - 1); + } + + if (!sym) { + WARN("can't find symbol containing %s+0x%lx", insn_sec->name, insn_off); + return -1; + } + + addend = insn_off - sym->offset; + } + + return elf_add_reloc(elf, sec, offset, type, sym, addend); } static int read_rel_reloc(struct section *sec, int i, struct reloc *reloc, unsigned int *symndx) diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index fc576edacfc4..825ad326201d 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -123,7 +123,13 @@ static inline u32 reloc_hash(struct reloc *reloc) struct elf *elf_open_read(const char *name, int flags); struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr); struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype); -void elf_add_reloc(struct elf *elf, struct reloc *reloc); + +int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, + unsigned int type, struct symbol *sym, int addend); +int elf_add_reloc_to_insn(struct elf *elf, struct section *sec, + unsigned long offset, unsigned int type, + struct section *insn_sec, unsigned long insn_off); + int elf_write_insn(struct elf *elf, struct section *sec, unsigned long offset, unsigned int len, const char *insn); @@ -140,8 +146,6 @@ struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, uns struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len); struct symbol *find_func_containing(struct section *sec, unsigned long offset); -void insn_to_reloc_sym_addend(struct section *sec, unsigned long offset, - struct reloc *reloc); #define for_each_sec(file, sec) \ list_for_each_entry(sec, &file->elf->sections, list) diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index f5347089550e..1b57be6508f4 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -82,12 +82,11 @@ static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi) } static int write_orc_entry(struct elf *elf, struct section *orc_sec, - struct section *ip_rsec, unsigned int idx, + struct section *ip_sec, unsigned int idx, struct section *insn_sec, unsigned long insn_off, struct orc_entry *o) { struct orc_entry *orc; - struct reloc *reloc; /* populate ORC data */ orc = (struct orc_entry *)orc_sec->data->d_buf + idx; @@ -96,25 +95,9 @@ static int write_orc_entry(struct elf *elf, struct section *orc_sec, orc->bp_offset = bswap_if_needed(orc->bp_offset); /* populate reloc for ip */ - reloc = malloc(sizeof(*reloc)); - if (!reloc) { - perror("malloc"); + if (elf_add_reloc_to_insn(elf, ip_sec, idx * sizeof(int), R_X86_64_PC32, + insn_sec, insn_off)) return -1; - } - memset(reloc, 0, sizeof(*reloc)); - - insn_to_reloc_sym_addend(insn_sec, insn_off, reloc); - if (!reloc->sym) { - WARN("missing symbol for insn at offset 0x%lx", - insn_off); - return -1; - } - - reloc->type = R_X86_64_PC32; - reloc->offset = idx * sizeof(int); - reloc->sec = ip_rsec; - - elf_add_reloc(elf, reloc); return 0; } @@ -153,7 +136,7 @@ static unsigned long alt_group_len(struct alt_group *alt_group) int orc_create(struct objtool_file *file) { - struct section *sec, *ip_rsec, *orc_sec; + struct section *sec, *orc_sec; unsigned int nr = 0, idx = 0; struct orc_list_entry *entry; struct list_head orc_list; @@ -242,13 +225,12 @@ int orc_create(struct objtool_file *file) sec = elf_create_section(file->elf, ".orc_unwind_ip", 0, sizeof(int), nr); if (!sec) return -1; - ip_rsec = elf_create_reloc_section(file->elf, sec, SHT_RELA); - if (!ip_rsec) + if (!elf_create_reloc_section(file->elf, sec, SHT_RELA)) return -1; /* Write ORC entries to sections: */ list_for_each_entry(entry, &orc_list, list) { - if (write_orc_entry(file->elf, orc_sec, ip_rsec, idx++, + if (write_orc_entry(file->elf, orc_sec, sec, idx++, entry->insn_sec, entry->insn_off, &entry->orc)) return -1; -- cgit v1.2.3 From d0c5c4cc73da0b05b0d9e5f833f2d859e1b45f8e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:08 +0100 Subject: objtool: Create reloc sections implicitly Have elf_add_reloc() create the relocation section implicitly. Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151259.880174448@infradead.org --- tools/objtool/check.c | 6 ------ tools/objtool/elf.c | 9 ++++++++- tools/objtool/include/objtool/elf.h | 1 - tools/objtool/orc_gen.c | 2 -- 4 files changed, 8 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 61fe29ae4cd4..600fa67153d3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -459,9 +459,6 @@ static int create_static_call_sections(struct objtool_file *file) if (!sec) return -1; - if (!elf_create_reloc_section(file->elf, sec, SHT_RELA)) - return -1; - idx = 0; list_for_each_entry(insn, &file->static_call_list, static_call_node) { @@ -547,9 +544,6 @@ static int create_mcount_loc_sections(struct objtool_file *file) if (!sec) return -1; - if (!elf_create_reloc_section(file->elf, sec, SHT_RELA)) - return -1; - idx = 0; list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) { diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 0ab52ac3f861..7b65ae3beea4 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -447,11 +447,18 @@ err: return -1; } +static struct section *elf_create_reloc_section(struct elf *elf, + struct section *base, + int reltype); + int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, unsigned int type, struct symbol *sym, int addend) { struct reloc *reloc; + if (!sec->reloc && !elf_create_reloc_section(elf, sec, SHT_RELA)) + return -1; + reloc = malloc(sizeof(*reloc)); if (!reloc) { perror("malloc"); @@ -829,7 +836,7 @@ static struct section *elf_create_rela_reloc_section(struct elf *elf, struct sec return sec; } -struct section *elf_create_reloc_section(struct elf *elf, +static struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype) { diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 825ad326201d..463f329f1c66 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -122,7 +122,6 @@ static inline u32 reloc_hash(struct reloc *reloc) struct elf *elf_open_read(const char *name, int flags); struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr); -struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype); int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset, unsigned int type, struct symbol *sym, int addend); diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index 1b57be6508f4..dc9b7dd314b0 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -225,8 +225,6 @@ int orc_create(struct objtool_file *file) sec = elf_create_section(file->elf, ".orc_unwind_ip", 0, sizeof(int), nr); if (!sec) return -1; - if (!elf_create_reloc_section(file->elf, sec, SHT_RELA)) - return -1; /* Write ORC entries to sections: */ list_for_each_entry(entry, &orc_list, list) { -- cgit v1.2.3 From 417a4dc91e559f92404c2544f785b02ce75784c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:09 +0100 Subject: objtool: Extract elf_strtab_concat() Create a common helper to append strings to a strtab. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151259.941474004@infradead.org --- tools/objtool/elf.c | 60 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 7b65ae3beea4..c278a04874ad 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -673,13 +673,48 @@ err: return NULL; } +static int elf_add_string(struct elf *elf, struct section *strtab, char *str) +{ + Elf_Data *data; + Elf_Scn *s; + int len; + + if (!strtab) + strtab = find_section_by_name(elf, ".strtab"); + if (!strtab) { + WARN("can't find .strtab section"); + return -1; + } + + s = elf_getscn(elf->elf, strtab->idx); + if (!s) { + WARN_ELF("elf_getscn"); + return -1; + } + + data = elf_newdata(s); + if (!data) { + WARN_ELF("elf_newdata"); + return -1; + } + + data->d_buf = str; + data->d_size = strlen(str) + 1; + data->d_align = 1; + + len = strtab->len; + strtab->len += data->d_size; + strtab->changed = true; + + return len; +} + struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr) { struct section *sec, *shstrtab; size_t size = entsize * nr; Elf_Scn *s; - Elf_Data *data; sec = malloc(sizeof(*sec)); if (!sec) { @@ -736,7 +771,6 @@ struct section *elf_create_section(struct elf *elf, const char *name, sec->sh.sh_addralign = 1; sec->sh.sh_flags = SHF_ALLOC | sh_flags; - /* Add section name to .shstrtab (or .strtab for Clang) */ shstrtab = find_section_by_name(elf, ".shstrtab"); if (!shstrtab) @@ -745,27 +779,9 @@ struct section *elf_create_section(struct elf *elf, const char *name, WARN("can't find .shstrtab or .strtab section"); return NULL; } - - s = elf_getscn(elf->elf, shstrtab->idx); - if (!s) { - WARN_ELF("elf_getscn"); + sec->sh.sh_name = elf_add_string(elf, shstrtab, sec->name); + if (sec->sh.sh_name == -1) return NULL; - } - - data = elf_newdata(s); - if (!data) { - WARN_ELF("elf_newdata"); - return NULL; - } - - data->d_buf = sec->name; - data->d_size = strlen(name) + 1; - data->d_align = 1; - - sec->sh.sh_name = shstrtab->len; - - shstrtab->len += strlen(name) + 1; - shstrtab->changed = true; list_add_tail(&sec->list, &elf->sections); elf_hash_add(elf->section_hash, &sec->hash, sec->idx); -- cgit v1.2.3 From 9a7827b7789c630c1efdb121daa42c6e77dce97f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:10 +0100 Subject: objtool: Extract elf_symbol_add() Create a common helper to add symbols. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151300.003468981@infradead.org --- tools/objtool/elf.c | 56 +++++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index c278a04874ad..8457218dcf22 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -290,12 +290,39 @@ static int read_sections(struct elf *elf) return 0; } +static void elf_add_symbol(struct elf *elf, struct symbol *sym) +{ + struct list_head *entry; + struct rb_node *pnode; + + sym->type = GELF_ST_TYPE(sym->sym.st_info); + sym->bind = GELF_ST_BIND(sym->sym.st_info); + + sym->offset = sym->sym.st_value; + sym->len = sym->sym.st_size; + + rb_add(&sym->node, &sym->sec->symbol_tree, symbol_to_offset); + pnode = rb_prev(&sym->node); + if (pnode) + entry = &rb_entry(pnode, struct symbol, node)->list; + else + entry = &sym->sec->symbol_list; + list_add(&sym->list, entry); + elf_hash_add(elf->symbol_hash, &sym->hash, sym->idx); + elf_hash_add(elf->symbol_name_hash, &sym->name_hash, str_hash(sym->name)); + + /* + * Don't store empty STT_NOTYPE symbols in the rbtree. They + * can exist within a function, confusing the sorting. + */ + if (!sym->len) + rb_erase(&sym->node, &sym->sec->symbol_tree); +} + static int read_symbols(struct elf *elf) { struct section *symtab, *symtab_shndx, *sec; struct symbol *sym, *pfunc; - struct list_head *entry; - struct rb_node *pnode; int symbols_nr, i; char *coldstr; Elf_Data *shndx_data = NULL; @@ -340,9 +367,6 @@ static int read_symbols(struct elf *elf) goto err; } - sym->type = GELF_ST_TYPE(sym->sym.st_info); - sym->bind = GELF_ST_BIND(sym->sym.st_info); - if ((sym->sym.st_shndx > SHN_UNDEF && sym->sym.st_shndx < SHN_LORESERVE) || (shndx_data && sym->sym.st_shndx == SHN_XINDEX)) { @@ -355,32 +379,14 @@ static int read_symbols(struct elf *elf) sym->name); goto err; } - if (sym->type == STT_SECTION) { + if (GELF_ST_TYPE(sym->sym.st_info) == STT_SECTION) { sym->name = sym->sec->name; sym->sec->sym = sym; } } else sym->sec = find_section_by_index(elf, 0); - sym->offset = sym->sym.st_value; - sym->len = sym->sym.st_size; - - rb_add(&sym->node, &sym->sec->symbol_tree, symbol_to_offset); - pnode = rb_prev(&sym->node); - if (pnode) - entry = &rb_entry(pnode, struct symbol, node)->list; - else - entry = &sym->sec->symbol_list; - list_add(&sym->list, entry); - elf_hash_add(elf->symbol_hash, &sym->hash, sym->idx); - elf_hash_add(elf->symbol_name_hash, &sym->name_hash, str_hash(sym->name)); - - /* - * Don't store empty STT_NOTYPE symbols in the rbtree. They - * can exist within a function, confusing the sorting. - */ - if (!sym->len) - rb_erase(&sym->node, &sym->sec->symbol_tree); + elf_add_symbol(elf, sym); } if (stats) -- cgit v1.2.3 From 2f2f7e47f0525cbaad5dd9675fd9d8aa8da12046 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:11 +0100 Subject: objtool: Add elf_create_undef_symbol() Allow objtool to create undefined symbols; this allows creating relocations to symbols not currently in the symbol table. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151300.064743095@infradead.org --- tools/objtool/elf.c | 60 +++++++++++++++++++++++++++++++++++++ tools/objtool/include/objtool/elf.h | 1 + 2 files changed, 61 insertions(+) (limited to 'tools') diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 8457218dcf22..d08f5f3670f8 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -715,6 +715,66 @@ static int elf_add_string(struct elf *elf, struct section *strtab, char *str) return len; } +struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name) +{ + struct section *symtab; + struct symbol *sym; + Elf_Data *data; + Elf_Scn *s; + + sym = malloc(sizeof(*sym)); + if (!sym) { + perror("malloc"); + return NULL; + } + memset(sym, 0, sizeof(*sym)); + + sym->name = strdup(name); + + sym->sym.st_name = elf_add_string(elf, NULL, sym->name); + if (sym->sym.st_name == -1) + return NULL; + + sym->sym.st_info = GELF_ST_INFO(STB_GLOBAL, STT_NOTYPE); + // st_other 0 + // st_shndx 0 + // st_value 0 + // st_size 0 + + symtab = find_section_by_name(elf, ".symtab"); + if (!symtab) { + WARN("can't find .symtab"); + return NULL; + } + + s = elf_getscn(elf->elf, symtab->idx); + if (!s) { + WARN_ELF("elf_getscn"); + return NULL; + } + + data = elf_newdata(s); + if (!data) { + WARN_ELF("elf_newdata"); + return NULL; + } + + data->d_buf = &sym->sym; + data->d_size = sizeof(sym->sym); + data->d_align = 1; + + sym->idx = symtab->len / sizeof(sym->sym); + + symtab->len += data->d_size; + symtab->changed = true; + + sym->sec = find_section_by_index(elf, 0); + + elf_add_symbol(elf, sym); + + return sym; +} + struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr) { diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 463f329f1c66..45e5ede363b0 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -133,6 +133,7 @@ int elf_write_insn(struct elf *elf, struct section *sec, unsigned long offset, unsigned int len, const char *insn); int elf_write_reloc(struct elf *elf, struct reloc *reloc); +struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name); int elf_write(struct elf *elf); void elf_close(struct elf *elf); -- cgit v1.2.3 From 43d5430ad74ef5156353af7aec352426ec7a8e57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:12 +0100 Subject: objtool: Keep track of retpoline call sites Provide infrastructure for architectures to rewrite/augment compiler generated retpoline calls. Similar to what we do for static_call()s, keep track of the instructions that are retpoline calls. Use the same list_head, since a retpoline call cannot also be a static_call. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151300.130805730@infradead.org --- tools/objtool/check.c | 34 ++++++++++++++++++++++++++++----- tools/objtool/include/objtool/arch.h | 2 ++ tools/objtool/include/objtool/check.h | 2 +- tools/objtool/include/objtool/objtool.h | 1 + tools/objtool/objtool.c | 1 + 5 files changed, 34 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 600fa67153d3..77074db1fcda 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -451,7 +451,7 @@ static int create_static_call_sections(struct objtool_file *file) return 0; idx = 0; - list_for_each_entry(insn, &file->static_call_list, static_call_node) + list_for_each_entry(insn, &file->static_call_list, call_node) idx++; sec = elf_create_section(file->elf, ".static_call_sites", SHF_WRITE, @@ -460,7 +460,7 @@ static int create_static_call_sections(struct objtool_file *file) return -1; idx = 0; - list_for_each_entry(insn, &file->static_call_list, static_call_node) { + list_for_each_entry(insn, &file->static_call_list, call_node) { site = (struct static_call_site *)sec->data->d_buf + idx; memset(site, 0, sizeof(struct static_call_site)); @@ -829,13 +829,16 @@ static int add_jump_destinations(struct objtool_file *file) else insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL; + list_add_tail(&insn->call_node, + &file->retpoline_call_list); + insn->retpoline_safe = true; continue; } else if (insn->func) { /* internal or external sibling call (with reloc) */ insn->call_dest = reloc->sym; if (insn->call_dest->static_call_tramp) { - list_add_tail(&insn->static_call_node, + list_add_tail(&insn->call_node, &file->static_call_list); } continue; @@ -897,7 +900,7 @@ static int add_jump_destinations(struct objtool_file *file) /* internal sibling call (without reloc) */ insn->call_dest = insn->jump_dest->func; if (insn->call_dest->static_call_tramp) { - list_add_tail(&insn->static_call_node, + list_add_tail(&insn->call_node, &file->static_call_list); } } @@ -981,6 +984,9 @@ static int add_call_destinations(struct objtool_file *file) insn->type = INSN_CALL_DYNAMIC; insn->retpoline_safe = true; + list_add_tail(&insn->call_node, + &file->retpoline_call_list); + remove_insn_ops(insn); continue; @@ -988,7 +994,7 @@ static int add_call_destinations(struct objtool_file *file) insn->call_dest = reloc->sym; if (insn->call_dest && insn->call_dest->static_call_tramp) { - list_add_tail(&insn->static_call_node, + list_add_tail(&insn->call_node, &file->static_call_list); } @@ -1714,6 +1720,11 @@ static void mark_rodata(struct objtool_file *file) file->rodata = found; } +__weak int arch_rewrite_retpolines(struct objtool_file *file) +{ + return 0; +} + static int decode_sections(struct objtool_file *file) { int ret; @@ -1742,6 +1753,10 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + /* + * Must be before add_special_section_alts() as that depends on + * jump_dest being set. + */ ret = add_jump_destinations(file); if (ret) return ret; @@ -1778,6 +1793,15 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + /* + * Must be after add_special_section_alts(), since this will emit + * alternatives. Must be after add_{jump,call}_destination(), since + * those create the call insn lists. + */ + ret = arch_rewrite_retpolines(file); + if (ret) + return ret; + return 0; } diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index bb30993a1ced..48b540a9d9e9 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -88,4 +88,6 @@ int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg); bool arch_is_retpoline(struct symbol *sym); +int arch_rewrite_retpolines(struct objtool_file *file); + #endif /* _ARCH_H */ diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index f5be798107bc..e5528ce3961a 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -39,7 +39,7 @@ struct alt_group { struct instruction { struct list_head list; struct hlist_node hash; - struct list_head static_call_node; + struct list_head call_node; struct list_head mcount_loc_node; struct section *sec; unsigned long offset; diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index e68e37476c15..e4084afb2304 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -18,6 +18,7 @@ struct objtool_file { struct elf *elf; struct list_head insn_list; DECLARE_HASHTABLE(insn_hash, 20); + struct list_head retpoline_call_list; struct list_head static_call_list; struct list_head mcount_loc_list; bool ignore_unreachables, c_file, hints, rodata; diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 7b97ce499405..3a3ea1b4e4da 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -61,6 +61,7 @@ struct objtool_file *objtool_open_read(const char *_objname) INIT_LIST_HEAD(&file.insn_list); hash_init(file.insn_hash); + INIT_LIST_HEAD(&file.retpoline_call_list); INIT_LIST_HEAD(&file.static_call_list); INIT_LIST_HEAD(&file.mcount_loc_list); file.c_file = !vmlinux && find_section_by_name(file.elf, ".comment"); -- cgit v1.2.3 From 7bd2a600f3e9d27286bbf23c83d599e9cc7cf245 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:13 +0100 Subject: objtool: Cache instruction relocs Track the reloc of instructions in the new instruction->reloc field to avoid having to look them up again later. ( Technically x86 instructions can have two relocations, but not jumps and calls, for which we're using this. ) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151300.195441549@infradead.org --- tools/objtool/check.c | 28 ++++++++++++++++++++++------ tools/objtool/include/objtool/check.h | 1 + 2 files changed, 23 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 77074db1fcda..1f4154f9b04b 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -797,6 +797,25 @@ __weak bool arch_is_retpoline(struct symbol *sym) return false; } +#define NEGATIVE_RELOC ((void *)-1L) + +static struct reloc *insn_reloc(struct objtool_file *file, struct instruction *insn) +{ + if (insn->reloc == NEGATIVE_RELOC) + return NULL; + + if (!insn->reloc) { + insn->reloc = find_reloc_by_dest_range(file->elf, insn->sec, + insn->offset, insn->len); + if (!insn->reloc) { + insn->reloc = NEGATIVE_RELOC; + return NULL; + } + } + + return insn->reloc; +} + /* * Find the destination instructions for all jumps. */ @@ -811,8 +830,7 @@ static int add_jump_destinations(struct objtool_file *file) if (!is_static_jump(insn)) continue; - reloc = find_reloc_by_dest_range(file->elf, insn->sec, - insn->offset, insn->len); + reloc = insn_reloc(file, insn); if (!reloc) { dest_sec = insn->sec; dest_off = arch_jump_destination(insn); @@ -944,8 +962,7 @@ static int add_call_destinations(struct objtool_file *file) if (insn->type != INSN_CALL) continue; - reloc = find_reloc_by_dest_range(file->elf, insn->sec, - insn->offset, insn->len); + reloc = insn_reloc(file, insn); if (!reloc) { dest_off = arch_jump_destination(insn); insn->call_dest = find_call_destination(insn->sec, dest_off); @@ -1144,8 +1161,7 @@ static int handle_group_alt(struct objtool_file *file, * alternatives code can adjust the relative offsets * accordingly. */ - alt_reloc = find_reloc_by_dest_range(file->elf, insn->sec, - insn->offset, insn->len); + alt_reloc = insn_reloc(file, insn); if (alt_reloc && !arch_support_alt_relocation(special_alt, insn, alt_reloc)) { diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index e5528ce3961a..56d50bc50c10 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -56,6 +56,7 @@ struct instruction { struct instruction *jump_dest; struct instruction *first_jump_src; struct reloc *jump_table; + struct reloc *reloc; struct list_head alts; struct symbol *func; struct list_head stack_ops; -- cgit v1.2.3 From 50e7b4a1a1b264fc7df0698f2defb93cadf19a7b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:14 +0100 Subject: objtool: Skip magical retpoline .altinstr_replacement When the .altinstr_replacement is a retpoline, skip the alternative. We already special case retpolines anyway. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151300.259429287@infradead.org --- tools/objtool/special.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/objtool/special.c b/tools/objtool/special.c index 2c7fbda7b055..07b21cfabf5c 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -106,6 +106,14 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, return -1; } + /* + * Skip retpoline .altinstr_replacement... we already rewrite the + * instructions for retpolines anyway, see arch_is_retpoline() + * usage in add_{call,jump}_destinations(). + */ + if (arch_is_retpoline(new_reloc->sym)) + return 1; + alt->new_sec = new_reloc->sym->sec; alt->new_off = (unsigned int)new_reloc->addend; @@ -154,7 +162,9 @@ int special_get_alts(struct elf *elf, struct list_head *alts) memset(alt, 0, sizeof(*alt)); ret = get_alt_entry(elf, entry, sec, idx, alt); - if (ret) + if (ret > 0) + continue; + if (ret < 0) return ret; list_add_tail(&alt->list, alts); -- cgit v1.2.3 From 9bc0bb50727c8ac69fbb33fb937431cf3518ff37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2021 16:12:15 +0100 Subject: objtool/x86: Rewrite retpoline thunk calls When the compiler emits: "CALL __x86_indirect_thunk_\reg" for an indirect call, have objtool rewrite it to: ALTERNATIVE "call __x86_indirect_thunk_\reg", "call *%reg", ALT_NOT(X86_FEATURE_RETPOLINE) Additionally, in order to not emit endless identical .altinst_replacement chunks, use a global symbol for them, see __x86_indirect_alt_*. This also avoids objtool from having to do code generation. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/20210326151300.320177914@infradead.org --- arch/x86/include/asm/asm-prototypes.h | 12 +++- arch/x86/lib/retpoline.S | 41 +++++++++++- tools/objtool/arch/x86/decode.c | 117 ++++++++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 0545b07895a5..4cb726c71ed8 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -19,11 +19,19 @@ extern void cmpxchg8b_emu(void); #ifdef CONFIG_RETPOLINE -#define DECL_INDIRECT_THUNK(reg) \ +#undef GEN +#define GEN(reg) \ extern asmlinkage void __x86_indirect_thunk_ ## reg (void); +#include + +#undef GEN +#define GEN(reg) \ + extern asmlinkage void __x86_indirect_alt_call_ ## reg (void); +#include #undef GEN -#define GEN(reg) DECL_INDIRECT_THUNK(reg) +#define GEN(reg) \ + extern asmlinkage void __x86_indirect_alt_jmp_ ## reg (void); #include #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index d2c0d14c1e22..4d32cb06ffd5 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -10,6 +10,8 @@ #include #include + .section .text.__x86.indirect_thunk + .macro RETPOLINE reg ANNOTATE_INTRA_FUNCTION_CALL call .Ldo_rop_\@ @@ -25,9 +27,9 @@ .endm .macro THUNK reg - .section .text.__x86.indirect_thunk .align 32 + SYM_FUNC_START(__x86_indirect_thunk_\reg) ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ @@ -38,6 +40,32 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg) .endm +/* + * This generates .altinstr_replacement symbols for use by objtool. They, + * however, must not actually live in .altinstr_replacement since that will be + * discarded after init, but module alternatives will also reference these + * symbols. + * + * Their names matches the "__x86_indirect_" prefix to mark them as retpolines. + */ +.macro ALT_THUNK reg + + .align 1 + +SYM_FUNC_START_NOALIGN(__x86_indirect_alt_call_\reg) + ANNOTATE_RETPOLINE_SAFE +1: call *%\reg +2: .skip 5-(2b-1b), 0x90 +SYM_FUNC_END(__x86_indirect_alt_call_\reg) + +SYM_FUNC_START_NOALIGN(__x86_indirect_alt_jmp_\reg) + ANNOTATE_RETPOLINE_SAFE +1: jmp *%\reg +2: .skip 5-(2b-1b), 0x90 +SYM_FUNC_END(__x86_indirect_alt_jmp_\reg) + +.endm + /* * Despite being an assembler file we can't just use .irp here * because __KSYM_DEPS__ only uses the C preprocessor and would @@ -61,3 +89,14 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg) #define GEN(reg) EXPORT_THUNK(reg) #include +#undef GEN +#define GEN(reg) ALT_THUNK reg +#include + +#undef GEN +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_call_ ## reg) +#include + +#undef GEN +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_jmp_ ## reg) +#include diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 782894eb351a..7e8b5bedd946 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -19,6 +19,7 @@ #include #include #include +#include static unsigned char op_to_cfi_reg[][2] = { {CFI_AX, CFI_R8}, @@ -613,6 +614,122 @@ const char *arch_nop_insn(int len) return nops[len-1]; } +/* asm/alternative.h ? */ + +#define ALTINSTR_FLAG_INV (1 << 15) +#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV) + +struct alt_instr { + s32 instr_offset; /* original instruction */ + s32 repl_offset; /* offset to replacement instruction */ + u16 cpuid; /* cpuid bit set for replacement */ + u8 instrlen; /* length of original instruction */ + u8 replacementlen; /* length of new instruction */ +} __packed; + +static int elf_add_alternative(struct elf *elf, + struct instruction *orig, struct symbol *sym, + int cpuid, u8 orig_len, u8 repl_len) +{ + const int size = sizeof(struct alt_instr); + struct alt_instr *alt; + struct section *sec; + Elf_Scn *s; + + sec = find_section_by_name(elf, ".altinstructions"); + if (!sec) { + sec = elf_create_section(elf, ".altinstructions", + SHF_WRITE, size, 0); + + if (!sec) { + WARN_ELF("elf_create_section"); + return -1; + } + } + + s = elf_getscn(elf->elf, sec->idx); + if (!s) { + WARN_ELF("elf_getscn"); + return -1; + } + + sec->data = elf_newdata(s); + if (!sec->data) { + WARN_ELF("elf_newdata"); + return -1; + } + + sec->data->d_size = size; + sec->data->d_align = 1; + + alt = sec->data->d_buf = malloc(size); + if (!sec->data->d_buf) { + perror("malloc"); + return -1; + } + memset(sec->data->d_buf, 0, size); + + if (elf_add_reloc_to_insn(elf, sec, sec->sh.sh_size, + R_X86_64_PC32, orig->sec, orig->offset)) { + WARN("elf_create_reloc: alt_instr::instr_offset"); + return -1; + } + + if (elf_add_reloc(elf, sec, sec->sh.sh_size + 4, + R_X86_64_PC32, sym, 0)) { + WARN("elf_create_reloc: alt_instr::repl_offset"); + return -1; + } + + alt->cpuid = cpuid; + alt->instrlen = orig_len; + alt->replacementlen = repl_len; + + sec->sh.sh_size += size; + sec->changed = true; + + return 0; +} + +#define X86_FEATURE_RETPOLINE ( 7*32+12) + +int arch_rewrite_retpolines(struct objtool_file *file) +{ + struct instruction *insn; + struct reloc *reloc; + struct symbol *sym; + char name[32] = ""; + + list_for_each_entry(insn, &file->retpoline_call_list, call_node) { + + if (!strcmp(insn->sec->name, ".text.__x86.indirect_thunk")) + continue; + + reloc = insn->reloc; + + sprintf(name, "__x86_indirect_alt_%s_%s", + insn->type == INSN_JUMP_DYNAMIC ? "jmp" : "call", + reloc->sym->name + 21); + + sym = find_symbol_by_name(file->elf, name); + if (!sym) { + sym = elf_create_undef_symbol(file->elf, name); + if (!sym) { + WARN("elf_create_undef_symbol"); + return -1; + } + } + + if (elf_add_alternative(file->elf, insn, sym, + ALT_NOT(X86_FEATURE_RETPOLINE), 5, 5)) { + WARN("elf_add_alternative"); + return -1; + } + } + + return 0; +} + int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg) { struct cfi_reg *cfa = &insn->cfi.cfa; -- cgit v1.2.3 From fd6103cb67966ed783b3800110bdbd66edae26a4 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 1 Apr 2021 14:23:54 +0800 Subject: perf evsel: Remove duplicate 'struct target' forward declaration 'struct target' is declared twice. One has been declared at 21st line. Remove the duplicate. Signed-off-by: Wan Jiabing Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Cc: kael_w@yeah.net Link: http://lore.kernel.org/lkml/20210401062424.991737-1-wanjiabing@vivo.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.h | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index dd4f56f9cfdf..eccc4fd5b3eb 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -175,7 +175,6 @@ struct perf_missing_features { extern struct perf_missing_features perf_missing_features; struct perf_cpu_map; -struct target; struct thread_map; struct record_opts; -- cgit v1.2.3 From 42e4eefb089f12ea900062ecdcc7ca10c3423a05 Mon Sep 17 00:00:00 2001 From: Hao Fang Date: Tue, 30 Mar 2021 14:33:48 +0800 Subject: dma-mapping: benchmark: use the correct HiSilicon copyright s/Hisilicon/HiSilicon/g. It should use capital S, according to https://www.hisilicon.com/en/terms-of-use. Signed-off-by: Hao Fang Acked-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 2 +- tools/testing/selftests/dma/dma_map_benchmark.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index e0e64f8b0739..00d6549a5495 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020 Hisilicon Limited. + * Copyright (C) 2020 HiSilicon Limited. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c index fb23ce9617ea..b492bed0936d 100644 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ b/tools/testing/selftests/dma/dma_map_benchmark.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020 Hisilicon Limited. + * Copyright (C) 2020 HiSilicon Limited. */ #include -- cgit v1.2.3 From ca947482b0b30443e6da1f0f5ba7244e34a4f65a Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Thu, 18 Mar 2021 17:29:30 +0800 Subject: dma-mapping: benchmark: Add support for multi-pages map/unmap Currently it only support one page map/unmap once a time for dma-map benchmark, but there are some other scenaries which need to support for multi-page map/unmap: for those multi-pages interfaces such as dma_alloc_coherent() and dma_map_sg(), the time spent on multi-pages map/unmap is not the time of a single page * npages (not linear) as it may use block description instead of page description when it is satified with the size such as 2M/1G, and also it can send a single TLB invalidation command to invalidate multi-pages instead of multi-times when RIL is enabled (which will short the time of unmap). So it is necessary to add support for multi-pages map/unmap. Add a parameter "-g" to support multi-pages map/unmap. Signed-off-by: Xiang Chen Acked-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 21 ++++++++++++++------- tools/testing/selftests/dma/dma_map_benchmark.c | 20 ++++++++++++++++---- 2 files changed, 30 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 00d6549a5495..9b9af1bd6be3 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -38,7 +38,8 @@ struct map_benchmark { __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u8 expansion[80]; /* For future use */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ }; struct map_benchmark_data { @@ -58,9 +59,11 @@ static int map_benchmark_thread(void *data) void *buf; dma_addr_t dma_addr; struct map_benchmark_data *map = data; + int npages = map->bparam.granule; + u64 size = npages * PAGE_SIZE; int ret = 0; - buf = (void *)__get_free_page(GFP_KERNEL); + buf = alloc_pages_exact(size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -76,10 +79,10 @@ static int map_benchmark_thread(void *data) * 66 means evertything goes well! 66 is lucky. */ if (map->dir != DMA_FROM_DEVICE) - memset(buf, 0x66, PAGE_SIZE); + memset(buf, 0x66, size); map_stime = ktime_get(); - dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir); + dma_addr = dma_map_single(map->dev, buf, size, map->dir); if (unlikely(dma_mapping_error(map->dev, dma_addr))) { pr_err("dma_map_single failed on %s\n", dev_name(map->dev)); @@ -93,7 +96,7 @@ static int map_benchmark_thread(void *data) ndelay(map->bparam.dma_trans_ns); unmap_stime = ktime_get(); - dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir); + dma_unmap_single(map->dev, dma_addr, size, map->dir); unmap_etime = ktime_get(); unmap_delta = ktime_sub(unmap_etime, unmap_stime); @@ -112,7 +115,7 @@ static int map_benchmark_thread(void *data) } out: - free_page((unsigned long)buf); + free_pages_exact(buf, size); return ret; } @@ -203,7 +206,6 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, struct map_benchmark_data *map = file->private_data; void __user *argp = (void __user *)arg; u64 old_dma_mask; - int ret; if (copy_from_user(&map->bparam, argp, sizeof(map->bparam))) @@ -234,6 +236,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, return -EINVAL; } + if (map->bparam.granule < 1 || map->bparam.granule > 1024) { + pr_err("invalid granule size\n"); + return -EINVAL; + } + switch (map->bparam.dma_dir) { case DMA_MAP_BIDIRECTIONAL: map->dir = DMA_BIDIRECTIONAL; diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c index b492bed0936d..485dff51bad2 100644 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ b/tools/testing/selftests/dma/dma_map_benchmark.c @@ -40,7 +40,8 @@ struct map_benchmark { __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u8 expansion[80]; /* For future use */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ }; int main(int argc, char **argv) @@ -51,11 +52,13 @@ int main(int argc, char **argv) int threads = 1, seconds = 20, node = -1; /* default dma mask 32bit, bidirectional DMA */ int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL; + /* default granule 1 PAGESIZE */ + int granule = 1; int cmd = DMA_MAP_BENCHMARK; char *p; - while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) { + while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) { switch (opt) { case 't': threads = atoi(optarg); @@ -75,6 +78,9 @@ int main(int argc, char **argv) case 'x': xdelay = atoi(optarg); break; + case 'g': + granule = atoi(optarg); + break; default: return -1; } @@ -110,6 +116,11 @@ int main(int argc, char **argv) exit(1); } + if (granule < 1 || granule > 1024) { + fprintf(stderr, "invalid granule size\n"); + exit(1); + } + fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR); if (fd == -1) { perror("open"); @@ -123,14 +134,15 @@ int main(int argc, char **argv) map.dma_bits = bits; map.dma_dir = dir; map.dma_trans_ns = xdelay; + map.granule = granule; if (ioctl(fd, cmd, &map)) { perror("ioctl"); exit(1); } - printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s\n", - threads, seconds, node, dir[directions]); + printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n", + threads, seconds, node, dir[directions], granule); printf("average map latency(us):%.1f standard deviation:%.1f\n", map.avg_map_100ns/10.0, map.map_stddev/10.0); printf("average unmap latency(us):%.1f standard deviation:%.1f\n", -- cgit v1.2.3 From a9d26a302dea29eb84f491b1340a57e56c631a71 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:35 +0000 Subject: selftests/resctrl: Enable gcc checks to detect buffer overflows David reported a buffer overflow error in the check_results() function of the cmt unit test and he suggested enabling _FORTIFY_SOURCE gcc compiler option to automatically detect any such errors. Feature Test Macros man page describes_FORTIFY_SOURCE as below "Defining this macro causes some lightweight checks to be performed to detect some buffer overflow errors when employing various string and memory manipulation functions (for example, memcpy, memset, stpcpy, strcpy, strncpy, strcat, strncat, sprintf, snprintf, vsprintf, vsnprintf, gets, and wide character variants thereof). For some functions, argument consistency is checked; for example, a check is made that open has been supplied with a mode argument when the specified flags include O_CREAT. Not all problems are detected, just some common cases. If _FORTIFY_SOURCE is set to 1, with compiler optimization level 1 (gcc -O1) and above, checks that shouldn't change the behavior of conforming programs are performed. With _FORTIFY_SOURCE set to 2, some more checking is added, but some conforming programs might fail. Some of the checks can be performed at compile time (via macros logic implemented in header files), and result in compiler warnings; other checks take place at run time, and result in a run-time error if the check fails. Use of this macro requires compiler support, available with gcc since version 4.0." Fix the buffer overflow error in the check_results() function of the cmt unit test and enable _FORTIFY_SOURCE gcc check to catch any future buffer overflow errors. Reported-by: David Binderman Suggested-by: David Binderman Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/Makefile | 2 +- tools/testing/selftests/resctrl/cqm_test.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile index d585cc1948cc..6bcee2ec91a9 100644 --- a/tools/testing/selftests/resctrl/Makefile +++ b/tools/testing/selftests/resctrl/Makefile @@ -1,5 +1,5 @@ CC = $(CROSS_COMPILE)gcc -CFLAGS = -g -Wall +CFLAGS = -g -Wall -O2 -D_FORTIFY_SOURCE=2 SRCS=$(wildcard *.c) OBJS=$(SRCS:.c=.o) diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c index c8756152bd61..5e7308ac63be 100644 --- a/tools/testing/selftests/resctrl/cqm_test.c +++ b/tools/testing/selftests/resctrl/cqm_test.c @@ -86,7 +86,7 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) return errno; } - while (fgets(temp, 1024, fp)) { + while (fgets(temp, sizeof(temp), fp)) { char *token = strtok(temp, ":\t"); int fields = 0; -- cgit v1.2.3 From 8236c51d85a64643588505a6791e022cc8d84864 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:36 +0000 Subject: selftests/resctrl: Fix compilation issues for global variables Reinette reported following compilation issue on Fedora 32, gcc version 10.1.1 /usr/bin/ld: cqm_test.o:/cqm_test.c:22: multiple definition of `cache_size'; cat_test.o:/cat_test.c:23: first defined here The same issue is reported for long_mask, cbm_mask, count_of_bits etc variables as well. Compiler isn't happy because these variables are defined globally in two .c files namely cqm_test.c and cat_test.c and the compiler during compilation finds that the variable is already defined (multiple definition error). Taking a closer look at the usage of these variables reveals that these variables are used only locally in functions such as cqm_resctrl_val() (defined in cqm_test.c) and cat_perf_miss_val() (defined in cat_test.c). These variables are not shared between those functions. So, there is no need for these variables to be global. Hence, fix this issue by making them static variables. Reported-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cat_test.c | 10 +++++----- tools/testing/selftests/resctrl/cqm_test.c | 10 +++++----- tools/testing/selftests/resctrl/resctrl.h | 2 +- tools/testing/selftests/resctrl/resctrlfs.c | 10 +++++----- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 5da43767b973..bdeeb5772592 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -17,10 +17,10 @@ #define MAX_DIFF_PERCENT 4 #define MAX_DIFF 1000000 -int count_of_bits; -char cbm_mask[256]; -unsigned long long_mask; -unsigned long cache_size; +static int count_of_bits; +static char cbm_mask[256]; +static unsigned long long_mask; +static unsigned long cache_size; /* * Change schemata. Write schemata to specified @@ -136,7 +136,7 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) return -1; /* Get default cbm mask for L3/L2 cache */ - ret = get_cbm_mask(cache_type); + ret = get_cbm_mask(cache_type, cbm_mask); if (ret) return ret; diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c index 5e7308ac63be..de33d1c0466e 100644 --- a/tools/testing/selftests/resctrl/cqm_test.c +++ b/tools/testing/selftests/resctrl/cqm_test.c @@ -16,10 +16,10 @@ #define MAX_DIFF 2000000 #define MAX_DIFF_PERCENT 15 -int count_of_bits; -char cbm_mask[256]; -unsigned long long_mask; -unsigned long cache_size; +static int count_of_bits; +static char cbm_mask[256]; +static unsigned long long_mask; +static unsigned long cache_size; static int cqm_setup(int num, ...) { @@ -125,7 +125,7 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) if (!validate_resctrl_feature_request("cqm")) return -1; - ret = get_cbm_mask("L3"); + ret = get_cbm_mask("L3", cbm_mask); if (ret) return ret; diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 39bf59c6b9c5..959c71e39bdc 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -92,7 +92,7 @@ void tests_cleanup(void); void mbm_test_cleanup(void); int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd); void mba_test_cleanup(void); -int get_cbm_mask(char *cache_type); +int get_cbm_mask(char *cache_type, char *cbm_mask); int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size); void ctrlc_handler(int signum, siginfo_t *info, void *ptr); int cat_val(struct resctrl_val_param *param); diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 19c0ec4045a4..2a16100c9c3f 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -49,8 +49,6 @@ static int find_resctrl_mount(char *buffer) return -ENOENT; } -char cbm_mask[256]; - /* * remount_resctrlfs - Remount resctrl FS at /sys/fs/resctrl * @mum_resctrlfs: Should the resctrl FS be remounted? @@ -205,16 +203,18 @@ int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size) /* * get_cbm_mask - Get cbm mask for given cache * @cache_type: Cache level L2/L3 - * - * Mask is stored in cbm_mask which is global variable. + * @cbm_mask: cbm_mask returned as a string * * Return: = 0 on success, < 0 on failure. */ -int get_cbm_mask(char *cache_type) +int get_cbm_mask(char *cache_type, char *cbm_mask) { char cbm_mask_path[1024]; FILE *fp; + if (!cbm_mask) + return -1; + sprintf(cbm_mask_path, "%s/%s/cbm_mask", CBM_MASK_PATH, cache_type); fp = fopen(cbm_mask_path, "r"); -- cgit v1.2.3 From 896016d2ad051811ff9c9c087393adc063322fbc Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:37 +0000 Subject: selftests/resctrl: Fix compilation issues for other global variables Reinette reported following compilation issue on Fedora 32, gcc version 10.1.1 /usr/bin/ld: resctrl_tests.o:/resctrl.h:65: multiple definition of `bm_pid'; cache.o:/resctrl.h:65: first defined here Other variables are ppid, tests_run, llc_occup_path, is_amd. Compiler isn't happy because these variables are defined globally in two .c files but are not declared as extern. To fix issues for the global variables, declare them as extern. Chang Log: - Split this patch from v4's patch 1 (Shuah). Reported-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 959c71e39bdc..12b77182cb44 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -62,11 +62,11 @@ struct resctrl_val_param { int (*setup)(int num, ...); }; -pid_t bm_pid, ppid; -int tests_run; +extern pid_t bm_pid, ppid; +extern int tests_run; -char llc_occup_path[1024]; -bool is_amd; +extern char llc_occup_path[1024]; +extern bool is_amd; bool check_resctrlfs_support(void); int filter_dmesg(void); -- cgit v1.2.3 From 2428673638ea28fa93d2a38b1c3e8d70122b00ee Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:38 +0000 Subject: selftests/resctrl: Clean up resctrl features check Checking resctrl features call strcmp() to compare feature strings (e.g. "mba", "cat" etc). The checkings are error prone and don't have good coding style. Define the constant strings in macros and call strncmp() to solve the potential issues. Suggested-by: Shuah Khan Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cache.c | 8 ++++---- tools/testing/selftests/resctrl/cat_test.c | 2 +- tools/testing/selftests/resctrl/cqm_test.c | 2 +- tools/testing/selftests/resctrl/fill_buf.c | 4 ++-- tools/testing/selftests/resctrl/mba_test.c | 2 +- tools/testing/selftests/resctrl/mbm_test.c | 2 +- tools/testing/selftests/resctrl/resctrl.h | 5 +++++ tools/testing/selftests/resctrl/resctrl_tests.c | 12 ++++++------ tools/testing/selftests/resctrl/resctrl_val.c | 22 +++++++++++----------- tools/testing/selftests/resctrl/resctrlfs.c | 17 +++++++++-------- 10 files changed, 41 insertions(+), 35 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index 38dbf4962e33..5922cc1b0386 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -182,7 +182,7 @@ int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) /* * Measure cache miss from perf. */ - if (!strcmp(param->resctrl_val, "cat")) { + if (!strncmp(param->resctrl_val, CAT_STR, sizeof(CAT_STR))) { ret = get_llc_perf(&llc_perf_miss); if (ret < 0) return ret; @@ -192,7 +192,7 @@ int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) /* * Measure llc occupancy from resctrl. */ - if (!strcmp(param->resctrl_val, "cqm")) { + if (!strncmp(param->resctrl_val, CQM_STR, sizeof(CQM_STR))) { ret = get_llc_occu_resctrl(&llc_occu_resc); if (ret < 0) return ret; @@ -234,7 +234,7 @@ int cat_val(struct resctrl_val_param *param) if (ret) return ret; - if ((strcmp(resctrl_val, "cat") == 0)) { + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { ret = initialize_llc_perf(); if (ret) return ret; @@ -242,7 +242,7 @@ int cat_val(struct resctrl_val_param *param) /* Test runs until the callback setup() tells the test to stop. */ while (1) { - if (strcmp(resctrl_val, "cat") == 0) { + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { ret = param->setup(1, param); if (ret) { ret = 0; diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index bdeeb5772592..20823725daca 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -164,7 +164,7 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) return -1; struct resctrl_val_param param = { - .resctrl_val = "cat", + .resctrl_val = CAT_STR, .cpu_no = cpu_no, .mum_resctrlfs = 0, .setup = cat_setup, diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c index de33d1c0466e..271752e9ef5b 100644 --- a/tools/testing/selftests/resctrl/cqm_test.c +++ b/tools/testing/selftests/resctrl/cqm_test.c @@ -145,7 +145,7 @@ int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) } struct resctrl_val_param param = { - .resctrl_val = "cqm", + .resctrl_val = CQM_STR, .ctrlgrp = "c1", .mongrp = "m1", .cpu_no = cpu_no, diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c index 79c611c99a3d..51e5cf22632f 100644 --- a/tools/testing/selftests/resctrl/fill_buf.c +++ b/tools/testing/selftests/resctrl/fill_buf.c @@ -115,7 +115,7 @@ static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr, while (1) { ret = fill_one_span_read(start_ptr, end_ptr); - if (!strcmp(resctrl_val, "cat")) + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) break; } @@ -134,7 +134,7 @@ static int fill_cache_write(unsigned char *start_ptr, unsigned char *end_ptr, { while (1) { fill_one_span_write(start_ptr, end_ptr); - if (!strcmp(resctrl_val, "cat")) + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) break; } diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 7bf8eaa6204b..6449fbd96096 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -141,7 +141,7 @@ void mba_test_cleanup(void) int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) { struct resctrl_val_param param = { - .resctrl_val = "mba", + .resctrl_val = MBA_STR, .ctrlgrp = "c1", .mongrp = "m1", .cpu_no = cpu_no, diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 4700f7453f81..ec6cfe01c9c2 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -114,7 +114,7 @@ void mbm_test_cleanup(void) int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) { struct resctrl_val_param param = { - .resctrl_val = "mbm", + .resctrl_val = MBM_STR, .ctrlgrp = "c1", .mongrp = "m1", .span = span, diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 12b77182cb44..36da6136af96 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -62,6 +62,11 @@ struct resctrl_val_param { int (*setup)(int num, ...); }; +#define MBM_STR "mbm" +#define MBA_STR "mba" +#define CQM_STR "cqm" +#define CAT_STR "cat" + extern pid_t bm_pid, ppid; extern int tests_run; diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 425cc85ac883..4b109a59f72d 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -85,13 +85,13 @@ int main(int argc, char **argv) cqm_test = false; cat_test = false; while (token) { - if (!strcmp(token, "mbm")) { + if (!strncmp(token, MBM_STR, sizeof(MBM_STR))) { mbm_test = true; - } else if (!strcmp(token, "mba")) { + } else if (!strncmp(token, MBA_STR, sizeof(MBA_STR))) { mba_test = true; - } else if (!strcmp(token, "cqm")) { + } else if (!strncmp(token, CQM_STR, sizeof(CQM_STR))) { cqm_test = true; - } else if (!strcmp(token, "cat")) { + } else if (!strncmp(token, CAT_STR, sizeof(CAT_STR))) { cat_test = true; } else { printf("invalid argument\n"); @@ -161,7 +161,7 @@ int main(int argc, char **argv) if (!is_amd && mbm_test) { printf("# Starting MBM BW change ...\n"); if (!has_ben) - sprintf(benchmark_cmd[5], "%s", "mba"); + sprintf(benchmark_cmd[5], "%s", MBA_STR); res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); printf("%sok MBM: bw change\n", res ? "not " : ""); mbm_test_cleanup(); @@ -181,7 +181,7 @@ int main(int argc, char **argv) if (cqm_test) { printf("# Starting CQM test ...\n"); if (!has_ben) - sprintf(benchmark_cmd[5], "%s", "cqm"); + sprintf(benchmark_cmd[5], "%s", CQM_STR); res = cqm_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); printf("%sok CQM: test\n", res ? "not " : ""); cqm_test_cleanup(); diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 520fea3606d1..aed71fd0713b 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -397,10 +397,10 @@ static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp, return; } - if (strcmp(resctrl_val, "mbm") == 0) + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) set_mbm_path(ctrlgrp, mongrp, resource_id); - if ((strcmp(resctrl_val, "mba") == 0)) { + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { if (ctrlgrp) sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, ctrlgrp, resource_id); @@ -524,7 +524,7 @@ static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, return; } - if (strcmp(resctrl_val, "cqm") == 0) + if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) set_cqm_path(ctrlgrp, mongrp, resource_id); } @@ -579,8 +579,8 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) if (strcmp(param->filename, "") == 0) sprintf(param->filename, "stdio"); - if ((strcmp(resctrl_val, "mba")) == 0 || - (strcmp(resctrl_val, "mbm")) == 0) { + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) || + !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { ret = validate_bw_report_request(param->bw_report); if (ret) return ret; @@ -674,15 +674,15 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) if (ret) goto out; - if ((strcmp(resctrl_val, "mbm") == 0) || - (strcmp(resctrl_val, "mba") == 0)) { + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { ret = initialize_mem_bw_imc(); if (ret) goto out; initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); - } else if (strcmp(resctrl_val, "cqm") == 0) + } else if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); @@ -710,8 +710,8 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) /* Test runs until the callback setup() tells the test to stop. */ while (1) { - if ((strcmp(resctrl_val, "mbm") == 0) || - (strcmp(resctrl_val, "mba") == 0)) { + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { ret = param->setup(1, param); if (ret) { ret = 0; @@ -721,7 +721,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) ret = measure_vals(param, &bw_resc_start); if (ret) break; - } else if (strcmp(resctrl_val, "cqm") == 0) { + } else if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) { ret = param->setup(1, param); if (ret) { ret = 0; diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 2a16100c9c3f..4174e48e06d1 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -334,7 +334,7 @@ void run_benchmark(int signum, siginfo_t *info, void *ucontext) operation = atoi(benchmark_cmd[4]); sprintf(resctrl_val, "%s", benchmark_cmd[5]); - if (strcmp(resctrl_val, "cqm") != 0) + if (strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) buffer_span = span * MB; else buffer_span = span; @@ -459,8 +459,8 @@ int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, goto out; /* Create mon grp and write pid into it for "mbm" and "cqm" test */ - if ((strcmp(resctrl_val, "cqm") == 0) || - (strcmp(resctrl_val, "mbm") == 0)) { + if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR)) || + !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { if (strlen(mongrp)) { sprintf(monitorgroup_p, "%s/mon_groups", controlgroup); sprintf(monitorgroup, "%s/%s", monitorgroup_p, mongrp); @@ -505,9 +505,9 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) int resource_id, ret = 0; FILE *fp; - if ((strcmp(resctrl_val, "mba") != 0) && - (strcmp(resctrl_val, "cat") != 0) && - (strcmp(resctrl_val, "cqm") != 0)) + if (strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) && + strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) && + strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) return -ENOENT; if (!schemata) { @@ -528,9 +528,10 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) else sprintf(controlgroup, "%s/schemata", RESCTRL_PATH); - if (!strcmp(resctrl_val, "cat") || !strcmp(resctrl_val, "cqm")) + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) || + !strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata); - if (strcmp(resctrl_val, "mba") == 0) + if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); fp = fopen(controlgroup, "w"); -- cgit v1.2.3 From f5f16ae4fae9d4d51aa365a0e1d84d368bef53ea Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 17 Mar 2021 02:22:39 +0000 Subject: selftests/resctrl: Ensure sibling CPU is not same as original CPU The resctrl tests can accept a CPU on which the tests are run and use default of CPU #1 if it is not provided. In the CAT test a "sibling CPU" is determined that is from the same package where another thread will be run. The current algorithm with which a "sibling CPU" is determined does not take the provided/default CPU into account and when that CPU is the first CPU in a package then the "sibling CPU" will be selected to be the same CPU since it starts by picking the first CPU from core_siblings_list. Fix the "sibling CPU" selection by taking the provided/default CPU into account and ensuring a sibling that is a different CPU is selected. Tested-by: Babu Moger Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrlfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 4174e48e06d1..bc52076bee7f 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -268,7 +268,7 @@ int get_core_sibling(int cpu_no) while (token) { sibling_cpu_no = atoi(token); /* Skipping core 0 as we don't want to run test on core 0 */ - if (sibling_cpu_no != 0) + if (sibling_cpu_no != 0 && sibling_cpu_no != cpu_no) break; token = strtok(NULL, "-,"); } -- cgit v1.2.3 From d7af3d0d515cbdf63b6c3398a3c15ecb1bc2bd38 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:40 +0000 Subject: selftests/resctrl: Fix missing options "-n" and "-p" resctrl test suite accepts command line arguments (like -b, -t, -n and -p) as documented in the help. But passing -n and -p throws an invalid option error. This happens because -n and -p are missing in the list of characters that getopt() recognizes as valid arguments. Hence, they are treated as invalid options. Fix this by adding them to the list of characters that getopt() recognizes as valid arguments. Please note that the main() function already has the logic to deal with the values passed as part of these arguments and hence no changes are needed there. Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl_tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 4b109a59f72d..ac2269610aa9 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -73,7 +73,7 @@ int main(int argc, char **argv) } } - while ((c = getopt(argc_new, argv, "ht:b:")) != -1) { + while ((c = getopt(argc_new, argv, "ht:b:n:p:")) != -1) { char *token; switch (c) { -- cgit v1.2.3 From 2f320911d9fab38597d2a32d91b4f31165e0c9b4 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:41 +0000 Subject: selftests/resctrl: Rename CQM test as CMT test CMT (Cache Monitoring Technology) [1] is a H/W feature that reports cache occupancy of a process. resctrl selftest suite has a unit test to test CMT for LLC but the test is named as CQM (Cache Quality Monitoring). Furthermore, the unit test source file is named as cqm_test.c and several functions, variables, comments, preprocessors and statements widely use "cqm" as either suffix or prefix. This rampant misusage of CQM for CMT might confuse someone who is newly looking at resctrl selftests because this feature is named CMT in the Intel Software Developer's Manual. Hence, rename all the occurrences (unit test source file name, functions, variables, comments and preprocessors) of cqm with cmt. [1] Please see Intel SDM, Volume 3, chapter 17 and section 18 for more information on CMT: https://software.intel.com/content/www/us/en/develop/articles/intel-sdm.html Suggested-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/README | 4 +- tools/testing/selftests/resctrl/cache.c | 4 +- tools/testing/selftests/resctrl/cmt_test.c | 176 ++++++++++++++++++++++++ tools/testing/selftests/resctrl/cqm_test.c | 176 ------------------------ tools/testing/selftests/resctrl/resctrl.h | 6 +- tools/testing/selftests/resctrl/resctrl_tests.c | 26 ++-- tools/testing/selftests/resctrl/resctrl_val.c | 12 +- tools/testing/selftests/resctrl/resctrlfs.c | 10 +- 8 files changed, 207 insertions(+), 207 deletions(-) create mode 100644 tools/testing/selftests/resctrl/cmt_test.c delete mode 100644 tools/testing/selftests/resctrl/cqm_test.c (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/README b/tools/testing/selftests/resctrl/README index 6e5a0ffa18e8..4b36b25b6ac0 100644 --- a/tools/testing/selftests/resctrl/README +++ b/tools/testing/selftests/resctrl/README @@ -46,8 +46,8 @@ ARGUMENTS Parameter '-h' shows usage information. usage: resctrl_tests [-h] [-b "benchmark_cmd [options]"] [-t test list] [-n no_of_bits] - -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM default benchmark is builtin fill_buf - -t test list: run tests specified in the test list, e.g. -t mbm, mba, cqm, cat + -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT default benchmark is builtin fill_buf + -t test list: run tests specified in the test list, e.g. -t mbm, mba, cmt, cat -n no_of_bits: run cache tests using specified no of bits in cache bit mask -p cpu_no: specify CPU number to run the test. 1 is default -h: help diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index 5922cc1b0386..2aa1b5c7d9e1 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -111,7 +111,7 @@ static int get_llc_perf(unsigned long *llc_perf_miss) /* * Get LLC Occupancy as reported by RESCTRL FS - * For CQM, + * For CMT, * 1. If con_mon grp and mon grp given, then read from mon grp in * con_mon grp * 2. If only con_mon grp given, then read from con_mon grp @@ -192,7 +192,7 @@ int measure_cache_vals(struct resctrl_val_param *param, int bm_pid) /* * Measure llc occupancy from resctrl. */ - if (!strncmp(param->resctrl_val, CQM_STR, sizeof(CQM_STR))) { + if (!strncmp(param->resctrl_val, CMT_STR, sizeof(CMT_STR))) { ret = get_llc_occu_resctrl(&llc_occu_resc); if (ret < 0) return ret; diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c new file mode 100644 index 000000000000..4b63838dda32 --- /dev/null +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cache Monitoring Technology (CMT) test + * + * Copyright (C) 2018 Intel Corporation + * + * Authors: + * Sai Praneeth Prakhya , + * Fenghua Yu + */ +#include "resctrl.h" +#include + +#define RESULT_FILE_NAME "result_cmt" +#define NUM_OF_RUNS 5 +#define MAX_DIFF 2000000 +#define MAX_DIFF_PERCENT 15 + +static int count_of_bits; +static char cbm_mask[256]; +static unsigned long long_mask; +static unsigned long cache_size; + +static int cmt_setup(int num, ...) +{ + struct resctrl_val_param *p; + va_list param; + + va_start(param, num); + p = va_arg(param, struct resctrl_val_param *); + va_end(param); + + /* Run NUM_OF_RUNS times */ + if (p->num_of_runs >= NUM_OF_RUNS) + return -1; + + p->num_of_runs++; + + return 0; +} + +static void show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, + unsigned long span) +{ + unsigned long avg_llc_occu_resc = 0; + float diff_percent; + long avg_diff = 0; + bool res; + + avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1); + avg_diff = (long)abs(span - avg_llc_occu_resc); + + diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100; + + if ((abs((int)diff_percent) <= MAX_DIFF_PERCENT) || + (abs(avg_diff) <= MAX_DIFF)) + res = true; + else + res = false; + + printf("%sok CMT: diff within %d, %d\%%\n", res ? "" : "not", + MAX_DIFF, (int)MAX_DIFF_PERCENT); + + printf("# diff: %ld\n", avg_diff); + printf("# percent diff=%d\n", abs((int)diff_percent)); + printf("# Results are displayed in (Bytes)\n"); + printf("# Number of bits: %d\n", no_of_bits); + printf("# Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); + printf("# llc_occu_exp (span): %lu\n", span); + + tests_run++; +} + +static int check_results(struct resctrl_val_param *param, int no_of_bits) +{ + char *token_array[8], temp[512]; + unsigned long sum_llc_occu_resc = 0; + int runs = 0; + FILE *fp; + + printf("# checking for pass/fail\n"); + fp = fopen(param->filename, "r"); + if (!fp) { + perror("# Error in opening file\n"); + + return errno; + } + + while (fgets(temp, sizeof(temp), fp)) { + char *token = strtok(temp, ":\t"); + int fields = 0; + + while (token) { + token_array[fields++] = token; + token = strtok(NULL, ":\t"); + } + + /* Field 3 is llc occ resc value */ + if (runs > 0) + sum_llc_occu_resc += strtoul(token_array[3], NULL, 0); + runs++; + } + fclose(fp); + show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); + + return 0; +} + +void cmt_test_cleanup(void) +{ + remove(RESULT_FILE_NAME); +} + +int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd) +{ + int ret, mum_resctrlfs; + + cache_size = 0; + mum_resctrlfs = 1; + + ret = remount_resctrlfs(mum_resctrlfs); + if (ret) + return ret; + + if (!validate_resctrl_feature_request(CMT_STR)) + return -1; + + ret = get_cbm_mask("L3", cbm_mask); + if (ret) + return ret; + + long_mask = strtoul(cbm_mask, NULL, 16); + + ret = get_cache_size(cpu_no, "L3", &cache_size); + if (ret) + return ret; + printf("cache size :%lu\n", cache_size); + + count_of_bits = count_bits(long_mask); + + if (n < 1 || n > count_of_bits) { + printf("Invalid input value for numbr_of_bits n!\n"); + printf("Please Enter value in range 1 to %d\n", count_of_bits); + return -1; + } + + struct resctrl_val_param param = { + .resctrl_val = CMT_STR, + .ctrlgrp = "c1", + .mongrp = "m1", + .cpu_no = cpu_no, + .mum_resctrlfs = 0, + .filename = RESULT_FILE_NAME, + .mask = ~(long_mask << n) & long_mask, + .span = cache_size * n / count_of_bits, + .num_of_runs = 0, + .setup = cmt_setup, + }; + + if (strcmp(benchmark_cmd[0], "fill_buf") == 0) + sprintf(benchmark_cmd[1], "%lu", param.span); + + remove(RESULT_FILE_NAME); + + ret = resctrl_val(benchmark_cmd, ¶m); + if (ret) + return ret; + + ret = check_results(¶m, n); + if (ret) + return ret; + + cmt_test_cleanup(); + + return 0; +} diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c deleted file mode 100644 index 271752e9ef5b..000000000000 --- a/tools/testing/selftests/resctrl/cqm_test.c +++ /dev/null @@ -1,176 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Cache Monitoring Technology (CQM) test - * - * Copyright (C) 2018 Intel Corporation - * - * Authors: - * Sai Praneeth Prakhya , - * Fenghua Yu - */ -#include "resctrl.h" -#include - -#define RESULT_FILE_NAME "result_cqm" -#define NUM_OF_RUNS 5 -#define MAX_DIFF 2000000 -#define MAX_DIFF_PERCENT 15 - -static int count_of_bits; -static char cbm_mask[256]; -static unsigned long long_mask; -static unsigned long cache_size; - -static int cqm_setup(int num, ...) -{ - struct resctrl_val_param *p; - va_list param; - - va_start(param, num); - p = va_arg(param, struct resctrl_val_param *); - va_end(param); - - /* Run NUM_OF_RUNS times */ - if (p->num_of_runs >= NUM_OF_RUNS) - return -1; - - p->num_of_runs++; - - return 0; -} - -static void show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, - unsigned long span) -{ - unsigned long avg_llc_occu_resc = 0; - float diff_percent; - long avg_diff = 0; - bool res; - - avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1); - avg_diff = (long)abs(span - avg_llc_occu_resc); - - diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100; - - if ((abs((int)diff_percent) <= MAX_DIFF_PERCENT) || - (abs(avg_diff) <= MAX_DIFF)) - res = true; - else - res = false; - - printf("%sok CQM: diff within %d, %d\%%\n", res ? "" : "not", - MAX_DIFF, (int)MAX_DIFF_PERCENT); - - printf("# diff: %ld\n", avg_diff); - printf("# percent diff=%d\n", abs((int)diff_percent)); - printf("# Results are displayed in (Bytes)\n"); - printf("# Number of bits: %d\n", no_of_bits); - printf("# Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); - printf("# llc_occu_exp (span): %lu\n", span); - - tests_run++; -} - -static int check_results(struct resctrl_val_param *param, int no_of_bits) -{ - char *token_array[8], temp[512]; - unsigned long sum_llc_occu_resc = 0; - int runs = 0; - FILE *fp; - - printf("# checking for pass/fail\n"); - fp = fopen(param->filename, "r"); - if (!fp) { - perror("# Error in opening file\n"); - - return errno; - } - - while (fgets(temp, sizeof(temp), fp)) { - char *token = strtok(temp, ":\t"); - int fields = 0; - - while (token) { - token_array[fields++] = token; - token = strtok(NULL, ":\t"); - } - - /* Field 3 is llc occ resc value */ - if (runs > 0) - sum_llc_occu_resc += strtoul(token_array[3], NULL, 0); - runs++; - } - fclose(fp); - show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); - - return 0; -} - -void cqm_test_cleanup(void) -{ - remove(RESULT_FILE_NAME); -} - -int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd) -{ - int ret, mum_resctrlfs; - - cache_size = 0; - mum_resctrlfs = 1; - - ret = remount_resctrlfs(mum_resctrlfs); - if (ret) - return ret; - - if (!validate_resctrl_feature_request("cqm")) - return -1; - - ret = get_cbm_mask("L3", cbm_mask); - if (ret) - return ret; - - long_mask = strtoul(cbm_mask, NULL, 16); - - ret = get_cache_size(cpu_no, "L3", &cache_size); - if (ret) - return ret; - printf("cache size :%lu\n", cache_size); - - count_of_bits = count_bits(long_mask); - - if (n < 1 || n > count_of_bits) { - printf("Invalid input value for numbr_of_bits n!\n"); - printf("Please Enter value in range 1 to %d\n", count_of_bits); - return -1; - } - - struct resctrl_val_param param = { - .resctrl_val = CQM_STR, - .ctrlgrp = "c1", - .mongrp = "m1", - .cpu_no = cpu_no, - .mum_resctrlfs = 0, - .filename = RESULT_FILE_NAME, - .mask = ~(long_mask << n) & long_mask, - .span = cache_size * n / count_of_bits, - .num_of_runs = 0, - .setup = cqm_setup, - }; - - if (strcmp(benchmark_cmd[0], "fill_buf") == 0) - sprintf(benchmark_cmd[1], "%lu", param.span); - - remove(RESULT_FILE_NAME); - - ret = resctrl_val(benchmark_cmd, ¶m); - if (ret) - return ret; - - ret = check_results(¶m, n); - if (ret) - return ret; - - cqm_test_cleanup(); - - return 0; -} diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 36da6136af96..1a58767a0bd2 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -64,7 +64,7 @@ struct resctrl_val_param { #define MBM_STR "mbm" #define MBA_STR "mba" -#define CQM_STR "cqm" +#define CMT_STR "cmt" #define CAT_STR "cat" extern pid_t bm_pid, ppid; @@ -103,9 +103,9 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr); int cat_val(struct resctrl_val_param *param); void cat_test_cleanup(void); int cat_perf_miss_val(int cpu_no, int no_of_bits, char *cache_type); -int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd); +int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd); unsigned int count_bits(unsigned long n); -void cqm_test_cleanup(void); +void cmt_test_cleanup(void); int get_core_sibling(int cpu_no); int measure_cache_vals(struct resctrl_val_param *param, int bm_pid); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index ac2269610aa9..6b2ed2efaad4 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -37,10 +37,10 @@ void detect_amd(void) static void cmd_help(void) { printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits]\n"); - printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM"); + printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT"); printf("\t default benchmark is builtin fill_buf\n"); printf("\t-t test list: run tests specified in the test list, "); - printf("e.g. -t mbm, mba, cqm, cat\n"); + printf("e.g. -t mbm, mba, cmt, cat\n"); printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); printf("\t-p cpu_no: specify CPU number to run the test. 1 is default\n"); printf("\t-h: help\n"); @@ -50,13 +50,13 @@ void tests_cleanup(void) { mbm_test_cleanup(); mba_test_cleanup(); - cqm_test_cleanup(); + cmt_test_cleanup(); cat_test_cleanup(); } int main(int argc, char **argv) { - bool has_ben = false, mbm_test = true, mba_test = true, cqm_test = true; + bool has_ben = false, mbm_test = true, mba_test = true, cmt_test = true; int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5; char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; @@ -82,15 +82,15 @@ int main(int argc, char **argv) mbm_test = false; mba_test = false; - cqm_test = false; + cmt_test = false; cat_test = false; while (token) { if (!strncmp(token, MBM_STR, sizeof(MBM_STR))) { mbm_test = true; } else if (!strncmp(token, MBA_STR, sizeof(MBA_STR))) { mba_test = true; - } else if (!strncmp(token, CQM_STR, sizeof(CQM_STR))) { - cqm_test = true; + } else if (!strncmp(token, CMT_STR, sizeof(CMT_STR))) { + cmt_test = true; } else if (!strncmp(token, CAT_STR, sizeof(CAT_STR))) { cat_test = true; } else { @@ -178,13 +178,13 @@ int main(int argc, char **argv) tests_run++; } - if (cqm_test) { - printf("# Starting CQM test ...\n"); + if (cmt_test) { + printf("# Starting CMT test ...\n"); if (!has_ben) - sprintf(benchmark_cmd[5], "%s", CQM_STR); - res = cqm_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); - printf("%sok CQM: test\n", res ? "not " : ""); - cqm_test_cleanup(); + sprintf(benchmark_cmd[5], "%s", CMT_STR); + res = cmt_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); + printf("%sok CMT: test\n", res ? "not " : ""); + cmt_test_cleanup(); tests_run++; } diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index aed71fd0713b..17770095c98e 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -492,7 +492,7 @@ static int print_results_bw(char *filename, int bm_pid, float bw_imc, return 0; } -static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num) +static void set_cmt_path(const char *ctrlgrp, const char *mongrp, char sock_num) { if (strlen(ctrlgrp) && strlen(mongrp)) sprintf(llc_occup_path, CON_MON_LCC_OCCUP_PATH, RESCTRL_PATH, @@ -512,7 +512,7 @@ static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num) * @ctrlgrp: Name of the control monitor group (con_mon grp) * @mongrp: Name of the monitor group (mon grp) * @cpu_no: CPU number that the benchmark PID is binded to - * @resctrl_val: Resctrl feature (Eg: cat, cqm.. etc) + * @resctrl_val: Resctrl feature (Eg: cat, cmt.. etc) */ static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, int cpu_no, char *resctrl_val) @@ -524,8 +524,8 @@ static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, return; } - if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) - set_cqm_path(ctrlgrp, mongrp, resource_id); + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) + set_cmt_path(ctrlgrp, mongrp, resource_id); } static int @@ -682,7 +682,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); - } else if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) + } else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp, param->cpu_no, resctrl_val); @@ -721,7 +721,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) ret = measure_vals(param, &bw_resc_start); if (ret) break; - } else if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) { + } else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { ret = param->setup(1, param); if (ret) { ret = 0; diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index bc52076bee7f..b47f4f150189 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -334,7 +334,7 @@ void run_benchmark(int signum, siginfo_t *info, void *ucontext) operation = atoi(benchmark_cmd[4]); sprintf(resctrl_val, "%s", benchmark_cmd[5]); - if (strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) + if (strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) buffer_span = span * MB; else buffer_span = span; @@ -458,8 +458,8 @@ int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, if (ret) goto out; - /* Create mon grp and write pid into it for "mbm" and "cqm" test */ - if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR)) || + /* Create mon grp and write pid into it for "mbm" and "cmt" test */ + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR)) || !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { if (strlen(mongrp)) { sprintf(monitorgroup_p, "%s/mon_groups", controlgroup); @@ -507,7 +507,7 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) if (strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) && strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) && - strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) + strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) return -ENOENT; if (!schemata) { @@ -529,7 +529,7 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) sprintf(controlgroup, "%s/schemata", RESCTRL_PATH); if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) || - !strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) + !strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata); if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata); -- cgit v1.2.3 From ca2f4214f9671dfc08b6c5723188e03574203dc5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:42 +0000 Subject: selftests/resctrl: Call kselftest APIs to log test results Call kselftest APIs instead of using printf() to log test results for cleaner code and better future extension. Suggested-by: Shuah Khan Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cat_test.c | 37 ++++++++++---------- tools/testing/selftests/resctrl/cmt_test.c | 42 +++++++++++------------ tools/testing/selftests/resctrl/mba_test.c | 24 ++++++------- tools/testing/selftests/resctrl/mbm_test.c | 28 ++++++++------- tools/testing/selftests/resctrl/resctrl.h | 2 +- tools/testing/selftests/resctrl/resctrl_tests.c | 40 +++++++++++----------- tools/testing/selftests/resctrl/resctrl_val.c | 4 +-- tools/testing/selftests/resctrl/resctrlfs.c | 45 +++++++++---------------- 8 files changed, 105 insertions(+), 117 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 20823725daca..1daf911076c7 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -52,25 +52,28 @@ static int cat_setup(int num, ...) return ret; } -static void show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits, - unsigned long span) +static int show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits, + unsigned long span) { unsigned long allocated_cache_lines = span / 64; unsigned long avg_llc_perf_miss = 0; float diff_percent; + int ret; avg_llc_perf_miss = sum_llc_perf_miss / (NUM_OF_RUNS - 1); diff_percent = ((float)allocated_cache_lines - avg_llc_perf_miss) / allocated_cache_lines * 100; - printf("%sok CAT: cache miss rate within %d%%\n", - !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT ? - "not " : "", MAX_DIFF_PERCENT); - tests_run++; - printf("# Percent diff=%d\n", abs((int)diff_percent)); - printf("# Number of bits: %d\n", no_of_bits); - printf("# Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss); - printf("# Allocated cache lines: %lu\n", allocated_cache_lines); + ret = !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT; + ksft_print_msg("Cache miss rate %swithin %d%%\n", + ret ? "not " : "", MAX_DIFF_PERCENT); + + ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); + ksft_print_msg("Number of bits: %d\n", no_of_bits); + ksft_print_msg("Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss); + ksft_print_msg("Allocated cache lines: %lu\n", allocated_cache_lines); + + return ret; } static int check_results(struct resctrl_val_param *param) @@ -80,7 +83,7 @@ static int check_results(struct resctrl_val_param *param) int runs = 0, no_of_bits = 0; FILE *fp; - printf("# Checking for pass/fail\n"); + ksft_print_msg("Checking for pass/fail\n"); fp = fopen(param->filename, "r"); if (!fp) { perror("# Cannot open file"); @@ -108,9 +111,7 @@ static int check_results(struct resctrl_val_param *param) fclose(fp); no_of_bits = count_bits(param->mask); - show_cache_info(sum_llc_perf_miss, no_of_bits, param->span); - - return 0; + return show_cache_info(sum_llc_perf_miss, no_of_bits, param->span); } void cat_test_cleanup(void) @@ -146,15 +147,15 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) ret = get_cache_size(cpu_no, cache_type, &cache_size); if (ret) return ret; - printf("cache size :%lu\n", cache_size); + ksft_print_msg("Cache size :%lu\n", cache_size); /* Get max number of bits from default-cabm mask */ count_of_bits = count_bits(long_mask); if (n < 1 || n > count_of_bits - 1) { - printf("Invalid input value for no_of_bits n!\n"); - printf("Please Enter value in range 1 to %d\n", - count_of_bits - 1); + ksft_print_msg("Invalid input value for no_of_bits n!\n"); + ksft_print_msg("Please enter value in range 1 to %d\n", + count_of_bits - 1); return -1; } diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c index 4b63838dda32..b1ab1bd1f74d 100644 --- a/tools/testing/selftests/resctrl/cmt_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -39,36 +39,33 @@ static int cmt_setup(int num, ...) return 0; } -static void show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, - unsigned long span) +static int show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, + unsigned long span) { unsigned long avg_llc_occu_resc = 0; float diff_percent; long avg_diff = 0; - bool res; + int ret; avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1); avg_diff = (long)abs(span - avg_llc_occu_resc); diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100; - if ((abs((int)diff_percent) <= MAX_DIFF_PERCENT) || - (abs(avg_diff) <= MAX_DIFF)) - res = true; - else - res = false; + ret = (abs((int)diff_percent) > MAX_DIFF_PERCENT) && + (abs(avg_diff) > MAX_DIFF); - printf("%sok CMT: diff within %d, %d\%%\n", res ? "" : "not", - MAX_DIFF, (int)MAX_DIFF_PERCENT); + ksft_print_msg("%s cache miss diff within %d, %d\%%\n", + ret ? "Fail:" : "Pass:", MAX_DIFF, (int)MAX_DIFF_PERCENT); - printf("# diff: %ld\n", avg_diff); - printf("# percent diff=%d\n", abs((int)diff_percent)); - printf("# Results are displayed in (Bytes)\n"); - printf("# Number of bits: %d\n", no_of_bits); - printf("# Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); - printf("# llc_occu_exp (span): %lu\n", span); + ksft_print_msg("Diff: %ld\n", avg_diff); + ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); + ksft_print_msg("Results are displayed in (Bytes)\n"); + ksft_print_msg("Number of bits: %d\n", no_of_bits); + ksft_print_msg("Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); + ksft_print_msg("llc_occu_exp (span): %lu\n", span); - tests_run++; + return ret; } static int check_results(struct resctrl_val_param *param, int no_of_bits) @@ -78,7 +75,7 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) int runs = 0; FILE *fp; - printf("# checking for pass/fail\n"); + ksft_print_msg("Checking for pass/fail\n"); fp = fopen(param->filename, "r"); if (!fp) { perror("# Error in opening file\n"); @@ -101,9 +98,8 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) runs++; } fclose(fp); - show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); - return 0; + return show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); } void cmt_test_cleanup(void) @@ -134,13 +130,13 @@ int cmt_resctrl_val(int cpu_no, int n, char **benchmark_cmd) ret = get_cache_size(cpu_no, "L3", &cache_size); if (ret) return ret; - printf("cache size :%lu\n", cache_size); + ksft_print_msg("Cache size :%lu\n", cache_size); count_of_bits = count_bits(long_mask); if (n < 1 || n > count_of_bits) { - printf("Invalid input value for numbr_of_bits n!\n"); - printf("Please Enter value in range 1 to %d\n", count_of_bits); + ksft_print_msg("Invalid input value for numbr_of_bits n!\n"); + ksft_print_msg("Please enter value in range 1 to %d\n", count_of_bits); return -1; } diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 6449fbd96096..f42d4ba70363 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -56,7 +56,7 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) int allocation, runs; bool failed = false; - printf("# Results are displayed in (MB)\n"); + ksft_print_msg("Results are displayed in (MB)\n"); /* Memory bandwidth from 100% down to 10% */ for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP; allocation++) { @@ -78,21 +78,21 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1); avg_diff = labs((long)(avg_bw_resc - avg_bw_imc)); - printf("%sok MBA schemata percentage %u smaller than %d %%\n", - avg_diff > MAX_DIFF ? "not " : "", - ALLOCATION_MAX - ALLOCATION_STEP * allocation, - MAX_DIFF); - tests_run++; - printf("# avg_diff: %lu\n", avg_diff); - printf("# avg_bw_imc: %lu\n", avg_bw_imc); - printf("# avg_bw_resc: %lu\n", avg_bw_resc); + ksft_print_msg("%s MBA schemata percentage %u smaller than %d %%\n", + avg_diff > MAX_DIFF ? "Fail:" : "Pass:", + ALLOCATION_MAX - ALLOCATION_STEP * allocation, + MAX_DIFF); + ksft_print_msg("avg_diff: %lu\n", avg_diff); + ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); + ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); if (avg_diff > MAX_DIFF) failed = true; } - printf("%sok schemata change using MBA%s\n", failed ? "not " : "", - failed ? " # at least one test failed" : ""); - tests_run++; + ksft_print_msg("%s schemata change using MBA\n", + failed ? "Fail:" : "Pass:"); + if (failed) + ksft_print_msg("At least one test failed"); } static int check_results(void) diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index ec6cfe01c9c2..0d65ba4b62b4 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -14,13 +14,13 @@ #define MAX_DIFF 300 #define NUM_OF_RUNS 5 -static void +static int show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) { unsigned long avg_bw_imc = 0, avg_bw_resc = 0; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; long avg_diff = 0; - int runs; + int runs, ret; /* * Discard the first value which is inaccurate due to monitoring setup @@ -35,13 +35,15 @@ show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) avg_bw_resc = sum_bw_resc / 4; avg_diff = avg_bw_resc - avg_bw_imc; - printf("%sok MBM: diff within %d%%\n", - labs(avg_diff) > MAX_DIFF ? "not " : "", MAX_DIFF); - tests_run++; - printf("# avg_diff: %lu\n", labs(avg_diff)); - printf("# Span (MB): %d\n", span); - printf("# avg_bw_imc: %lu\n", avg_bw_imc); - printf("# avg_bw_resc: %lu\n", avg_bw_resc); + ret = labs(avg_diff) > MAX_DIFF; + ksft_print_msg("%s MBM: diff within %d%%\n", + ret ? "Fail:" : "Pass:", MAX_DIFF); + ksft_print_msg("avg_diff: %lu\n", labs(avg_diff)); + ksft_print_msg("Span (MB): %d\n", span); + ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); + ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); + + return ret; } static int check_results(int span) @@ -49,10 +51,10 @@ static int check_results(int span) unsigned long bw_imc[NUM_OF_RUNS], bw_resc[NUM_OF_RUNS]; char temp[1024], *token_array[8]; char output[] = RESULT_FILE_NAME; - int runs; + int runs, ret; FILE *fp; - printf("# Checking for pass/fail\n"); + ksft_print_msg("Checking for pass/fail\n"); fp = fopen(output, "r"); if (!fp) { @@ -76,11 +78,11 @@ static int check_results(int span) runs++; } - show_bw_info(bw_imc, bw_resc, span); + ret = show_bw_info(bw_imc, bw_resc, span); fclose(fp); - return 0; + return ret; } static int mbm_setup(int num, ...) diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 1a58767a0bd2..ebf88217f9de 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -23,6 +23,7 @@ #include #include #include +#include "../kselftest.h" #define MB (1024 * 1024) #define RESCTRL_PATH "/sys/fs/resctrl" @@ -68,7 +69,6 @@ struct resctrl_val_param { #define CAT_STR "cat" extern pid_t bm_pid, ppid; -extern int tests_run; extern char llc_occup_path[1024]; extern bool is_amd; diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 6b2ed2efaad4..ccc1d6987cc6 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -60,7 +60,7 @@ int main(int argc, char **argv) int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5; char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; - int ben_ind, ben_count; + int ben_ind, ben_count, tests = 0; bool cat_test = true; for (i = 0; i < argc; i++) { @@ -87,12 +87,16 @@ int main(int argc, char **argv) while (token) { if (!strncmp(token, MBM_STR, sizeof(MBM_STR))) { mbm_test = true; + tests++; } else if (!strncmp(token, MBA_STR, sizeof(MBA_STR))) { mba_test = true; + tests++; } else if (!strncmp(token, CMT_STR, sizeof(CMT_STR))) { cmt_test = true; + tests++; } else if (!strncmp(token, CAT_STR, sizeof(CAT_STR))) { cat_test = true; + tests++; } else { printf("invalid argument\n"); @@ -118,7 +122,7 @@ int main(int argc, char **argv) } } - printf("TAP version 13\n"); + ksft_print_header(); /* * Typically we need root privileges, because: @@ -126,7 +130,7 @@ int main(int argc, char **argv) * 2. We execute perf commands */ if (geteuid() != 0) - printf("# WARNING: not running as root, tests may fail.\n"); + return ksft_exit_fail_msg("Not running as root, abort testing.\n"); /* Detect AMD vendor */ detect_amd(); @@ -155,48 +159,46 @@ int main(int argc, char **argv) sprintf(bw_report, "reads"); sprintf(bm_type, "fill_buf"); - check_resctrlfs_support(); + if (!check_resctrlfs_support()) + return ksft_exit_fail_msg("resctrl FS does not exist\n"); + filter_dmesg(); + ksft_set_plan(tests ? : 4); + if (!is_amd && mbm_test) { - printf("# Starting MBM BW change ...\n"); + ksft_print_msg("Starting MBM BW change ...\n"); if (!has_ben) sprintf(benchmark_cmd[5], "%s", MBA_STR); res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); - printf("%sok MBM: bw change\n", res ? "not " : ""); + ksft_test_result(!res, "MBM: bw change\n"); mbm_test_cleanup(); - tests_run++; } if (!is_amd && mba_test) { - printf("# Starting MBA Schemata change ...\n"); + ksft_print_msg("Starting MBA Schemata change ...\n"); if (!has_ben) sprintf(benchmark_cmd[1], "%d", span); res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); - printf("%sok MBA: schemata change\n", res ? "not " : ""); + ksft_test_result(!res, "MBA: schemata change\n"); mba_test_cleanup(); - tests_run++; } if (cmt_test) { - printf("# Starting CMT test ...\n"); + ksft_print_msg("Starting CMT test ...\n"); if (!has_ben) sprintf(benchmark_cmd[5], "%s", CMT_STR); res = cmt_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); - printf("%sok CMT: test\n", res ? "not " : ""); + ksft_test_result(!res, "CMT: test\n"); cmt_test_cleanup(); - tests_run++; } if (cat_test) { - printf("# Starting CAT test ...\n"); + ksft_print_msg("Starting CAT test ...\n"); res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); - printf("%sok CAT: test\n", res ? "not " : ""); - tests_run++; + ksft_test_result(!res, "CAT: test\n"); cat_test_cleanup(); } - printf("1..%d\n", tests_run); - - return 0; + return ksft_exit_pass(); } diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 17770095c98e..5dfae51133bc 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -449,7 +449,7 @@ void ctrlc_handler(int signum, siginfo_t *info, void *ptr) kill(bm_pid, SIGKILL); umount_resctrlfs(); tests_cleanup(); - printf("Ending\n\n"); + ksft_print_msg("Ending\n\n"); exit(EXIT_SUCCESS); } @@ -645,7 +645,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param) PARENT_EXIT("Child is done"); } - printf("# benchmark PID: %d\n", bm_pid); + ksft_print_msg("Benchmark PID: %d\n", bm_pid); /* * Register CTRL-C handler for parent, as it has to kill benchmark diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index b47f4f150189..6b22a186790a 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -10,8 +10,6 @@ */ #include "resctrl.h" -int tests_run; - static int find_resctrl_mount(char *buffer) { FILE *mounts; @@ -68,23 +66,17 @@ int remount_resctrlfs(bool mum_resctrlfs) if (ret) strcpy(mountpoint, RESCTRL_PATH); - if (!ret && mum_resctrlfs && umount(mountpoint)) { - printf("not ok unmounting \"%s\"\n", mountpoint); - perror("# umount"); - tests_run++; - } + if (!ret && mum_resctrlfs && umount(mountpoint)) + ksft_print_msg("Fail: unmounting \"%s\"\n", mountpoint); if (!ret && !mum_resctrlfs) return 0; + ksft_print_msg("Mounting resctrl to \"%s\"\n", RESCTRL_PATH); ret = mount("resctrl", RESCTRL_PATH, "resctrl", 0, NULL); - printf("%sok mounting resctrl to \"%s\"\n", ret ? "not " : "", - RESCTRL_PATH); if (ret) perror("# mount"); - tests_run++; - return ret; } @@ -477,13 +469,10 @@ int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp, } out: - printf("%sok writing benchmark parameters to resctrl FS\n", - ret ? "not " : ""); + ksft_print_msg("Writing benchmark parameters to resctrl FS\n"); if (ret) perror("# writing to resctrlfs"); - tests_run++; - return ret; } @@ -511,7 +500,7 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) return -ENOENT; if (!schemata) { - printf("# Skipping empty schemata update\n"); + ksft_print_msg("Skipping empty schemata update\n"); return -1; } @@ -552,10 +541,9 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val) fclose(fp); out: - printf("%sok Write schema \"%s\" to resctrl FS%s%s\n", - ret ? "not " : "", schema, ret ? " # " : "", - ret ? reason : ""); - tests_run++; + ksft_print_msg("Write schema \"%s\" to resctrl FS%s%s\n", + schema, ret ? " # " : "", + ret ? reason : ""); return ret; } @@ -579,18 +567,17 @@ bool check_resctrlfs_support(void) fclose(inf); - printf("%sok kernel supports resctrl filesystem\n", ret ? "" : "not "); - tests_run++; + ksft_print_msg("%s kernel supports resctrl filesystem\n", + ret ? "Pass:" : "Fail:"); dp = opendir(RESCTRL_PATH); - printf("%sok resctrl mountpoint \"%s\" exists\n", - dp ? "" : "not ", RESCTRL_PATH); + ksft_print_msg("%s resctrl mountpoint \"%s\" exists\n", + dp ? "Pass:" : "Fail:", RESCTRL_PATH); if (dp) closedir(dp); - tests_run++; - printf("# resctrl filesystem %s mounted\n", - find_resctrl_mount(NULL) ? "not" : "is"); + ksft_print_msg("resctrl filesystem %s mounted\n", + find_resctrl_mount(NULL) ? "not" : "is"); return ret; } @@ -672,9 +659,9 @@ int filter_dmesg(void) while (fgets(line, 1024, fp)) { if (strstr(line, "intel_rdt:")) - printf("# dmesg: %s", line); + ksft_print_msg("dmesg: %s", line); if (strstr(line, "resctrl:")) - printf("# dmesg: %s", line); + ksft_print_msg("dmesg: %s", line); } fclose(fp); waitpid(pid, NULL, 0); -- cgit v1.2.3 From 03216ed7bb4de8ce707eb4de23a08516a542770f Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:43 +0000 Subject: selftests/resctrl: Share show_cache_info() by CAT and CMT tests show_cache_info() functions are defined separately in CAT and CMT tests. But the functions are same for the tests and unnecessary to be defined separately. Share the function by the tests. Suggested-by: Shuah Khan Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cache.c | 42 ++++++++++++++++++++++++++++++ tools/testing/selftests/resctrl/cat_test.c | 28 +++----------------- tools/testing/selftests/resctrl/cmt_test.c | 33 +++-------------------- tools/testing/selftests/resctrl/resctrl.h | 4 +++ 4 files changed, 52 insertions(+), 55 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index 2aa1b5c7d9e1..362e3a418caa 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -270,3 +270,45 @@ int cat_val(struct resctrl_val_param *param) return ret; } + +/* + * show_cache_info: show cache test result information + * @sum_llc_val: sum of LLC cache result data + * @no_of_bits: number of bits + * @cache_span: cache span in bytes for CMT or in lines for CAT + * @max_diff: max difference + * @max_diff_percent: max difference percentage + * @num_of_runs: number of runs + * @platform: show test information on this platform + * @cmt: CMT test or CAT test + * + * Return: 0 on success. non-zero on failure. + */ +int show_cache_info(unsigned long sum_llc_val, int no_of_bits, + unsigned long cache_span, unsigned long max_diff, + unsigned long max_diff_percent, unsigned long num_of_runs, + bool platform, bool cmt) +{ + unsigned long avg_llc_val = 0; + float diff_percent; + long avg_diff = 0; + int ret; + + avg_llc_val = sum_llc_val / (num_of_runs - 1); + avg_diff = (long)abs(cache_span - avg_llc_val); + diff_percent = ((float)cache_span - avg_llc_val) / cache_span * 100; + + ret = platform && abs((int)diff_percent) > max_diff_percent && + (cmt ? (abs(avg_diff) > max_diff) : true); + + ksft_print_msg("%s cache miss rate within %d%%\n", + ret ? "Fail:" : "Pass:", max_diff_percent); + + ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); + ksft_print_msg("Number of bits: %d\n", no_of_bits); + ksft_print_msg("Average LLC val: %lu\n", avg_llc_val); + ksft_print_msg("Cache span (%s): %lu\n", cmt ? "bytes" : "lines", + cache_span); + + return ret; +} diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 1daf911076c7..090d3afc7a78 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -52,30 +52,6 @@ static int cat_setup(int num, ...) return ret; } -static int show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits, - unsigned long span) -{ - unsigned long allocated_cache_lines = span / 64; - unsigned long avg_llc_perf_miss = 0; - float diff_percent; - int ret; - - avg_llc_perf_miss = sum_llc_perf_miss / (NUM_OF_RUNS - 1); - diff_percent = ((float)allocated_cache_lines - avg_llc_perf_miss) / - allocated_cache_lines * 100; - - ret = !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT; - ksft_print_msg("Cache miss rate %swithin %d%%\n", - ret ? "not " : "", MAX_DIFF_PERCENT); - - ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); - ksft_print_msg("Number of bits: %d\n", no_of_bits); - ksft_print_msg("Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss); - ksft_print_msg("Allocated cache lines: %lu\n", allocated_cache_lines); - - return ret; -} - static int check_results(struct resctrl_val_param *param) { char *token_array[8], temp[512]; @@ -111,7 +87,9 @@ static int check_results(struct resctrl_val_param *param) fclose(fp); no_of_bits = count_bits(param->mask); - return show_cache_info(sum_llc_perf_miss, no_of_bits, param->span); + return show_cache_info(sum_llc_perf_miss, no_of_bits, param->span / 64, + MAX_DIFF, MAX_DIFF_PERCENT, NUM_OF_RUNS, + !is_amd, false); } void cat_test_cleanup(void) diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c index b1ab1bd1f74d..8968e36db99d 100644 --- a/tools/testing/selftests/resctrl/cmt_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -39,35 +39,6 @@ static int cmt_setup(int num, ...) return 0; } -static int show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits, - unsigned long span) -{ - unsigned long avg_llc_occu_resc = 0; - float diff_percent; - long avg_diff = 0; - int ret; - - avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1); - avg_diff = (long)abs(span - avg_llc_occu_resc); - - diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100; - - ret = (abs((int)diff_percent) > MAX_DIFF_PERCENT) && - (abs(avg_diff) > MAX_DIFF); - - ksft_print_msg("%s cache miss diff within %d, %d\%%\n", - ret ? "Fail:" : "Pass:", MAX_DIFF, (int)MAX_DIFF_PERCENT); - - ksft_print_msg("Diff: %ld\n", avg_diff); - ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); - ksft_print_msg("Results are displayed in (Bytes)\n"); - ksft_print_msg("Number of bits: %d\n", no_of_bits); - ksft_print_msg("Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc); - ksft_print_msg("llc_occu_exp (span): %lu\n", span); - - return ret; -} - static int check_results(struct resctrl_val_param *param, int no_of_bits) { char *token_array[8], temp[512]; @@ -99,7 +70,9 @@ static int check_results(struct resctrl_val_param *param, int no_of_bits) } fclose(fp); - return show_cache_info(sum_llc_occu_resc, no_of_bits, param->span); + return show_cache_info(sum_llc_occu_resc, no_of_bits, param->span, + MAX_DIFF, MAX_DIFF_PERCENT, NUM_OF_RUNS, + true, true); } void cmt_test_cleanup(void) diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index ebf88217f9de..81f322245ef7 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -108,5 +108,9 @@ unsigned int count_bits(unsigned long n); void cmt_test_cleanup(void); int get_core_sibling(int cpu_no); int measure_cache_vals(struct resctrl_val_param *param, int bm_pid); +int show_cache_info(unsigned long sum_llc_val, int no_of_bits, + unsigned long cache_span, unsigned long max_diff, + unsigned long max_diff_percent, unsigned long num_of_runs, + bool platform, bool cmt); #endif /* RESCTRL_H */ -- cgit v1.2.3 From f29838e6fa131f4a323225457112fb869d15931b Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 17 Mar 2021 02:22:44 +0000 Subject: selftests/resctrl: Fix a printed message Add a missing newline to the printed help text to improve readability. Tested-by: Babu Moger Signed-off-by: Reinette Chatre Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl_tests.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index ccc1d6987cc6..355bd28b996a 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -37,8 +37,8 @@ void detect_amd(void) static void cmd_help(void) { printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits]\n"); - printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT"); - printf("\t default benchmark is builtin fill_buf\n"); + printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CMT\n"); + printf("\t default benchmark is builtin fill_buf\n"); printf("\t-t test list: run tests specified in the test list, "); printf("e.g. -t mbm, mba, cmt, cat\n"); printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n"); -- cgit v1.2.3 From b67a7665a917e7305eaa573a474c859fe4c5949e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:45 +0000 Subject: selftests/resctrl: Add config dependencies Add the config file for test dependencies. Suggested-by: Shuah Khan Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/config | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 tools/testing/selftests/resctrl/config (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/config b/tools/testing/selftests/resctrl/config new file mode 100644 index 000000000000..8d9f2deb56ed --- /dev/null +++ b/tools/testing/selftests/resctrl/config @@ -0,0 +1,2 @@ +CONFIG_X86_CPU_RESCTRL=y +CONFIG_PROC_CPU_RESCTRL=y -- cgit v1.2.3 From a3611fbc6e58c147bdd409b356baf15ddf57271e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:46 +0000 Subject: selftests/resctrl: Check for resctrl mount point only if resctrl FS is supported check_resctrlfs_support() does the following 1. Checks if the platform supports resctrl file system or not by looking for resctrl in /proc/filesystems 2. Calls opendir() on default resctrl file system path (i.e. /sys/fs/resctrl) 3. Checks if resctrl file system is mounted or not by looking at /proc/mounts Steps 2 and 3 will fail if the platform does not support resctrl file system. So, there is no need to check for them if step 1 fails. Fix this by returning immediately if the platform does not support resctrl file system. Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrlfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 6b22a186790a..87195eb78356 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -570,6 +570,9 @@ bool check_resctrlfs_support(void) ksft_print_msg("%s kernel supports resctrl filesystem\n", ret ? "Pass:" : "Fail:"); + if (!ret) + return ret; + dp = opendir(RESCTRL_PATH); ksft_print_msg("%s resctrl mountpoint \"%s\" exists\n", dp ? "Pass:" : "Fail:", RESCTRL_PATH); -- cgit v1.2.3 From ee0415681eb661efa1eb2db7acc263f2c7df1e23 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:47 +0000 Subject: selftests/resctrl: Use resctrl/info for feature detection Resctrl test suite before running any unit test (like cmt, cat, mbm and mba) should first check if the feature is enabled (by kernel and not just supported by H/W) on the platform or not. validate_resctrl_feature_request() is supposed to do that. This function intends to grep for relevant flags in /proc/cpuinfo but there are several issues here 1. validate_resctrl_feature_request() calls fgrep() to get flags from /proc/cpuinfo. But, fgrep() can only return a string with maximum of 255 characters and hence the complete cpu flags are never returned. 2. The substring search logic is also busted. If strstr() finds requested resctrl feature in the cpu flags, it returns pointer to the first occurrence. But, the logic negates the return value of strstr() and hence validate_resctrl_feature_request() returns false if the feature is present in the cpu flags and returns true if the feature is not present. 3. validate_resctrl_feature_request() checks if a resctrl feature is reported in /proc/cpuinfo flags or not. Having a cpu flag means that the H/W supports the feature, but it doesn't mean that the kernel enabled it. A user could selectively enable only a subset of resctrl features using kernel command line arguments. Hence, /proc/cpuinfo isn't a reliable source to check if a feature is enabled or not. The 3rd issue being the major one and fixing it requires changing the way validate_resctrl_feature_request() works. Since, /proc/cpuinfo isn't the right place to check if a resctrl feature is enabled or not, a more appropriate place is /sys/fs/resctrl/info directory. Change validate_resctrl_feature_request() such that, 1. For cat, check if /sys/fs/resctrl/info/L3 directory is present or not 2. For mba, check if /sys/fs/resctrl/info/MB directory is present or not 3. For cmt, check if /sys/fs/resctrl/info/L3_MON directory is present and check if /sys/fs/resctrl/info/L3_MON/mon_features has llc_occupancy 4. For mbm, check if /sys/fs/resctrl/info/L3_MON directory is present and check if /sys/fs/resctrl/info/L3_MON/mon_features has mbm__bytes Please note that only L3_CAT, L3_CMT, MBA and MBM are supported. CDP and L2 variants can be added later. Reported-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl.h | 6 +++- tools/testing/selftests/resctrl/resctrlfs.c | 52 +++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 81f322245ef7..1ad10c47e31d 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -29,6 +29,10 @@ #define RESCTRL_PATH "/sys/fs/resctrl" #define PHYS_ID_PATH "/sys/devices/system/cpu/cpu" #define CBM_MASK_PATH "/sys/fs/resctrl/info" +#define L3_PATH "/sys/fs/resctrl/info/L3" +#define MB_PATH "/sys/fs/resctrl/info/MB" +#define L3_MON_PATH "/sys/fs/resctrl/info/L3_MON" +#define L3_MON_FEATURES_PATH "/sys/fs/resctrl/info/L3_MON/mon_features" #define PARENT_EXIT(err_msg) \ do { \ @@ -79,7 +83,7 @@ int remount_resctrlfs(bool mum_resctrlfs); int get_resource_id(int cpu_no, int *resource_id); int umount_resctrlfs(void); int validate_bw_report_request(char *bw_report); -bool validate_resctrl_feature_request(char *resctrl_val); +bool validate_resctrl_feature_request(const char *resctrl_val); char *fgrep(FILE *inf, const char *str); int taskset_benchmark(pid_t bm_pid, int cpu_no); void run_benchmark(int signum, siginfo_t *info, void *ucontext); diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 87195eb78356..26563175acf6 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -606,26 +606,56 @@ char *fgrep(FILE *inf, const char *str) * validate_resctrl_feature_request - Check if requested feature is valid. * @resctrl_val: Requested feature * - * Return: 0 on success, non-zero on failure + * Return: True if the feature is supported, else false */ -bool validate_resctrl_feature_request(char *resctrl_val) +bool validate_resctrl_feature_request(const char *resctrl_val) { - FILE *inf = fopen("/proc/cpuinfo", "r"); + struct stat statbuf; bool found = false; char *res; + FILE *inf; - if (!inf) + if (!resctrl_val) return false; - res = fgrep(inf, "flags"); - - if (res) { - char *s = strchr(res, ':'); + if (remount_resctrlfs(false)) + return false; - found = s && !strstr(s, resctrl_val); - free(res); + if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) { + if (!stat(L3_PATH, &statbuf)) + return true; + } else if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) { + if (!stat(MB_PATH, &statbuf)) + return true; + } else if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) || + !strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { + if (!stat(L3_MON_PATH, &statbuf)) { + inf = fopen(L3_MON_FEATURES_PATH, "r"); + if (!inf) + return false; + + if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) { + res = fgrep(inf, "llc_occupancy"); + if (res) { + found = true; + free(res); + } + } + + if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) { + res = fgrep(inf, "mbm_total_bytes"); + if (res) { + free(res); + res = fgrep(inf, "mbm_local_bytes"); + if (res) { + found = true; + free(res); + } + } + } + fclose(inf); + } } - fclose(inf); return found; } -- cgit v1.2.3 From 06bd03a57f8c2e3a8698a7ce7dead4ef18e00902 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:48 +0000 Subject: selftests/resctrl: Fix MBA/MBM results reporting format MBM unit test starts fill_buf (default built-in benchmark) in a new con_mon group (c1, m1) and records resctrl reported mbm values and iMC (Integrated Memory Controller) values every second. It does this for five seconds (randomly chosen value) in total. It then calculates average of resctrl_mbm values and imc_mbm values and if the difference is greater than 300 MB/sec (randomly chosen value), the test treats it as a failure. MBA unit test is similar to MBM but after every run it changes schemata. Checking for a difference of 300 MB/sec doesn't look very meaningful when the mbm values are changing over a wide range. For example, below are the values running MBA test on SKL with different allocations 1. With 10% as schemata both iMC and resctrl mbm_values are around 2000 MB/sec 2. With 100% as schemata both iMC and resctrl mbm_values are around 10000 MB/sec A 300 MB/sec difference between resctrl_mbm and imc_mbm values is acceptable at 100% schemata but it isn't acceptable at 10% schemata because that's a huge difference. So, fix this by checking for percentage difference instead of absolute difference i.e. check if the difference between resctrl_mbm value and imc_mbm value is within 5% (randomly chosen value) of imc_mbm value. If the difference is greater than 5% of imc_mbm value, treat it is a failure. Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/mba_test.c | 22 +++++++++++++--------- tools/testing/selftests/resctrl/mbm_test.c | 15 ++++++++------- 2 files changed, 21 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index f42d4ba70363..8842d379e886 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -12,7 +12,7 @@ #define RESULT_FILE_NAME "result_mba" #define NUM_OF_RUNS 5 -#define MAX_DIFF 300 +#define MAX_DIFF_PERCENT 5 #define ALLOCATION_MAX 100 #define ALLOCATION_MIN 10 #define ALLOCATION_STEP 10 @@ -62,7 +62,8 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) allocation++) { unsigned long avg_bw_imc, avg_bw_resc; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; - unsigned long avg_diff; + int avg_diff_per; + float avg_diff; /* * The first run is discarded due to inaccurate value from @@ -76,16 +77,19 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1); avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1); - avg_diff = labs((long)(avg_bw_resc - avg_bw_imc)); + avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc; + avg_diff_per = (int)(avg_diff * 100); - ksft_print_msg("%s MBA schemata percentage %u smaller than %d %%\n", - avg_diff > MAX_DIFF ? "Fail:" : "Pass:", - ALLOCATION_MAX - ALLOCATION_STEP * allocation, - MAX_DIFF); - ksft_print_msg("avg_diff: %lu\n", avg_diff); + ksft_print_msg("%s MBA: diff within %d%% for schemata %u\n", + avg_diff_per > MAX_DIFF_PERCENT ? + "Fail:" : "Pass:", + MAX_DIFF_PERCENT, + ALLOCATION_MAX - ALLOCATION_STEP * allocation); + + ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per); ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); - if (avg_diff > MAX_DIFF) + if (avg_diff_per > MAX_DIFF_PERCENT) failed = true; } diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 0d65ba4b62b4..651d4ac15986 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -11,7 +11,7 @@ #include "resctrl.h" #define RESULT_FILE_NAME "result_mbm" -#define MAX_DIFF 300 +#define MAX_DIFF_PERCENT 5 #define NUM_OF_RUNS 5 static int @@ -19,8 +19,8 @@ show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) { unsigned long avg_bw_imc = 0, avg_bw_resc = 0; unsigned long sum_bw_imc = 0, sum_bw_resc = 0; - long avg_diff = 0; - int runs, ret; + int runs, ret, avg_diff_per; + float avg_diff = 0; /* * Discard the first value which is inaccurate due to monitoring setup @@ -33,12 +33,13 @@ show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) avg_bw_imc = sum_bw_imc / 4; avg_bw_resc = sum_bw_resc / 4; - avg_diff = avg_bw_resc - avg_bw_imc; + avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc; + avg_diff_per = (int)(avg_diff * 100); - ret = labs(avg_diff) > MAX_DIFF; + ret = avg_diff_per > MAX_DIFF_PERCENT; ksft_print_msg("%s MBM: diff within %d%%\n", - ret ? "Fail:" : "Pass:", MAX_DIFF); - ksft_print_msg("avg_diff: %lu\n", labs(avg_diff)); + ret ? "Fail:" : "Pass:", MAX_DIFF_PERCENT); + ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per); ksft_print_msg("Span (MB): %d\n", span); ksft_print_msg("avg_bw_imc: %lu\n", avg_bw_imc); ksft_print_msg("avg_bw_resc: %lu\n", avg_bw_resc); -- cgit v1.2.3 From 09a67934625a5941737c566b48e4e574ac4d1d99 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:49 +0000 Subject: selftests/resctrl: Don't hard code value of "no_of_bits" variable Cache related tests (like CAT and CMT) depend on a variable called no_of_bits to run. no_of_bits defines the number of contiguous bits that should be set in the CBM mask and a user can pass a value for no_of_bits using -n command line argument. If a user hasn't passed any value, it defaults to 5 (randomly chosen value). Hard coding no_of_bits to 5 will make the cache tests fail to run on systems that support maximum cbm mask that is less than or equal to 5 bits. Hence, don't hard code no_of_bits value. If a user passes a value for "no_of_bits" using -n option, use it. Otherwise, no_of_bits is equal to half of the maximum number of bits in the cbm mask. Please note that CMT test is still hard coded to 5 bits. It will change in subsequent patches that change CMT test. Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cat_test.c | 5 ++++- tools/testing/selftests/resctrl/resctrl_tests.c | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 090d3afc7a78..04d706b4f10e 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -130,7 +130,10 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) /* Get max number of bits from default-cabm mask */ count_of_bits = count_bits(long_mask); - if (n < 1 || n > count_of_bits - 1) { + if (!n) + n = count_of_bits / 2; + + if (n > count_of_bits - 1) { ksft_print_msg("Invalid input value for no_of_bits n!\n"); ksft_print_msg("Please enter value in range 1 to %d\n", count_of_bits - 1); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 355bd28b996a..2ace464b96d1 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -57,7 +57,7 @@ void tests_cleanup(void) int main(int argc, char **argv) { bool has_ben = false, mbm_test = true, mba_test = true, cmt_test = true; - int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5; + int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 0; char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; int ben_ind, ben_count, tests = 0; @@ -110,6 +110,10 @@ int main(int argc, char **argv) break; case 'n': no_of_bits = atoi(optarg); + if (no_of_bits <= 0) { + printf("Bail out! invalid argument for no_of_bits\n"); + return -1; + } break; case 'h': cmd_help(); @@ -188,7 +192,7 @@ int main(int argc, char **argv) ksft_print_msg("Starting CMT test ...\n"); if (!has_ben) sprintf(benchmark_cmd[5], "%s", CMT_STR); - res = cmt_resctrl_val(cpu_no, no_of_bits, benchmark_cmd); + res = cmt_resctrl_val(cpu_no, 5, benchmark_cmd); ksft_test_result(!res, "CMT: test\n"); cmt_test_cleanup(); } -- cgit v1.2.3 From c9fb4e7cee1ebf38257c93f7f5c8915a1424611e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:50 +0000 Subject: selftests/resctrl: Modularize resctrl test suite main() function Resctrl test suite main() function does the following things 1. Parses command line arguments passed by user 2. Some setup checks 3. Logic that calls into each unit test 4. Print result and clean up after running each unit test Introduce wrapper functions for steps 3 and 4 to modularize the main() function. Adding these wrapper functions makes it easier to add any logic to each individual test. Please note that this is a preparatory patch for the next one and no functional changes are intended. Suggested-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl_tests.c | 88 ++++++++++++++++--------- 1 file changed, 57 insertions(+), 31 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 2ace464b96d1..e63e0d8764ef 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -54,10 +54,58 @@ void tests_cleanup(void) cat_test_cleanup(); } +static void run_mbm_test(bool has_ben, char **benchmark_cmd, int span, + int cpu_no, char *bw_report) +{ + int res; + + ksft_print_msg("Starting MBM BW change ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", MBA_STR); + res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); + ksft_test_result(!res, "MBM: bw change\n"); + mbm_test_cleanup(); +} + +static void run_mba_test(bool has_ben, char **benchmark_cmd, int span, + int cpu_no, char *bw_report) +{ + int res; + + ksft_print_msg("Starting MBA Schemata change ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[1], "%d", span); + res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); + ksft_test_result(!res, "MBA: schemata change\n"); + mba_test_cleanup(); +} + +static void run_cmt_test(bool has_ben, char **benchmark_cmd, int cpu_no) +{ + int res; + + ksft_print_msg("Starting CMT test ...\n"); + if (!has_ben) + sprintf(benchmark_cmd[5], "%s", CMT_STR); + res = cmt_resctrl_val(cpu_no, 5, benchmark_cmd); + ksft_test_result(!res, "CMT: test\n"); + cmt_test_cleanup(); +} + +static void run_cat_test(int cpu_no, int no_of_bits) +{ + int res; + + ksft_print_msg("Starting CAT test ...\n"); + res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); + ksft_test_result(!res, "CAT: test\n"); + cat_test_cleanup(); +} + int main(int argc, char **argv) { bool has_ben = false, mbm_test = true, mba_test = true, cmt_test = true; - int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 0; + int c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 0; char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64]; char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE]; int ben_ind, ben_count, tests = 0; @@ -170,39 +218,17 @@ int main(int argc, char **argv) ksft_set_plan(tests ? : 4); - if (!is_amd && mbm_test) { - ksft_print_msg("Starting MBM BW change ...\n"); - if (!has_ben) - sprintf(benchmark_cmd[5], "%s", MBA_STR); - res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); - ksft_test_result(!res, "MBM: bw change\n"); - mbm_test_cleanup(); - } + if (!is_amd && mbm_test) + run_mbm_test(has_ben, benchmark_cmd, span, cpu_no, bw_report); - if (!is_amd && mba_test) { - ksft_print_msg("Starting MBA Schemata change ...\n"); - if (!has_ben) - sprintf(benchmark_cmd[1], "%d", span); - res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); - ksft_test_result(!res, "MBA: schemata change\n"); - mba_test_cleanup(); - } + if (!is_amd && mba_test) + run_mba_test(has_ben, benchmark_cmd, span, cpu_no, bw_report); - if (cmt_test) { - ksft_print_msg("Starting CMT test ...\n"); - if (!has_ben) - sprintf(benchmark_cmd[5], "%s", CMT_STR); - res = cmt_resctrl_val(cpu_no, 5, benchmark_cmd); - ksft_test_result(!res, "CMT: test\n"); - cmt_test_cleanup(); - } + if (cmt_test) + run_cmt_test(has_ben, benchmark_cmd, cpu_no); - if (cat_test) { - ksft_print_msg("Starting CAT test ...\n"); - res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); - ksft_test_result(!res, "CAT: test\n"); - cat_test_cleanup(); - } + if (cat_test) + run_cat_test(cpu_no, no_of_bits); return ksft_exit_pass(); } -- cgit v1.2.3 From f1dd71982d1949a988cedbf4d9f2c726ee24344f Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:51 +0000 Subject: selftests/resctrl: Skip the test if requested resctrl feature is not supported There could be two reasons why a resctrl feature might not be enabled on the platform 1. H/W might not support the feature 2. Even if the H/W supports it, the user might have disabled the feature through kernel command line arguments Hence, any resctrl unit test (like cmt, cat, mbm and mba) before starting the test will first check if the feature is enabled on the platform or not. If the feature isn't enabled, then the test returns with an error status. For example, if MBA isn't supported on a platform and if the user tries to run MBA, the output will look like this ok mounting resctrl to "/sys/fs/resctrl" not ok MBA: schemata change But, not supporting a feature isn't a test failure. So, instead of treating it as an error, use the SKIP directive of the TAP protocol. With the change, the output will look as below ok MBA # SKIP Hardware does not support MBA or MBA is disabled Suggested-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cat_test.c | 3 --- tools/testing/selftests/resctrl/mba_test.c | 3 --- tools/testing/selftests/resctrl/mbm_test.c | 3 --- tools/testing/selftests/resctrl/resctrl_tests.c | 23 +++++++++++++++++++++++ 4 files changed, 23 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 04d706b4f10e..cd4f68388e0f 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -111,9 +111,6 @@ int cat_perf_miss_val(int cpu_no, int n, char *cache_type) if (ret) return ret; - if (!validate_resctrl_feature_request("cat")) - return -1; - /* Get default cbm mask for L3/L2 cache */ ret = get_cbm_mask(cache_type, cbm_mask); if (ret) diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 8842d379e886..26f12ad4c663 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -158,9 +158,6 @@ int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd) remove(RESULT_FILE_NAME); - if (!validate_resctrl_feature_request("mba")) - return -1; - ret = resctrl_val(benchmark_cmd, ¶m); if (ret) return ret; diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 651d4ac15986..02b1ed03f1e5 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -131,9 +131,6 @@ int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd) remove(RESULT_FILE_NAME); - if (!validate_resctrl_feature_request("mbm")) - return -1; - ret = resctrl_val(benchmark_cmd, ¶m); if (ret) return ret; diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index e63e0d8764ef..fb246bc41f47 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -60,6 +60,12 @@ static void run_mbm_test(bool has_ben, char **benchmark_cmd, int span, int res; ksft_print_msg("Starting MBM BW change ...\n"); + + if (!validate_resctrl_feature_request(MBM_STR)) { + ksft_test_result_skip("Hardware does not support MBM or MBM is disabled\n"); + return; + } + if (!has_ben) sprintf(benchmark_cmd[5], "%s", MBA_STR); res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd); @@ -73,6 +79,12 @@ static void run_mba_test(bool has_ben, char **benchmark_cmd, int span, int res; ksft_print_msg("Starting MBA Schemata change ...\n"); + + if (!validate_resctrl_feature_request(MBA_STR)) { + ksft_test_result_skip("Hardware does not support MBA or MBA is disabled\n"); + return; + } + if (!has_ben) sprintf(benchmark_cmd[1], "%d", span); res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd); @@ -85,6 +97,11 @@ static void run_cmt_test(bool has_ben, char **benchmark_cmd, int cpu_no) int res; ksft_print_msg("Starting CMT test ...\n"); + if (!validate_resctrl_feature_request(CMT_STR)) { + ksft_test_result_skip("Hardware does not support CMT or CMT is disabled\n"); + return; + } + if (!has_ben) sprintf(benchmark_cmd[5], "%s", CMT_STR); res = cmt_resctrl_val(cpu_no, 5, benchmark_cmd); @@ -97,6 +114,12 @@ static void run_cat_test(int cpu_no, int no_of_bits) int res; ksft_print_msg("Starting CAT test ...\n"); + + if (!validate_resctrl_feature_request(CAT_STR)) { + ksft_test_result_skip("Hardware does not support CAT or CAT is disabled\n"); + return; + } + res = cat_perf_miss_val(cpu_no, no_of_bits, "L3"); ksft_test_result(!res, "CAT: test\n"); cat_test_cleanup(); -- cgit v1.2.3 From 4e5cb354c85eafe88709cefc2fdce4911fb6ac17 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:52 +0000 Subject: selftests/resctrl: Fix unmount resctrl FS umount_resctrlfs() directly attempts to unmount resctrl file system without checking if resctrl FS is already mounted or not. It returns 0 on success and on failure it prints an error message and returns an error status. Calling umount_resctrlfs() when resctrl FS isn't mounted will return an error status. There could be situations where-in the caller might not know if resctrl FS is already mounted or not and the caller might still want to unmount resctrl FS if it's already mounted (For example during teardown). To support above use cases, change umount_resctrlfs() such that it now first checks if resctrl FS is already mounted or not and unmounts resctrl FS only if it's already mounted. unmount resctrl FS upon exit. For example, running only mba test on a Broadwell (BDW) machine (MBA isn't supported on BDW CPU). This happens because validate_resctrl_feature_request() would mount resctrl FS to check if mba is enabled on the platform or not and finds that the H/W doesn't support mba and hence will return false to run_mba_test(). This in turn makes the main() function return without unmounting resctrl FS. Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl_tests.c | 2 ++ tools/testing/selftests/resctrl/resctrlfs.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index fb246bc41f47..f51b5fc066a3 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -253,5 +253,7 @@ int main(int argc, char **argv) if (cat_test) run_cat_test(cpu_no, no_of_bits); + umount_resctrlfs(); + return ksft_exit_pass(); } diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 26563175acf6..ade5f2b8b843 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -82,6 +82,9 @@ int remount_resctrlfs(bool mum_resctrlfs) int umount_resctrlfs(void) { + if (find_resctrl_mount(NULL)) + return 0; + if (umount(RESCTRL_PATH)) { perror("# Unable to umount resctrl"); -- cgit v1.2.3 From d81343b5eedf84be71a4313e8fd073d0c510afcf Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:53 +0000 Subject: selftests/resctrl: Fix incorrect parsing of iMC counters iMC (Integrated Memory Controller) counters are usually at "/sys/bus/event_source/devices/" and are named as "uncore_imc_". num_of_imcs() function tries to count number of such iMC counters so that it could appropriately initialize required number of perf_attr structures that could be used to read these iMC counters. num_of_imcs() function assumes that all the directories under this path that start with "uncore_imc" are iMC counters. But, on some systems there could be directories named as "uncore_imc_free_running" which aren't iMC counters. Trying to read from such directories will result in "not found file" errors and MBM/MBA tests will fail. Hence, fix the logic in num_of_imcs() such that it looks at the first character after "uncore_imc_" to check if it's a numerical digit or not. If it's a digit then the directory represents an iMC counter, else, skip the directory. Reported-by: Reinette Chatre Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl_val.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 5dfae51133bc..20d457c47ded 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -221,8 +221,8 @@ static int read_from_imc_dir(char *imc_dir, int count) */ static int num_of_imcs(void) { + char imc_dir[512], *temp; unsigned int count = 0; - char imc_dir[512]; struct dirent *ep; int ret; DIR *dp; @@ -230,7 +230,25 @@ static int num_of_imcs(void) dp = opendir(DYN_PMU_PATH); if (dp) { while ((ep = readdir(dp))) { - if (strstr(ep->d_name, UNCORE_IMC)) { + temp = strstr(ep->d_name, UNCORE_IMC); + if (!temp) + continue; + + /* + * imc counters are named as "uncore_imc_", hence + * increment the pointer to point to . Note that + * sizeof(UNCORE_IMC) would count for null character as + * well and hence the last underscore character in + * uncore_imc'_' need not be counted. + */ + temp = temp + sizeof(UNCORE_IMC); + + /* + * Some directories under "DYN_PMU_PATH" could have + * names like "uncore_imc_free_running", hence, check if + * first character is a numerical digit or not. + */ + if (temp[0] >= '0' && temp[0] <= '9') { sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH, ep->d_name); ret = read_from_imc_dir(imc_dir, count); -- cgit v1.2.3 From 1205b688c92558a04d8dd4cbc2b213e0fceba5db Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:54 +0000 Subject: selftests/resctrl: Fix checking for < 0 for unsigned values Dan reported following static checker warnings tools/testing/selftests/resctrl/resctrl_val.c:545 measure_vals() warn: 'bw_imc' unsigned <= 0 tools/testing/selftests/resctrl/resctrl_val.c:549 measure_vals() warn: 'bw_resc_end' unsigned <= 0 These warnings are reported because 1. measure_vals() declares 'bw_imc' and 'bw_resc_end' as unsigned long variables 2. Return value of get_mem_bw_imc() and get_mem_bw_resctrl() are assigned to 'bw_imc' and 'bw_resc_end' respectively 3. The returned values are checked for <= 0 to see if the calls failed Checking for < 0 for an unsigned value doesn't make any sense. Fix this issue by changing the implementation of get_mem_bw_imc() and get_mem_bw_resctrl() such that they now accept reference to a variable and set the variable appropriately upon success and return 0, else return < 0 on error. Reported-by: Dan Carpenter Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/resctrl_val.c | 41 +++++++++++++++------------ 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 20d457c47ded..95224345c78e 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -300,9 +300,9 @@ static int initialize_mem_bw_imc(void) * Memory B/W utilized by a process on a socket can be calculated using * iMC counters. Perf events are used to read these counters. * - * Return: >= 0 on success. < 0 on failure. + * Return: = 0 on success. < 0 on failure. */ -static float get_mem_bw_imc(int cpu_no, char *bw_report) +static int get_mem_bw_imc(int cpu_no, char *bw_report, float *bw_imc) { float reads, writes, of_mul_read, of_mul_write; int imc, j, ret; @@ -373,13 +373,18 @@ static float get_mem_bw_imc(int cpu_no, char *bw_report) close(imc_counters_config[imc][WRITE].fd); } - if (strcmp(bw_report, "reads") == 0) - return reads; + if (strcmp(bw_report, "reads") == 0) { + *bw_imc = reads; + return 0; + } - if (strcmp(bw_report, "writes") == 0) - return writes; + if (strcmp(bw_report, "writes") == 0) { + *bw_imc = writes; + return 0; + } - return (reads + writes); + *bw_imc = reads + writes; + return 0; } void set_mbm_path(const char *ctrlgrp, const char *mongrp, int resource_id) @@ -438,9 +443,8 @@ static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp, * 1. If con_mon grp is given, then read from it * 2. If con_mon grp is not given, then read from root con_mon grp */ -static unsigned long get_mem_bw_resctrl(void) +static int get_mem_bw_resctrl(unsigned long *mbm_total) { - unsigned long mbm_total = 0; FILE *fp; fp = fopen(mbm_total_path, "r"); @@ -449,7 +453,7 @@ static unsigned long get_mem_bw_resctrl(void) return -1; } - if (fscanf(fp, "%lu", &mbm_total) <= 0) { + if (fscanf(fp, "%lu", mbm_total) <= 0) { perror("Could not get mbm local bytes"); fclose(fp); @@ -457,7 +461,7 @@ static unsigned long get_mem_bw_resctrl(void) } fclose(fp); - return mbm_total; + return 0; } pid_t bm_pid, ppid; @@ -549,7 +553,8 @@ static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp, static int measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start) { - unsigned long bw_imc, bw_resc, bw_resc_end; + unsigned long bw_resc, bw_resc_end; + float bw_imc; int ret; /* @@ -559,13 +564,13 @@ measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start) * Compare the two values to validate resctrl value. * It takes 1sec to measure the data. */ - bw_imc = get_mem_bw_imc(param->cpu_no, param->bw_report); - if (bw_imc <= 0) - return bw_imc; + ret = get_mem_bw_imc(param->cpu_no, param->bw_report, &bw_imc); + if (ret < 0) + return ret; - bw_resc_end = get_mem_bw_resctrl(); - if (bw_resc_end <= 0) - return bw_resc_end; + ret = get_mem_bw_resctrl(&bw_resc_end); + if (ret < 0) + return ret; bw_resc = (bw_resc_end - *bw_resc_start) / MB; ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc); -- cgit v1.2.3 From 4808bf209efd586512e31590f19c1affbe56980e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 17 Mar 2021 02:22:55 +0000 Subject: selftests/resctrl: Create .gitignore to include resctrl_tests Create .gitignore to hold the test file resctrl_tests generated after compiling. Suggested-by: Shuah Khan Tested-by: Babu Moger Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 tools/testing/selftests/resctrl/.gitignore (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/.gitignore b/tools/testing/selftests/resctrl/.gitignore new file mode 100644 index 000000000000..ab68442b6bc8 --- /dev/null +++ b/tools/testing/selftests/resctrl/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +resctrl_tests -- cgit v1.2.3 From 9854781dba371dda22880fc6acac7688fb5e2bae Mon Sep 17 00:00:00 2001 From: Daniel Latypov Date: Mon, 22 Feb 2021 14:52:41 -0800 Subject: kunit: tool: make --kunitconfig accept dirs, add lib/kunit fragment TL;DR $ ./tools/testing/kunit/kunit.py run --kunitconfig=lib/kunit Per suggestion from Ted [1], we can reduce the amount of typing by assuming a convention that these files are named '.kunitconfig'. In the case of [1], we now have $ ./tools/testing/kunit/kunit.py run --kunitconfig=fs/ext4 Also add in such a fragment for kunit itself so we can give that as an example more close to home (and thus less likely to be accidentally broken). [1] https://lore.kernel.org/linux-ext4/YCNF4yP1dB97zzwD@mit.edu/ Signed-off-by: Daniel Latypov Reviewed-by: David Gow Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- lib/kunit/.kunitconfig | 3 +++ tools/testing/kunit/kunit.py | 4 +++- tools/testing/kunit/kunit_kernel.py | 2 ++ tools/testing/kunit/kunit_tool_test.py | 6 ++++++ 4 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 lib/kunit/.kunitconfig (limited to 'tools') diff --git a/lib/kunit/.kunitconfig b/lib/kunit/.kunitconfig new file mode 100644 index 000000000000..9235b7d42d38 --- /dev/null +++ b/lib/kunit/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_KUNIT_TEST=y +CONFIG_KUNIT_EXAMPLE_TEST=y diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index d5144fcb03ac..5da8fb3762f9 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -184,7 +184,9 @@ def add_common_opts(parser) -> None: help='Run all KUnit tests through allyesconfig', action='store_true') parser.add_argument('--kunitconfig', - help='Path to Kconfig fragment that enables KUnit tests', + help='Path to Kconfig fragment that enables KUnit tests.' + ' If given a directory, (e.g. lib/kunit), "/.kunitconfig" ' + 'will get automatically appended.', metavar='kunitconfig') def add_build_opts(parser) -> None: diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index f309a33256cd..89a7d4024e87 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -132,6 +132,8 @@ class LinuxSourceTree(object): return if kunitconfig_path: + if os.path.isdir(kunitconfig_path): + kunitconfig_path = os.path.join(kunitconfig_path, KUNITCONFIG_PATH) if not os.path.exists(kunitconfig_path): raise ConfigError(f'Specified kunitconfig ({kunitconfig_path}) does not exist') else: diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 1ad3049e9069..2e809dd956a7 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -251,6 +251,12 @@ class LinuxSourceTreeTest(unittest.TestCase): with tempfile.NamedTemporaryFile('wt') as kunitconfig: tree = kunit_kernel.LinuxSourceTree('', kunitconfig_path=kunitconfig.name) + def test_dir_kunitconfig(self): + with tempfile.TemporaryDirectory('') as dir: + with open(os.path.join(dir, '.kunitconfig'), 'w') as f: + pass + tree = kunit_kernel.LinuxSourceTree('', kunitconfig_path=dir) + # TODO: add more test cases. -- cgit v1.2.3 From 5888a61cb4e00695075bbacfd86f3fa73af00413 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 1 Apr 2021 16:19:45 -0700 Subject: selftests: mptcp: launch mptcp_connect with timeout 'mptcp_connect' already has a timeout for poll() but in some cases, it is not enough. With "timeout" tool, we will force the command to fail if it doesn't finish on time. Thanks to that, the script will continue and display details about the current state before marking the test as failed. Displaying this state is very important to be able to understand the issue. Best to have our CI reporting the issue than just "the test hanged". Note that in mptcp_connect.sh, we were using a long timeout to validate the fact we cannot create a socket if a sysctl is set. We don't need this timeout. In diag.sh, we want to send signals to mptcp_connect instances that have been started in the netns. But we cannot send this signal to 'timeout' otherwise that will stop the timeout and messages telling us SIGUSR1 has been received will be printed. Instead of trying to find the right PID and storing them in an array, we can simply use the output of 'ip netns pids' which is all the PIDs we want to send signal to. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/160 Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/diag.sh | 55 ++++++++++++++-------- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 15 ++++-- tools/testing/selftests/net/mptcp/mptcp_join.sh | 22 ++++++--- tools/testing/selftests/net/mptcp/simult_flows.sh | 13 +++-- 4 files changed, 72 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 39edce4f541c..2674ba20d524 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -5,8 +5,9 @@ rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) ns="ns1-$rndh" ksft_skip=4 test_cnt=1 +timeout_poll=100 +timeout_test=$((timeout_poll * 2 + 1)) ret=0 -pids=() flush_pids() { @@ -14,18 +15,14 @@ flush_pids() # give it some time sleep 1.1 - for pid in ${pids[@]}; do - [ -d /proc/$pid ] && kill -SIGUSR1 $pid >/dev/null 2>&1 - done - pids=() + ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGUSR1 &>/dev/null } cleanup() { + ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGKILL &>/dev/null + ip netns del $ns - for pid in ${pids[@]}; do - [ -d /proc/$pid ] && kill -9 $pid >/dev/null 2>&1 - done } ip -Version > /dev/null 2>&1 @@ -79,39 +76,57 @@ trap cleanup EXIT ip netns add $ns ip -n $ns link set dev lo up -echo "a" | ip netns exec $ns ./mptcp_connect -p 10000 -l 0.0.0.0 -t 100 >/dev/null & +echo "a" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10000 -l -t ${timeout_poll} \ + 0.0.0.0 >/dev/null & sleep 0.1 -pids[0]=$! chk_msk_nr 0 "no msk on netns creation" -echo "b" | ip netns exec $ns ./mptcp_connect -p 10000 127.0.0.1 -j -t 100 >/dev/null & +echo "b" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10000 -j -t ${timeout_poll} \ + 127.0.0.1 >/dev/null & sleep 0.1 -pids[1]=$! chk_msk_nr 2 "after MPC handshake " chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" flush_pids -echo "a" | ip netns exec $ns ./mptcp_connect -p 10001 -s TCP -l 0.0.0.0 -t 100 >/dev/null & -pids[0]=$! +echo "a" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10001 -l -s TCP -t ${timeout_poll} \ + 0.0.0.0 >/dev/null & sleep 0.1 -echo "b" | ip netns exec $ns ./mptcp_connect -p 10001 127.0.0.1 -j -t 100 >/dev/null & -pids[1]=$! +echo "b" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10001 -j -t ${timeout_poll} \ + 127.0.0.1 >/dev/null & sleep 0.1 chk_msk_fallback_nr 1 "check fallback" flush_pids NR_CLIENTS=100 for I in `seq 1 $NR_CLIENTS`; do - echo "a" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) -l 0.0.0.0 -t 100 -w 10 >/dev/null & - pids[$((I*2))]=$! + echo "a" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p $((I+10001)) -l -w 10 \ + -t ${timeout_poll} 0.0.0.0 >/dev/null & done sleep 0.1 for I in `seq 1 $NR_CLIENTS`; do - echo "b" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) 127.0.0.1 -t 100 -w 10 >/dev/null & - pids[$((I*2 + 1))]=$! + echo "b" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p $((I+10001)) -w 10 \ + -t ${timeout_poll} 127.0.0.1 >/dev/null & done sleep 1.5 diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 10a030b53b23..65b3b983efc2 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -11,7 +11,8 @@ cin="" cout="" ksft_skip=4 capture=false -timeout=30 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) ipv6=true ethtool_random_on=true tc_delay="$((RANDOM%50))" @@ -273,7 +274,7 @@ check_mptcp_disabled() ip netns exec ${disabled_ns} sysctl -q net.mptcp.enabled=0 local err=0 - LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -t $timeout -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \ + LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \ grep -q "^socket: Protocol not available$" && err=1 ip netns delete ${disabled_ns} @@ -430,14 +431,20 @@ do_transfer() local stat_cookietx_last=$(get_mib_counter "${listener_ns}" "TcpExtSyncookiesSent") local stat_cookierx_last=$(get_mib_counter "${listener_ns}" "TcpExtSyncookiesRecv") - ip netns exec ${listener_ns} ./mptcp_connect -t $timeout -l -p $port -s ${srv_proto} $extra_args $local_addr < "$sin" > "$sout" & + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + ./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ + $extra_args $local_addr < "$sin" > "$sout" & local spid=$! wait_local_port_listen "${listener_ns}" "${port}" local start start=$(date +%s%3N) - ip netns exec ${connector_ns} ./mptcp_connect -t $timeout -p $port -s ${cl_proto} $extra_args $connect_addr < "$cin" > "$cout" & + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $extra_args $connect_addr < "$cin" > "$cout" & local cpid=$! wait $cpid diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index d2273b88e72c..56ffdad01526 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -8,7 +8,8 @@ cin="" cinsent="" cout="" ksft_skip=4 -timeout=30 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) mptcp_connect="" capture=0 do_all_tests=1 @@ -247,17 +248,26 @@ do_transfer() local_addr="0.0.0.0" fi - ip netns exec ${listener_ns} $mptcp_connect -t $timeout -l -p $port \ - -s ${srv_proto} ${local_addr} < "$sin" > "$sout" & + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + $mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ + ${local_addr} < "$sin" > "$sout" & spid=$! sleep 1 if [ "$test_link_fail" -eq 0 ];then - ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" & + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + $mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $connect_addr < "$cin" > "$cout" & else - ( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | tee "$cinsent" | \ - ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr > "$cout" & + ( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | \ + tee "$cinsent" | \ + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + $mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $connect_addr > "$cout" & fi cpid=$! diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index f039ee57eb3c..3aeef3bcb101 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -7,7 +7,8 @@ ns2="ns2-$rndh" ns3="ns3-$rndh" capture=false ksft_skip=4 -timeout=30 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) test_cnt=1 ret=0 bail=0 @@ -157,14 +158,20 @@ do_transfer() sleep 1 fi - ip netns exec ${ns3} ./mptcp_connect -jt $timeout -l -p $port 0.0.0.0 < "$sin" > "$sout" & + timeout ${timeout_test} \ + ip netns exec ${ns3} \ + ./mptcp_connect -jt ${timeout_poll} -l -p $port \ + 0.0.0.0 < "$sin" > "$sout" & local spid=$! wait_local_port_listen "${ns3}" "${port}" local start start=$(date +%s%3N) - ip netns exec ${ns1} ./mptcp_connect -jt $timeout -p $port 10.0.3.3 < "$cin" > "$cout" & + timeout ${timeout_test} \ + ip netns exec ${ns1} \ + ./mptcp_connect -jt ${timeout_poll} -p $port \ + 10.0.3.3 < "$cin" > "$cout" & local cpid=$! wait $cpid -- cgit v1.2.3 From 76e5e27ca98748242ba7c1fc24f06c09002eee45 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 1 Apr 2021 16:19:46 -0700 Subject: selftests: mptcp: init nstat history Not to be impacted by packets sent between sub-tests. Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 65b3b983efc2..385cdc98aed8 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -426,6 +426,13 @@ do_transfer() sleep 1 fi + NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ + nstat -n + if [ ${listener_ns} != ${connector_ns} ]; then + NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ + nstat -n + fi + local stat_synrx_last_l=$(get_mib_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") local stat_ackrx_last_l=$(get_mib_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") local stat_cookietx_last=$(get_mib_counter "${listener_ns}" "TcpExtSyncookiesSent") -- cgit v1.2.3 From c2a55e8fd80f1cd9a16a24a08f6df50fc20a65ed Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 1 Apr 2021 16:19:47 -0700 Subject: selftests: mptcp: dump more info on mpjoin errors Very occasionally, MPTCP selftests fail. Yeah, I saw that at least once! Here we provide more details in case of errors with mptcp_join.sh script like it was done with mptcp_connect.sh, see commit 767389c8dd55 ("selftests: mptcp: dump more info on errors") Suggested-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 56ffdad01526..abeb24b7f8ec 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -78,6 +78,7 @@ cleanup_partial() for netns in "$ns1" "$ns2"; do ip netns del $netns + rm -f /tmp/$netns.{nstat,out} done } @@ -233,6 +234,11 @@ do_transfer() sleep 1 fi + NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ + nstat -n + NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ + nstat -n + if [ $speed = "fast" ]; then mptcp_connect="./mptcp_connect -j" elif [ $speed = "slow" ]; then @@ -383,12 +389,19 @@ do_transfer() kill $cappid fi + NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ + nstat | grep Tcp > /tmp/${listener_ns}.out + NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ + nstat | grep Tcp > /tmp/${connector_ns}.out + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then echo " client exit code $retc, server $rets" 1>&2 echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2 - ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port" + ip netns exec ${listener_ns} ss -Menita 1>&2 -o "sport = :$port" + cat /tmp/${listener_ns}.out echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2 - ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port" + ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port" + cat /tmp/${connector_ns}.out cat "$capout" ret=1 -- cgit v1.2.3 From 007bdc12d4b466566784c1a39b9fb3985f875b3d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 1 Apr 2021 15:25:56 -0700 Subject: bpf, selftests: test_maps generating unrecognized data section With a relatively recent clang master branch test_map skips a section, libbpf: elf: skipping unrecognized data section(5) .rodata.str1.1 the cause is some pointless strings from bpf_printks in the BPF program loaded during testing. After just removing the prints to fix above error Daniel points out the program is a bit pointless and could be simply the empty program returning SK_PASS. Here we do just that and return simply SK_PASS. This program is used with test_maps selftests to test insert/remove of a program into the sockmap and sockhash maps. Its not testing actual functionality of the TCP sockmap programs, these are tested from test_sockmap. So we shouldn't lose in test coverage and fix above warnings. This original test was added before test_sockmap existed and has been copied around ever since, clean it up now. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/161731595664.74613.1603087410166945302.stgit@john-XPS-13-9370 --- tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c index fdb4bf4408fa..eeaf6e75c9a2 100644 --- a/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c +++ b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c @@ -8,18 +8,6 @@ int _version SEC("version") = 1; SEC("sk_msg1") int bpf_prog1(struct sk_msg_md *msg) { - void *data_end = (void *)(long) msg->data_end; - void *data = (void *)(long) msg->data; - - char *d; - - if (data + 8 > data_end) - return SK_DROP; - - bpf_printk("data length %i\n", (__u64)msg->data_end - (__u64)msg->data); - d = (char *)data; - bpf_printk("hello sendmsg hook %i %i\n", d[0], d[1]); - return SK_PASS; } -- cgit v1.2.3 From f07669df4c8df0b7134ae94be20a8b61bd157168 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 2 Apr 2021 09:26:34 +0800 Subject: libbpf: Remove redundant semi-colon Remove redundant semi-colon in finalize_btf_ext(). Signed-off-by: Yang Yingliang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210402012634.1965453-1-yangyingliang@huawei.com --- tools/lib/bpf/linker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 46b16cbdcda3..4e08bc07e635 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -1895,7 +1895,7 @@ static int finalize_btf_ext(struct bpf_linker *linker) hdr->func_info_len = funcs_sz; hdr->line_info_off = funcs_sz; hdr->line_info_len = lines_sz; - hdr->core_relo_off = funcs_sz + lines_sz;; + hdr->core_relo_off = funcs_sz + lines_sz; hdr->core_relo_len = core_relos_sz; if (funcs_sz) { -- cgit v1.2.3 From f73ea1eb4cce66376cc1dd94b4a083ffb9eeb123 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 2 Apr 2021 17:29:21 -0700 Subject: bpf: selftests: Specify CONFIG_DYNAMIC_FTRACE in the testing config The tracing test and the recent kfunc call test require CONFIG_DYNAMIC_FTRACE. This patch adds it to the config file. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210403002921.3419721-1-kafai@fb.com --- tools/testing/selftests/bpf/config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 37e1f303fc11..5192305159ec 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -44,3 +44,5 @@ CONFIG_SECURITYFS=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_READ_POLICY=y CONFIG_BLK_DEV_LOOP=y +CONFIG_FUNCTION_TRACER=y +CONFIG_DYNAMIC_FTRACE=y -- cgit v1.2.3 From 1e1032b0c4afaed7739a6681ff6b4cb120b82994 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Mon, 5 Apr 2021 12:01:19 +0800 Subject: libbpf: Fix KERNEL_VERSION macro Add missing ')' for KERNEL_VERSION macro. Signed-off-by: Hengqi Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210405040119.802188-1-hengqi.chen@gmail.com --- tools/lib/bpf/bpf_helpers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index cc2e51c64a54..b904128626c2 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -51,7 +51,7 @@ #endif #ifndef KERNEL_VERSION -#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c)) +#define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c))) #endif /* -- cgit v1.2.3 From 8ca52cc38dc8fdcbdbd0c23eafb19db5e5f5c8d0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Mar 2021 20:23:03 +1300 Subject: x86/sgx: Expose SGX architectural definitions to the kernel Expose SGX architectural structures, as KVM will use many of the architectural constants and structs to virtualize SGX. Name the new header file as asm/sgx.h, rather than asm/sgx_arch.h, to have single header to provide SGX facilities to share with other kernel componments. Also update MAINTAINERS to include asm/sgx.h. Signed-off-by: Sean Christopherson Co-developed-by: Kai Huang Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Jarkko Sakkinen Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/6bf47acd91ab4d709e66ad1692c7803e4c9063a0.1616136308.git.kai.huang@intel.com --- MAINTAINERS | 1 + arch/x86/include/asm/sgx.h | 350 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/sgx/arch.h | 340 --------------------------------- arch/x86/kernel/cpu/sgx/encl.c | 2 +- arch/x86/kernel/cpu/sgx/sgx.h | 2 +- tools/testing/selftests/sgx/defines.h | 2 +- 6 files changed, 354 insertions(+), 343 deletions(-) create mode 100644 arch/x86/include/asm/sgx.h delete mode 100644 arch/x86/kernel/cpu/sgx/arch.h (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index aa84121c5611..0cb606aeba5e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9274,6 +9274,7 @@ Q: https://patchwork.kernel.org/project/intel-sgx/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/sgx F: Documentation/x86/sgx.rst F: arch/x86/entry/vdso/vsgx.S +F: arch/x86/include/asm/sgx.h F: arch/x86/include/uapi/asm/sgx.h F: arch/x86/kernel/cpu/sgx/* F: tools/testing/selftests/sgx/* diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h new file mode 100644 index 000000000000..14bb5f7e221c --- /dev/null +++ b/arch/x86/include/asm/sgx.h @@ -0,0 +1,350 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * Copyright(c) 2016-20 Intel Corporation. + * + * Intel Software Guard Extensions (SGX) support. + */ +#ifndef _ASM_X86_SGX_H +#define _ASM_X86_SGX_H + +#include +#include + +/* + * This file contains both data structures defined by SGX architecture and Linux + * defined software data structures and functions. The two should not be mixed + * together for better readibility. The architectural definitions come first. + */ + +/* The SGX specific CPUID function. */ +#define SGX_CPUID 0x12 +/* EPC enumeration. */ +#define SGX_CPUID_EPC 2 +/* An invalid EPC section, i.e. the end marker. */ +#define SGX_CPUID_EPC_INVALID 0x0 +/* A valid EPC section. */ +#define SGX_CPUID_EPC_SECTION 0x1 +/* The bitmask for the EPC section type. */ +#define SGX_CPUID_EPC_MASK GENMASK(3, 0) + +/** + * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV + * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not + * been completed yet. + * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. + * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's + * public key does not match IA32_SGXLEPUBKEYHASH. + * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received + */ +enum sgx_return_code { + SGX_NOT_TRACKED = 11, + SGX_CHILD_PRESENT = 13, + SGX_INVALID_EINITTOKEN = 16, + SGX_UNMASKED_EVENT = 128, +}; + +/* The modulus size for 3072-bit RSA keys. */ +#define SGX_MODULUS_SIZE 384 + +/** + * enum sgx_miscselect - additional information to an SSA frame + * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame. + * + * Save State Area (SSA) is a stack inside the enclave used to store processor + * state when an exception or interrupt occurs. This enum defines additional + * information stored to an SSA frame. + */ +enum sgx_miscselect { + SGX_MISC_EXINFO = BIT(0), +}; + +#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1) + +#define SGX_SSA_GPRS_SIZE 184 +#define SGX_SSA_MISC_EXINFO_SIZE 16 + +/** + * enum sgx_attributes - the attributes field in &struct sgx_secs + * %SGX_ATTR_INIT: Enclave can be entered (is initialized). + * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR). + * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave. + * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote + * attestation. + * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS). + * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to + * sign cryptographic tokens that can be passed to + * EINIT as an authorization to run an enclave. + */ +enum sgx_attribute { + SGX_ATTR_INIT = BIT(0), + SGX_ATTR_DEBUG = BIT(1), + SGX_ATTR_MODE64BIT = BIT(2), + SGX_ATTR_PROVISIONKEY = BIT(4), + SGX_ATTR_EINITTOKENKEY = BIT(5), + SGX_ATTR_KSS = BIT(7), +}; + +#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8)) + +/** + * struct sgx_secs - SGX Enclave Control Structure (SECS) + * @size: size of the address space + * @base: base address of the address space + * @ssa_frame_size: size of an SSA frame + * @miscselect: additional information stored to an SSA frame + * @attributes: attributes for enclave + * @xfrm: XSave-Feature Request Mask (subset of XCR0) + * @mrenclave: SHA256-hash of the enclave contents + * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT + * @config_id: a user-defined value that is used in key derivation + * @isv_prod_id: a user-defined value that is used in key derivation + * @isv_svn: a user-defined value that is used in key derivation + * @config_svn: a user-defined value that is used in key derivation + * + * SGX Enclave Control Structure (SECS) is a special enclave page that is not + * visible in the address space. In fact, this structure defines the address + * range and other global attributes for the enclave and it is the first EPC + * page created for any enclave. It is moved from a temporary buffer to an EPC + * by the means of ENCLS[ECREATE] function. + */ +struct sgx_secs { + u64 size; + u64 base; + u32 ssa_frame_size; + u32 miscselect; + u8 reserved1[24]; + u64 attributes; + u64 xfrm; + u32 mrenclave[8]; + u8 reserved2[32]; + u32 mrsigner[8]; + u8 reserved3[32]; + u32 config_id[16]; + u16 isv_prod_id; + u16 isv_svn; + u16 config_svn; + u8 reserved4[3834]; +} __packed; + +/** + * enum sgx_tcs_flags - execution flags for TCS + * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints + * inside an enclave. It is cleared by EADD but can + * be set later with EDBGWR. + */ +enum sgx_tcs_flags { + SGX_TCS_DBGOPTIN = 0x01, +}; + +#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1) +#define SGX_TCS_RESERVED_SIZE 4024 + +/** + * struct sgx_tcs - Thread Control Structure (TCS) + * @state: used to mark an entered TCS + * @flags: execution flags (cleared by EADD) + * @ssa_offset: SSA stack offset relative to the enclave base + * @ssa_index: the current SSA frame index (cleard by EADD) + * @nr_ssa_frames: the number of frame in the SSA stack + * @entry_offset: entry point offset relative to the enclave base + * @exit_addr: address outside the enclave to exit on an exception or + * interrupt + * @fs_offset: offset relative to the enclave base to become FS + * segment inside the enclave + * @gs_offset: offset relative to the enclave base to become GS + * segment inside the enclave + * @fs_limit: size to become a new FS-limit (only 32-bit enclaves) + * @gs_limit: size to become a new GS-limit (only 32-bit enclaves) + * + * Thread Control Structure (TCS) is an enclave page visible in its address + * space that defines an entry point inside the enclave. A thread enters inside + * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered + * by only one thread at a time. + */ +struct sgx_tcs { + u64 state; + u64 flags; + u64 ssa_offset; + u32 ssa_index; + u32 nr_ssa_frames; + u64 entry_offset; + u64 exit_addr; + u64 fs_offset; + u64 gs_offset; + u32 fs_limit; + u32 gs_limit; + u8 reserved[SGX_TCS_RESERVED_SIZE]; +} __packed; + +/** + * struct sgx_pageinfo - an enclave page descriptor + * @addr: address of the enclave page + * @contents: pointer to the page contents + * @metadata: pointer either to a SECINFO or PCMD instance + * @secs: address of the SECS page + */ +struct sgx_pageinfo { + u64 addr; + u64 contents; + u64 metadata; + u64 secs; +} __packed __aligned(32); + + +/** + * enum sgx_page_type - bits in the SECINFO flags defining the page type + * %SGX_PAGE_TYPE_SECS: a SECS page + * %SGX_PAGE_TYPE_TCS: a TCS page + * %SGX_PAGE_TYPE_REG: a regular page + * %SGX_PAGE_TYPE_VA: a VA page + * %SGX_PAGE_TYPE_TRIM: a page in trimmed state + */ +enum sgx_page_type { + SGX_PAGE_TYPE_SECS, + SGX_PAGE_TYPE_TCS, + SGX_PAGE_TYPE_REG, + SGX_PAGE_TYPE_VA, + SGX_PAGE_TYPE_TRIM, +}; + +#define SGX_NR_PAGE_TYPES 5 +#define SGX_PAGE_TYPE_MASK GENMASK(7, 0) + +/** + * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo + * %SGX_SECINFO_R: allow read + * %SGX_SECINFO_W: allow write + * %SGX_SECINFO_X: allow execution + * %SGX_SECINFO_SECS: a SECS page + * %SGX_SECINFO_TCS: a TCS page + * %SGX_SECINFO_REG: a regular page + * %SGX_SECINFO_VA: a VA page + * %SGX_SECINFO_TRIM: a page in trimmed state + */ +enum sgx_secinfo_flags { + SGX_SECINFO_R = BIT(0), + SGX_SECINFO_W = BIT(1), + SGX_SECINFO_X = BIT(2), + SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8), + SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8), + SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8), + SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8), + SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8), +}; + +#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0) +#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8) +#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \ + SGX_SECINFO_PAGE_TYPE_MASK) + +/** + * struct sgx_secinfo - describes attributes of an EPC page + * @flags: permissions and type + * + * Used together with ENCLS leaves that add or modify an EPC page to an + * enclave to define page permissions and type. + */ +struct sgx_secinfo { + u64 flags; + u8 reserved[56]; +} __packed __aligned(64); + +#define SGX_PCMD_RESERVED_SIZE 40 + +/** + * struct sgx_pcmd - Paging Crypto Metadata (PCMD) + * @enclave_id: enclave identifier + * @mac: MAC over PCMD, page contents and isvsvn + * + * PCMD is stored for every swapped page to the regular memory. When ELDU loads + * the page back it recalculates the MAC by using a isvsvn number stored in a + * VA page. Together these two structures bring integrity and rollback + * protection. + */ +struct sgx_pcmd { + struct sgx_secinfo secinfo; + u64 enclave_id; + u8 reserved[SGX_PCMD_RESERVED_SIZE]; + u8 mac[16]; +} __packed __aligned(128); + +#define SGX_SIGSTRUCT_RESERVED1_SIZE 84 +#define SGX_SIGSTRUCT_RESERVED2_SIZE 20 +#define SGX_SIGSTRUCT_RESERVED3_SIZE 32 +#define SGX_SIGSTRUCT_RESERVED4_SIZE 12 + +/** + * struct sgx_sigstruct_header - defines author of the enclave + * @header1: constant byte string + * @vendor: must be either 0x0000 or 0x8086 + * @date: YYYYMMDD in BCD + * @header2: costant byte string + * @swdefined: software defined value + */ +struct sgx_sigstruct_header { + u64 header1[2]; + u32 vendor; + u32 date; + u64 header2[2]; + u32 swdefined; + u8 reserved1[84]; +} __packed; + +/** + * struct sgx_sigstruct_body - defines contents of the enclave + * @miscselect: additional information stored to an SSA frame + * @misc_mask: required miscselect in SECS + * @attributes: attributes for enclave + * @xfrm: XSave-Feature Request Mask (subset of XCR0) + * @attributes_mask: required attributes in SECS + * @xfrm_mask: required XFRM in SECS + * @mrenclave: SHA256-hash of the enclave contents + * @isvprodid: a user-defined value that is used in key derivation + * @isvsvn: a user-defined value that is used in key derivation + */ +struct sgx_sigstruct_body { + u32 miscselect; + u32 misc_mask; + u8 reserved2[20]; + u64 attributes; + u64 xfrm; + u64 attributes_mask; + u64 xfrm_mask; + u8 mrenclave[32]; + u8 reserved3[32]; + u16 isvprodid; + u16 isvsvn; +} __packed; + +/** + * struct sgx_sigstruct - an enclave signature + * @header: defines author of the enclave + * @modulus: the modulus of the public key + * @exponent: the exponent of the public key + * @signature: the signature calculated over the fields except modulus, + * @body: defines contents of the enclave + * @q1: a value used in RSA signature verification + * @q2: a value used in RSA signature verification + * + * Header and body are the parts that are actual signed. The remaining fields + * define the signature of the enclave. + */ +struct sgx_sigstruct { + struct sgx_sigstruct_header header; + u8 modulus[SGX_MODULUS_SIZE]; + u32 exponent; + u8 signature[SGX_MODULUS_SIZE]; + struct sgx_sigstruct_body body; + u8 reserved4[12]; + u8 q1[SGX_MODULUS_SIZE]; + u8 q2[SGX_MODULUS_SIZE]; +} __packed; + +#define SGX_LAUNCH_TOKEN_SIZE 304 + +/* + * Do not put any hardware-defined SGX structure representations below this + * comment! + */ + +#endif /* _ASM_X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/kernel/cpu/sgx/arch.h deleted file mode 100644 index abf99bb71fdc..000000000000 --- a/arch/x86/kernel/cpu/sgx/arch.h +++ /dev/null @@ -1,340 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/** - * Copyright(c) 2016-20 Intel Corporation. - * - * Contains data structures defined by the SGX architecture. Data structures - * defined by the Linux software stack should not be placed here. - */ -#ifndef _ASM_X86_SGX_ARCH_H -#define _ASM_X86_SGX_ARCH_H - -#include -#include - -/* The SGX specific CPUID function. */ -#define SGX_CPUID 0x12 -/* EPC enumeration. */ -#define SGX_CPUID_EPC 2 -/* An invalid EPC section, i.e. the end marker. */ -#define SGX_CPUID_EPC_INVALID 0x0 -/* A valid EPC section. */ -#define SGX_CPUID_EPC_SECTION 0x1 -/* The bitmask for the EPC section type. */ -#define SGX_CPUID_EPC_MASK GENMASK(3, 0) - -/** - * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV - * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not - * been completed yet. - * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. - * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's - * public key does not match IA32_SGXLEPUBKEYHASH. - * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received - */ -enum sgx_return_code { - SGX_NOT_TRACKED = 11, - SGX_CHILD_PRESENT = 13, - SGX_INVALID_EINITTOKEN = 16, - SGX_UNMASKED_EVENT = 128, -}; - -/* The modulus size for 3072-bit RSA keys. */ -#define SGX_MODULUS_SIZE 384 - -/** - * enum sgx_miscselect - additional information to an SSA frame - * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame. - * - * Save State Area (SSA) is a stack inside the enclave used to store processor - * state when an exception or interrupt occurs. This enum defines additional - * information stored to an SSA frame. - */ -enum sgx_miscselect { - SGX_MISC_EXINFO = BIT(0), -}; - -#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1) - -#define SGX_SSA_GPRS_SIZE 184 -#define SGX_SSA_MISC_EXINFO_SIZE 16 - -/** - * enum sgx_attributes - the attributes field in &struct sgx_secs - * %SGX_ATTR_INIT: Enclave can be entered (is initialized). - * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR). - * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave. - * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote - * attestation. - * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS). - * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to - * sign cryptographic tokens that can be passed to - * EINIT as an authorization to run an enclave. - */ -enum sgx_attribute { - SGX_ATTR_INIT = BIT(0), - SGX_ATTR_DEBUG = BIT(1), - SGX_ATTR_MODE64BIT = BIT(2), - SGX_ATTR_PROVISIONKEY = BIT(4), - SGX_ATTR_EINITTOKENKEY = BIT(5), - SGX_ATTR_KSS = BIT(7), -}; - -#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8)) - -/** - * struct sgx_secs - SGX Enclave Control Structure (SECS) - * @size: size of the address space - * @base: base address of the address space - * @ssa_frame_size: size of an SSA frame - * @miscselect: additional information stored to an SSA frame - * @attributes: attributes for enclave - * @xfrm: XSave-Feature Request Mask (subset of XCR0) - * @mrenclave: SHA256-hash of the enclave contents - * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT - * @config_id: a user-defined value that is used in key derivation - * @isv_prod_id: a user-defined value that is used in key derivation - * @isv_svn: a user-defined value that is used in key derivation - * @config_svn: a user-defined value that is used in key derivation - * - * SGX Enclave Control Structure (SECS) is a special enclave page that is not - * visible in the address space. In fact, this structure defines the address - * range and other global attributes for the enclave and it is the first EPC - * page created for any enclave. It is moved from a temporary buffer to an EPC - * by the means of ENCLS[ECREATE] function. - */ -struct sgx_secs { - u64 size; - u64 base; - u32 ssa_frame_size; - u32 miscselect; - u8 reserved1[24]; - u64 attributes; - u64 xfrm; - u32 mrenclave[8]; - u8 reserved2[32]; - u32 mrsigner[8]; - u8 reserved3[32]; - u32 config_id[16]; - u16 isv_prod_id; - u16 isv_svn; - u16 config_svn; - u8 reserved4[3834]; -} __packed; - -/** - * enum sgx_tcs_flags - execution flags for TCS - * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints - * inside an enclave. It is cleared by EADD but can - * be set later with EDBGWR. - */ -enum sgx_tcs_flags { - SGX_TCS_DBGOPTIN = 0x01, -}; - -#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1) -#define SGX_TCS_RESERVED_SIZE 4024 - -/** - * struct sgx_tcs - Thread Control Structure (TCS) - * @state: used to mark an entered TCS - * @flags: execution flags (cleared by EADD) - * @ssa_offset: SSA stack offset relative to the enclave base - * @ssa_index: the current SSA frame index (cleard by EADD) - * @nr_ssa_frames: the number of frame in the SSA stack - * @entry_offset: entry point offset relative to the enclave base - * @exit_addr: address outside the enclave to exit on an exception or - * interrupt - * @fs_offset: offset relative to the enclave base to become FS - * segment inside the enclave - * @gs_offset: offset relative to the enclave base to become GS - * segment inside the enclave - * @fs_limit: size to become a new FS-limit (only 32-bit enclaves) - * @gs_limit: size to become a new GS-limit (only 32-bit enclaves) - * - * Thread Control Structure (TCS) is an enclave page visible in its address - * space that defines an entry point inside the enclave. A thread enters inside - * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered - * by only one thread at a time. - */ -struct sgx_tcs { - u64 state; - u64 flags; - u64 ssa_offset; - u32 ssa_index; - u32 nr_ssa_frames; - u64 entry_offset; - u64 exit_addr; - u64 fs_offset; - u64 gs_offset; - u32 fs_limit; - u32 gs_limit; - u8 reserved[SGX_TCS_RESERVED_SIZE]; -} __packed; - -/** - * struct sgx_pageinfo - an enclave page descriptor - * @addr: address of the enclave page - * @contents: pointer to the page contents - * @metadata: pointer either to a SECINFO or PCMD instance - * @secs: address of the SECS page - */ -struct sgx_pageinfo { - u64 addr; - u64 contents; - u64 metadata; - u64 secs; -} __packed __aligned(32); - - -/** - * enum sgx_page_type - bits in the SECINFO flags defining the page type - * %SGX_PAGE_TYPE_SECS: a SECS page - * %SGX_PAGE_TYPE_TCS: a TCS page - * %SGX_PAGE_TYPE_REG: a regular page - * %SGX_PAGE_TYPE_VA: a VA page - * %SGX_PAGE_TYPE_TRIM: a page in trimmed state - */ -enum sgx_page_type { - SGX_PAGE_TYPE_SECS, - SGX_PAGE_TYPE_TCS, - SGX_PAGE_TYPE_REG, - SGX_PAGE_TYPE_VA, - SGX_PAGE_TYPE_TRIM, -}; - -#define SGX_NR_PAGE_TYPES 5 -#define SGX_PAGE_TYPE_MASK GENMASK(7, 0) - -/** - * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo - * %SGX_SECINFO_R: allow read - * %SGX_SECINFO_W: allow write - * %SGX_SECINFO_X: allow execution - * %SGX_SECINFO_SECS: a SECS page - * %SGX_SECINFO_TCS: a TCS page - * %SGX_SECINFO_REG: a regular page - * %SGX_SECINFO_VA: a VA page - * %SGX_SECINFO_TRIM: a page in trimmed state - */ -enum sgx_secinfo_flags { - SGX_SECINFO_R = BIT(0), - SGX_SECINFO_W = BIT(1), - SGX_SECINFO_X = BIT(2), - SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8), - SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8), - SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8), - SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8), - SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8), -}; - -#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0) -#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8) -#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \ - SGX_SECINFO_PAGE_TYPE_MASK) - -/** - * struct sgx_secinfo - describes attributes of an EPC page - * @flags: permissions and type - * - * Used together with ENCLS leaves that add or modify an EPC page to an - * enclave to define page permissions and type. - */ -struct sgx_secinfo { - u64 flags; - u8 reserved[56]; -} __packed __aligned(64); - -#define SGX_PCMD_RESERVED_SIZE 40 - -/** - * struct sgx_pcmd - Paging Crypto Metadata (PCMD) - * @enclave_id: enclave identifier - * @mac: MAC over PCMD, page contents and isvsvn - * - * PCMD is stored for every swapped page to the regular memory. When ELDU loads - * the page back it recalculates the MAC by using a isvsvn number stored in a - * VA page. Together these two structures bring integrity and rollback - * protection. - */ -struct sgx_pcmd { - struct sgx_secinfo secinfo; - u64 enclave_id; - u8 reserved[SGX_PCMD_RESERVED_SIZE]; - u8 mac[16]; -} __packed __aligned(128); - -#define SGX_SIGSTRUCT_RESERVED1_SIZE 84 -#define SGX_SIGSTRUCT_RESERVED2_SIZE 20 -#define SGX_SIGSTRUCT_RESERVED3_SIZE 32 -#define SGX_SIGSTRUCT_RESERVED4_SIZE 12 - -/** - * struct sgx_sigstruct_header - defines author of the enclave - * @header1: constant byte string - * @vendor: must be either 0x0000 or 0x8086 - * @date: YYYYMMDD in BCD - * @header2: costant byte string - * @swdefined: software defined value - */ -struct sgx_sigstruct_header { - u64 header1[2]; - u32 vendor; - u32 date; - u64 header2[2]; - u32 swdefined; - u8 reserved1[84]; -} __packed; - -/** - * struct sgx_sigstruct_body - defines contents of the enclave - * @miscselect: additional information stored to an SSA frame - * @misc_mask: required miscselect in SECS - * @attributes: attributes for enclave - * @xfrm: XSave-Feature Request Mask (subset of XCR0) - * @attributes_mask: required attributes in SECS - * @xfrm_mask: required XFRM in SECS - * @mrenclave: SHA256-hash of the enclave contents - * @isvprodid: a user-defined value that is used in key derivation - * @isvsvn: a user-defined value that is used in key derivation - */ -struct sgx_sigstruct_body { - u32 miscselect; - u32 misc_mask; - u8 reserved2[20]; - u64 attributes; - u64 xfrm; - u64 attributes_mask; - u64 xfrm_mask; - u8 mrenclave[32]; - u8 reserved3[32]; - u16 isvprodid; - u16 isvsvn; -} __packed; - -/** - * struct sgx_sigstruct - an enclave signature - * @header: defines author of the enclave - * @modulus: the modulus of the public key - * @exponent: the exponent of the public key - * @signature: the signature calculated over the fields except modulus, - * @body: defines contents of the enclave - * @q1: a value used in RSA signature verification - * @q2: a value used in RSA signature verification - * - * Header and body are the parts that are actual signed. The remaining fields - * define the signature of the enclave. - */ -struct sgx_sigstruct { - struct sgx_sigstruct_header header; - u8 modulus[SGX_MODULUS_SIZE]; - u32 exponent; - u8 signature[SGX_MODULUS_SIZE]; - struct sgx_sigstruct_body body; - u8 reserved4[12]; - u8 q1[SGX_MODULUS_SIZE]; - u8 q2[SGX_MODULUS_SIZE]; -} __packed; - -#define SGX_LAUNCH_TOKEN_SIZE 304 - -#endif /* _ASM_X86_SGX_ARCH_H */ diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index d25f2a245e1d..3be203297988 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -7,7 +7,7 @@ #include #include #include -#include "arch.h" +#include #include "encl.h" #include "encls.h" #include "sgx.h" diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 4854f3980edd..e4cbc71bf136 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -8,7 +8,7 @@ #include #include #include -#include "arch.h" +#include #undef pr_fmt #define pr_fmt(fmt) "sgx: " fmt diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h index 592c1ccf4576..0bd73428d2f3 100644 --- a/tools/testing/selftests/sgx/defines.h +++ b/tools/testing/selftests/sgx/defines.h @@ -14,7 +14,7 @@ #define __aligned(x) __attribute__((__aligned__(x))) #define __packed __attribute__((packed)) -#include "../../../../arch/x86/kernel/cpu/sgx/arch.h" +#include "../../../../arch/x86/include/asm/sgx.h" #include "../../../../arch/x86/include/asm/enclu.h" #include "../../../../arch/x86/include/uapi/asm/sgx.h" -- cgit v1.2.3 From 2f3eb922cd5b72a58e5b7c8fe036f4be60be9397 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 6 Apr 2021 14:45:04 +0200 Subject: doc: update rcu_dereference.rst reference Changeset b00aedf978aa ("doc: Convert to rcu_dereference.txt to rcu_dereference.rst") renamed: Documentation/RCU/rcu_dereference.txt to: Documentation/RCU/rcu_dereference.rst. Update its cross-reference accordingly. Fixes: b00aedf978aa ("doc: Convert to rcu_dereference.txt to rcu_dereference.rst") Signed-off-by: Mauro Carvalho Chehab --- tools/memory-model/Documentation/glossary.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/memory-model/Documentation/glossary.txt b/tools/memory-model/Documentation/glossary.txt index b2da6365be63..6f3d16dbf467 100644 --- a/tools/memory-model/Documentation/glossary.txt +++ b/tools/memory-model/Documentation/glossary.txt @@ -19,7 +19,7 @@ Address Dependency: When the address of a later memory access is computed from the value returned by the rcu_dereference() on line 2, the address dependency extends from that rcu_dereference() to that "p->a". In rare cases, optimizing compilers can destroy address - dependencies. Please see Documentation/RCU/rcu_dereference.txt + dependencies. Please see Documentation/RCU/rcu_dereference.rst for more information. See also "Control Dependency" and "Data Dependency". -- cgit v1.2.3 From dc0e058eef42f61effe9fd4f0fa4b0c793cc1f14 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 5 Apr 2021 18:39:41 +0200 Subject: KVM: selftests: aarch64/vgic-v3 init sequence tests The tests exercise the VGIC_V3 device creation including the associated KVM_DEV_ARM_VGIC_GRP_ADDR group attributes: - KVM_VGIC_V3_ADDR_TYPE_DIST/REDIST - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION Some other tests dedicate to KVM_DEV_ARM_VGIC_GRP_REDIST_REGS group and especially the GICR_TYPER read. The goal was to test the case recently fixed by commit 23bde34771f1 ("KVM: arm64: vgic-v3: Drop the reporting of GICR_TYPER.Last for userspace"). The API under test can be found at Documentation/virt/kvm/devices/arm-vgic-v3.rst Signed-off-by: Eric Auger Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210405163941.510258-10-eric.auger@redhat.com --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/aarch64/vgic_init.c | 584 ++++++++++++++++++++++++ tools/testing/selftests/kvm/include/kvm_util.h | 9 + tools/testing/selftests/kvm/lib/kvm_util.c | 77 ++++ 5 files changed, 672 insertions(+) create mode 100644 tools/testing/selftests/kvm/aarch64/vgic_init.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 32b87cc77c8e..e65d5572aefc 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only /aarch64/get-reg-list /aarch64/get-reg-list-sve +/aarch64/vgic_init /s390x/memop /s390x/resets /s390x/sync_regs_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a6d61f451f88..4e548d7ab0ab 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -75,6 +75,7 @@ TEST_GEN_PROGS_x86_64 += steal_time TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve +TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += dirty_log_perf_test diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c new file mode 100644 index 000000000000..03f62fc96f1d --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -0,0 +1,584 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vgic init sequence tests + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#define _GNU_SOURCE +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define NR_VCPUS 4 + +#define REDIST_REGION_ATTR_ADDR(count, base, flags, index) (((uint64_t)(count) << 52) | \ + ((uint64_t)((base) >> 16) << 16) | ((uint64_t)(flags) << 12) | index) +#define REG_OFFSET(vcpu, offset) (((uint64_t)vcpu << 32) | offset) + +#define GICR_TYPER 0x8 + +struct vm_gic { + struct kvm_vm *vm; + int gic_fd; +}; + +int max_ipa_bits; + +/* helper to access a redistributor register */ +static int access_redist_reg(int gicv3_fd, int vcpu, int offset, + uint32_t *val, bool write) +{ + uint64_t attr = REG_OFFSET(vcpu, offset); + + return _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, + attr, val, write); +} + +/* dummy guest code */ +static void guest_code(void) +{ + GUEST_SYNC(0); + GUEST_SYNC(1); + GUEST_SYNC(2); + GUEST_DONE(); +} + +/* we don't want to assert on run execution, hence that helper */ +static int run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +{ + int ret; + + vcpu_args_set(vm, vcpuid, 1); + ret = _vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); + get_ucall(vm, vcpuid, NULL); + + if (ret) + return -errno; + return 0; +} + +static struct vm_gic vm_gic_create(void) +{ + struct vm_gic v; + + v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL); + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + TEST_ASSERT(v.gic_fd > 0, "GICv3 device created"); + + return v; +} + +static void vm_gic_destroy(struct vm_gic *v) +{ + close(v->gic_fd); + kvm_vm_free(v->vm); +} + +/** + * Helper routine that performs KVM device tests in general and + * especially ARM_VGIC_V3 ones. Eventually the ARM_VGIC_V3 + * device gets created, a legacy RDIST region is set at @0x0 + * and a DIST region is set @0x60000 + */ +static void subtest_dist_rdist(struct vm_gic *v) +{ + int ret; + uint64_t addr; + + /* Check existing group/attributes */ + ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST); + TEST_ASSERT(!ret, "KVM_DEV_ARM_VGIC_GRP_ADDR/KVM_VGIC_V3_ADDR_TYPE_DIST supported"); + + ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST); + TEST_ASSERT(!ret, "KVM_DEV_ARM_VGIC_GRP_ADDR/KVM_VGIC_V3_ADDR_TYPE_REDIST supported"); + + /* check non existing attribute */ + ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 0); + TEST_ASSERT(ret == -ENXIO, "attribute not supported"); + + /* misaligned DIST and REDIST address settings */ + addr = 0x1000; + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); + TEST_ASSERT(ret == -EINVAL, "GICv3 dist base not 64kB aligned"); + + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); + TEST_ASSERT(ret == -EINVAL, "GICv3 redist base not 64kB aligned"); + + /* out of range address */ + if (max_ipa_bits) { + addr = 1ULL << max_ipa_bits; + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); + TEST_ASSERT(ret == -E2BIG, "dist address beyond IPA limit"); + + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); + TEST_ASSERT(ret == -E2BIG, "redist address beyond IPA limit"); + } + + /* set REDIST base address @0x0*/ + addr = 0x00000; + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); + TEST_ASSERT(!ret, "GICv3 redist base set"); + + /* Attempt to create a second legacy redistributor region */ + addr = 0xE0000; + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); + TEST_ASSERT(ret == -EEXIST, "GICv3 redist base set again"); + + /* Attempt to mix legacy and new redistributor regions */ + addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 0, 0); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(ret == -EINVAL, "attempt to mix GICv3 REDIST and REDIST_REGION"); + + /* + * Set overlapping DIST / REDIST, cannot be detected here. Will be detected + * on first vcpu run instead. + */ + addr = 3 * 2 * 0x10000; + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_DIST, + &addr, true); + TEST_ASSERT(!ret, "dist overlapping rdist"); +} + +/* Test the new REDIST region API */ +static void subtest_redist_regions(struct vm_gic *v) +{ + uint64_t addr, expected_addr; + int ret; + + ret = kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST); + TEST_ASSERT(!ret, "Multiple redist regions advertised"); + + addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 2, 0); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(ret == -EINVAL, "redist region attr value with flags != 0"); + + addr = REDIST_REGION_ATTR_ADDR(0, 0x100000, 0, 0); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(ret == -EINVAL, "redist region attr value with count== 0"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(ret == -EINVAL, "attempt to register the first rdist region with index != 0"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x201000, 0, 1); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(ret == -EINVAL, "rdist region with misaligned address"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(!ret, "First valid redist region with 2 rdist @ 0x200000, index 0"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(ret == -EINVAL, "register an rdist region with already used index"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x210000, 0, 2); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(ret == -EINVAL, "register an rdist region overlapping with another one"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 2); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(ret == -EINVAL, "register redist region with index not +1"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(!ret, "register valid redist region with 1 rdist @ 0x220000, index 1"); + + addr = REDIST_REGION_ATTR_ADDR(1, 1ULL << max_ipa_bits, 0, 2); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(ret == -E2BIG, "register redist region with base address beyond IPA range"); + + addr = 0x260000; + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); + TEST_ASSERT(ret == -EINVAL, "Mix KVM_VGIC_V3_ADDR_TYPE_REDIST and REDIST_REGION"); + + /* + * Now there are 2 redist regions: + * region 0 @ 0x200000 2 redists + * region 1 @ 0x240000 1 redist + * Attempt to read their characteristics + */ + + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 0); + expected_addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false); + TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #0"); + + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 1); + expected_addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false); + TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #1"); + + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 2); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false); + TEST_ASSERT(ret == -ENOENT, "read characteristics of non existing region"); + + addr = 0x260000; + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); + TEST_ASSERT(!ret, "set dist region"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x260000, 0, 2); + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(ret == -EINVAL, "register redist region colliding with dist"); +} + +/* + * VGIC KVM device is created and initialized before the secondary CPUs + * get created + */ +static void test_vgic_then_vcpus(void) +{ + struct vm_gic v; + int ret, i; + + v.vm = vm_create_default(0, 0, guest_code); + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + TEST_ASSERT(v.gic_fd > 0, "GICv3 device created"); + + subtest_dist_rdist(&v); + + /* Add the rest of the VCPUs */ + for (i = 1; i < NR_VCPUS; ++i) + vm_vcpu_add_default(v.vm, i, guest_code); + + ucall_init(v.vm, NULL); + ret = run_vcpu(v.vm, 3); + TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); + + vm_gic_destroy(&v); +} + +/* All the VCPUs are created before the VGIC KVM device gets initialized */ +static void test_vcpus_then_vgic(void) +{ + struct vm_gic v; + int ret; + + v = vm_gic_create(); + + subtest_dist_rdist(&v); + + ucall_init(v.vm, NULL); + ret = run_vcpu(v.vm, 3); + TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); + + vm_gic_destroy(&v); +} + +static void test_new_redist_regions(void) +{ + void *dummy = NULL; + struct vm_gic v; + uint64_t addr; + int ret; + + v = vm_gic_create(); + subtest_redist_regions(&v); + ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + TEST_ASSERT(!ret, "init the vgic"); + + ucall_init(v.vm, NULL); + ret = run_vcpu(v.vm, 3); + TEST_ASSERT(ret == -ENXIO, "running without sufficient number of rdists"); + vm_gic_destroy(&v); + + /* step2 */ + + v = vm_gic_create(); + subtest_redist_regions(&v); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); + ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(!ret, "register a third region allowing to cover the 4 vcpus"); + + ucall_init(v.vm, NULL); + ret = run_vcpu(v.vm, 3); + TEST_ASSERT(ret == -EBUSY, "running without vgic explicit init"); + + vm_gic_destroy(&v); + + /* step 3 */ + + v = vm_gic_create(); + subtest_redist_regions(&v); + + ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy, true); + TEST_ASSERT(ret == -EFAULT, "register a third region allowing to cover the 4 vcpus"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); + ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(!ret, "register a third region allowing to cover the 4 vcpus"); + + ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + TEST_ASSERT(!ret, "init the vgic"); + + ucall_init(v.vm, NULL); + ret = run_vcpu(v.vm, 3); + TEST_ASSERT(!ret, "vcpu run"); + + vm_gic_destroy(&v); +} + +static void test_typer_accesses(void) +{ + int ret, i, gicv3_fd = -1; + uint64_t addr; + struct kvm_vm *vm; + uint32_t val; + + vm = vm_create_default(0, 0, guest_code); + + gicv3_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + TEST_ASSERT(gicv3_fd >= 0, "VGIC_V3 device created"); + + vm_vcpu_add_default(vm, 3, guest_code); + + ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); + TEST_ASSERT(ret == -EINVAL, "attempting to read GICR_TYPER of non created vcpu"); + + vm_vcpu_add_default(vm, 1, guest_code); + + ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); + TEST_ASSERT(ret == -EBUSY, "read GICR_TYPER before GIC initialized"); + + vm_vcpu_add_default(vm, 2, guest_code); + + ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + TEST_ASSERT(!ret, "init the vgic after the vcpu creations"); + + for (i = 0; i < NR_VCPUS ; i++) { + ret = access_redist_reg(gicv3_fd, 0, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && !val, "read GICR_TYPER before rdist region setting"); + } + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); + ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(!ret, "first rdist region with a capacity of 2 rdists"); + + /* The 2 first rdists should be put there (vcpu 0 and 3) */ + ret = access_redist_reg(gicv3_fd, 0, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && !val, "read typer of rdist #0"); + + ret = access_redist_reg(gicv3_fd, 3, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #1"); + + addr = REDIST_REGION_ATTR_ADDR(10, 0x100000, 0, 1); + ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(ret == -EINVAL, "collision with previous rdist region"); + + ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x100, + "no redist region attached to vcpu #1 yet, last cannot be returned"); + + ret = access_redist_reg(gicv3_fd, 2, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x200, + "no redist region attached to vcpu #2, last cannot be returned"); + + addr = REDIST_REGION_ATTR_ADDR(10, 0x20000, 0, 1); + ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(!ret, "second rdist region"); + + ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1"); + + ret = access_redist_reg(gicv3_fd, 2, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x210, + "read typer of rdist #1, last properly returned"); + + close(gicv3_fd); + kvm_vm_free(vm); +} + +/** + * Test GICR_TYPER last bit with new redist regions + * rdist regions #1 and #2 are contiguous + * rdist region #0 @0x100000 2 rdist capacity + * rdists: 0, 3 (Last) + * rdist region #1 @0x240000 2 rdist capacity + * rdists: 5, 4 (Last) + * rdist region #2 @0x200000 2 rdist capacity + * rdists: 1, 2 + */ +static void test_last_bit_redist_regions(void) +{ + uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; + int ret, gicv3_fd; + uint64_t addr; + struct kvm_vm *vm; + uint32_t val; + + vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); + + gicv3_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + TEST_ASSERT(gicv3_fd >= 0, "VGIC_V3 device created"); + + ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + TEST_ASSERT(!ret, "init the vgic after the vcpu creations"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x100000, 0, 0); + ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(!ret, "rdist region #0 (2 rdist)"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x240000, 0, 1); + ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(!ret, "rdist region #1 (1 rdist) contiguous with #2"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 2); + ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); + TEST_ASSERT(!ret, "rdist region #2 with a capacity of 2 rdists"); + + ret = access_redist_reg(gicv3_fd, 0, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0"); + + ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1"); + + ret = access_redist_reg(gicv3_fd, 2, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x200, "read typer of rdist #2"); + + ret = access_redist_reg(gicv3_fd, 3, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #3"); + + ret = access_redist_reg(gicv3_fd, 5, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #5"); + + ret = access_redist_reg(gicv3_fd, 4, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x410, "read typer of rdist #4"); + + close(gicv3_fd); + kvm_vm_free(vm); +} + +/* Test last bit with legacy region */ +static void test_last_bit_single_rdist(void) +{ + uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; + int ret, gicv3_fd; + uint64_t addr; + struct kvm_vm *vm; + uint32_t val; + + vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); + + gicv3_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + TEST_ASSERT(gicv3_fd >= 0, "VGIC_V3 device created"); + + ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + TEST_ASSERT(!ret, "init the vgic after the vcpu creations"); + + addr = 0x10000; + ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); + + ret = access_redist_reg(gicv3_fd, 0, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0"); + + ret = access_redist_reg(gicv3_fd, 3, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x300, "read typer of rdist #1"); + + ret = access_redist_reg(gicv3_fd, 5, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #2"); + + ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #3"); + + ret = access_redist_reg(gicv3_fd, 2, GICR_TYPER, &val, false); + TEST_ASSERT(!ret && val == 0x210, "read typer of rdist #3"); + + close(gicv3_fd); + kvm_vm_free(vm); +} + +void test_kvm_device(void) +{ + struct vm_gic v; + int ret; + + v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL); + + /* try to create a non existing KVM device */ + ret = _kvm_create_device(v.vm, 0, true); + TEST_ASSERT(ret == -ENODEV, "unsupported device"); + + /* trial mode with VGIC_V3 device */ + ret = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true); + if (ret) { + print_skip("GICv3 not supported"); + exit(KSFT_SKIP); + } + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + TEST_ASSERT(v.gic_fd, "create the GICv3 device"); + + ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + TEST_ASSERT(ret == -EEXIST, "create GICv3 device twice"); + + ret = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true); + TEST_ASSERT(!ret, "create GICv3 in test mode while the same already is created"); + + if (!_kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, true)) { + ret = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, false); + TEST_ASSERT(ret == -EINVAL, "create GICv2 while v3 exists"); + } + + vm_gic_destroy(&v); +} + +int main(int ac, char **av) +{ + max_ipa_bits = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + + test_kvm_device(); + test_vcpus_then_vgic(); + test_vgic_then_vcpus(); + test_new_redist_regions(); + test_typer_accesses(); + test_last_bit_redist_regions(); + test_last_bit_single_rdist(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 2d7eb6989e83..13594b8ae5c5 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -223,6 +223,15 @@ int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, #endif void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid); +int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); +int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); +int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test); +int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test); +int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, + void *val, bool write); +int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, + void *val, bool write); + const char *exit_reason_str(unsigned int exit_reason); void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index e5fbf16f725b..05037ecaa8e5 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1728,6 +1728,83 @@ int _kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) return ioctl(vm->kvm_fd, cmd, arg); } +/* + * Device Ioctl + */ + +int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) +{ + struct kvm_device_attr attribute = { + .group = group, + .attr = attr, + .flags = 0, + }; + int ret = ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute); + + if (ret == -1) + return -errno; + return 0; +} + +int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) +{ + int ret = _kvm_device_check_attr(dev_fd, group, attr); + + TEST_ASSERT(ret >= 0, "KVM_HAS_DEVICE_ATTR failed, errno: %i", errno); + return ret; +} + +int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test) +{ + struct kvm_create_device create_dev; + int ret; + + create_dev.type = type; + create_dev.fd = -1; + create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0; + ret = ioctl(vm_get_fd(vm), KVM_CREATE_DEVICE, &create_dev); + if (ret == -1) + return -errno; + return test ? 0 : create_dev.fd; +} + +int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test) +{ + int ret = _kvm_create_device(vm, type, test); + + TEST_ASSERT(ret >= 0, "KVM_CREATE_DEVICE IOCTL failed,\n" + " errno: %i", errno); + return ret; +} + +int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, + void *val, bool write) +{ + struct kvm_device_attr kvmattr = { + .group = group, + .attr = attr, + .flags = 0, + .addr = (uintptr_t)val, + }; + int ret; + + ret = ioctl(dev_fd, write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR, + &kvmattr); + if (ret < 0) + return -errno; + return ret; +} + +int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, + void *val, bool write) +{ + int ret = _kvm_device_access(dev_fd, group, attr, val, write); + + TEST_ASSERT(ret >= 0, "KVM_SET|GET_DEVICE_ATTR IOCTL failed,\n" + " errno: %i", errno); + return ret; +} + /* * VM Dump * -- cgit v1.2.3 From 69baf1a2a41a87eb16dc98aa9ddbdadd8070e5b2 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 6 Apr 2021 18:51:02 +0800 Subject: perf mem-events: Remove unnecessary 'struct mem_info' forward declaration 'struct mem_info' is defined at 22nd line. The declaration here is unnecessary. Remove it. Signed-off-by: Wan Jiabing Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: kael_w@yeah.net Link: http://lore.kernel.org/lkml/20210406105104.675879-1-wanjiabing@vivo.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/mem-events.h | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 63dd383b6ce2..cacdebd65b8a 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -44,7 +44,6 @@ bool is_mem_loads_aux_event(struct evsel *leader); void perf_mem_events__list(void); -struct mem_info; int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -- cgit v1.2.3 From 4cffb2df4260ed38c7ae4105f6913ad2d71a16ec Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Wed, 7 Apr 2021 15:59:37 +0200 Subject: KVM: selftests: vgic_init kvm selftests fixup Bring some improvements/rationalization over the first version of the vgic_init selftests: - ucall_init is moved in run_cpu() - vcpu_args_set is not called as not needed - whenever a helper is supposed to succeed, call the non "_" version - helpers do not return -errno, instead errno is checked by the caller - vm_gic struct is used whenever possible, as well as vm_gic_destroy - _kvm_create_device takes an addition fd parameter Signed-off-by: Eric Auger Suggested-by: Andrew Jones Reviewed-by: Andrew Jones Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210407135937.533141-1-eric.auger@redhat.com --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 275 +++++++++++------------- tools/testing/selftests/kvm/include/kvm_util.h | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 30 ++- 3 files changed, 136 insertions(+), 171 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 03f62fc96f1d..623f31a14326 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -27,7 +27,7 @@ struct vm_gic { int gic_fd; }; -int max_ipa_bits; +static int max_ipa_bits; /* helper to access a redistributor register */ static int access_redist_reg(int gicv3_fd, int vcpu, int offset, @@ -51,12 +51,8 @@ static void guest_code(void) /* we don't want to assert on run execution, hence that helper */ static int run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) { - int ret; - - vcpu_args_set(vm, vcpuid, 1); - ret = _vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); - get_ucall(vm, vcpuid, NULL); - + ucall_init(vm, NULL); + int ret = _vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); if (ret) return -errno; return 0; @@ -68,7 +64,6 @@ static struct vm_gic vm_gic_create(void) v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL); v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); - TEST_ASSERT(v.gic_fd > 0, "GICv3 device created"); return v; } @@ -91,66 +86,62 @@ static void subtest_dist_rdist(struct vm_gic *v) uint64_t addr; /* Check existing group/attributes */ - ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_DIST); - TEST_ASSERT(!ret, "KVM_DEV_ARM_VGIC_GRP_ADDR/KVM_VGIC_V3_ADDR_TYPE_DIST supported"); + kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST); - ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST); - TEST_ASSERT(!ret, "KVM_DEV_ARM_VGIC_GRP_ADDR/KVM_VGIC_V3_ADDR_TYPE_REDIST supported"); + kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST); /* check non existing attribute */ ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 0); - TEST_ASSERT(ret == -ENXIO, "attribute not supported"); + TEST_ASSERT(ret && errno == ENXIO, "attribute not supported"); /* misaligned DIST and REDIST address settings */ addr = 0x1000; ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); - TEST_ASSERT(ret == -EINVAL, "GICv3 dist base not 64kB aligned"); + TEST_ASSERT(ret && errno == EINVAL, "GICv3 dist base not 64kB aligned"); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); - TEST_ASSERT(ret == -EINVAL, "GICv3 redist base not 64kB aligned"); + TEST_ASSERT(ret && errno == EINVAL, "GICv3 redist base not 64kB aligned"); /* out of range address */ if (max_ipa_bits) { addr = 1ULL << max_ipa_bits; ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); - TEST_ASSERT(ret == -E2BIG, "dist address beyond IPA limit"); + TEST_ASSERT(ret && errno == E2BIG, "dist address beyond IPA limit"); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); - TEST_ASSERT(ret == -E2BIG, "redist address beyond IPA limit"); + TEST_ASSERT(ret && errno == E2BIG, "redist address beyond IPA limit"); } /* set REDIST base address @0x0*/ addr = 0x00000; - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); - TEST_ASSERT(!ret, "GICv3 redist base set"); + kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); /* Attempt to create a second legacy redistributor region */ addr = 0xE0000; ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); - TEST_ASSERT(ret == -EEXIST, "GICv3 redist base set again"); + TEST_ASSERT(ret && errno == EEXIST, "GICv3 redist base set again"); /* Attempt to mix legacy and new redistributor regions */ addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 0, 0); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(ret == -EINVAL, "attempt to mix GICv3 REDIST and REDIST_REGION"); + TEST_ASSERT(ret && errno == EINVAL, "attempt to mix GICv3 REDIST and REDIST_REGION"); /* * Set overlapping DIST / REDIST, cannot be detected here. Will be detected * on first vcpu run instead. */ addr = 3 * 2 * 0x10000; - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_DIST, - &addr, true); - TEST_ASSERT(!ret, "dist overlapping rdist"); + kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_DIST, + &addr, true); } /* Test the new REDIST region API */ @@ -166,57 +157,59 @@ static void subtest_redist_regions(struct vm_gic *v) addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 2, 0); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(ret == -EINVAL, "redist region attr value with flags != 0"); + TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with flags != 0"); addr = REDIST_REGION_ATTR_ADDR(0, 0x100000, 0, 0); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(ret == -EINVAL, "redist region attr value with count== 0"); + TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with count== 0"); addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(ret == -EINVAL, "attempt to register the first rdist region with index != 0"); + TEST_ASSERT(ret && errno == EINVAL, + "attempt to register the first rdist region with index != 0"); addr = REDIST_REGION_ATTR_ADDR(2, 0x201000, 0, 1); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(ret == -EINVAL, "rdist region with misaligned address"); + TEST_ASSERT(ret && errno == EINVAL, "rdist region with misaligned address"); addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(!ret, "First valid redist region with 2 rdist @ 0x200000, index 0"); + kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(ret == -EINVAL, "register an rdist region with already used index"); + TEST_ASSERT(ret && errno == EINVAL, "register an rdist region with already used index"); addr = REDIST_REGION_ATTR_ADDR(1, 0x210000, 0, 2); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(ret == -EINVAL, "register an rdist region overlapping with another one"); + TEST_ASSERT(ret && errno == EINVAL, + "register an rdist region overlapping with another one"); addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 2); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(ret == -EINVAL, "register redist region with index not +1"); + TEST_ASSERT(ret && errno == EINVAL, "register redist region with index not +1"); addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(!ret, "register valid redist region with 1 rdist @ 0x220000, index 1"); + kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); addr = REDIST_REGION_ATTR_ADDR(1, 1ULL << max_ipa_bits, 0, 2); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(ret == -E2BIG, "register redist region with base address beyond IPA range"); + TEST_ASSERT(ret && errno == E2BIG, + "register redist region with base address beyond IPA range"); addr = 0x260000; ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); - TEST_ASSERT(ret == -EINVAL, "Mix KVM_VGIC_V3_ADDR_TYPE_REDIST and REDIST_REGION"); + TEST_ASSERT(ret && errno == EINVAL, + "Mix KVM_VGIC_V3_ADDR_TYPE_REDIST and REDIST_REGION"); /* * Now there are 2 redist regions: @@ -240,17 +233,16 @@ static void subtest_redist_regions(struct vm_gic *v) addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 2); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false); - TEST_ASSERT(ret == -ENOENT, "read characteristics of non existing region"); + TEST_ASSERT(ret && errno == ENOENT, "read characteristics of non existing region"); addr = 0x260000; - ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); - TEST_ASSERT(!ret, "set dist region"); + kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); addr = REDIST_REGION_ATTR_ADDR(1, 0x260000, 0, 2); ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(ret == -EINVAL, "register redist region colliding with dist"); + TEST_ASSERT(ret && errno == EINVAL, "register redist region colliding with dist"); } /* @@ -264,7 +256,6 @@ static void test_vgic_then_vcpus(void) v.vm = vm_create_default(0, 0, guest_code); v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); - TEST_ASSERT(v.gic_fd > 0, "GICv3 device created"); subtest_dist_rdist(&v); @@ -272,7 +263,6 @@ static void test_vgic_then_vcpus(void) for (i = 1; i < NR_VCPUS; ++i) vm_vcpu_add_default(v.vm, i, guest_code); - ucall_init(v.vm, NULL); ret = run_vcpu(v.vm, 3); TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); @@ -289,7 +279,6 @@ static void test_vcpus_then_vgic(void) subtest_dist_rdist(&v); - ucall_init(v.vm, NULL); ret = run_vcpu(v.vm, 3); TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); @@ -305,11 +294,9 @@ static void test_new_redist_regions(void) v = vm_gic_create(); subtest_redist_regions(&v); - ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); - TEST_ASSERT(!ret, "init the vgic"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); - ucall_init(v.vm, NULL); ret = run_vcpu(v.vm, 3); TEST_ASSERT(ret == -ENXIO, "running without sufficient number of rdists"); vm_gic_destroy(&v); @@ -320,11 +307,9 @@ static void test_new_redist_regions(void) subtest_redist_regions(&v); addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); - ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(!ret, "register a third region allowing to cover the 4 vcpus"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - ucall_init(v.vm, NULL); ret = run_vcpu(v.vm, 3); TEST_ASSERT(ret == -EBUSY, "running without vgic explicit init"); @@ -335,20 +320,18 @@ static void test_new_redist_regions(void) v = vm_gic_create(); subtest_redist_regions(&v); - ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy, true); - TEST_ASSERT(ret == -EFAULT, "register a third region allowing to cover the 4 vcpus"); + _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy, true); + TEST_ASSERT(ret && errno == EFAULT, + "register a third region allowing to cover the 4 vcpus"); addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); - ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(!ret, "register a third region allowing to cover the 4 vcpus"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); - TEST_ASSERT(!ret, "init the vgic"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); - ucall_init(v.vm, NULL); ret = run_vcpu(v.vm, 3); TEST_ASSERT(!ret, "vcpu run"); @@ -357,76 +340,71 @@ static void test_new_redist_regions(void) static void test_typer_accesses(void) { - int ret, i, gicv3_fd = -1; + struct vm_gic v; uint64_t addr; - struct kvm_vm *vm; uint32_t val; + int ret, i; - vm = vm_create_default(0, 0, guest_code); + v.vm = vm_create_default(0, 0, guest_code); - gicv3_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); - TEST_ASSERT(gicv3_fd >= 0, "VGIC_V3 device created"); + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); - vm_vcpu_add_default(vm, 3, guest_code); + vm_vcpu_add_default(v.vm, 3, guest_code); - ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); - TEST_ASSERT(ret == -EINVAL, "attempting to read GICR_TYPER of non created vcpu"); + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); + TEST_ASSERT(ret && errno == EINVAL, "attempting to read GICR_TYPER of non created vcpu"); - vm_vcpu_add_default(vm, 1, guest_code); + vm_vcpu_add_default(v.vm, 1, guest_code); - ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); - TEST_ASSERT(ret == -EBUSY, "read GICR_TYPER before GIC initialized"); + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); + TEST_ASSERT(ret && errno == EBUSY, "read GICR_TYPER before GIC initialized"); - vm_vcpu_add_default(vm, 2, guest_code); + vm_vcpu_add_default(v.vm, 2, guest_code); - ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); - TEST_ASSERT(!ret, "init the vgic after the vcpu creations"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); for (i = 0; i < NR_VCPUS ; i++) { - ret = access_redist_reg(gicv3_fd, 0, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false); TEST_ASSERT(!ret && !val, "read GICR_TYPER before rdist region setting"); } addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); - ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(!ret, "first rdist region with a capacity of 2 rdists"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); /* The 2 first rdists should be put there (vcpu 0 and 3) */ - ret = access_redist_reg(gicv3_fd, 0, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false); TEST_ASSERT(!ret && !val, "read typer of rdist #0"); - ret = access_redist_reg(gicv3_fd, 3, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #1"); addr = REDIST_REGION_ATTR_ADDR(10, 0x100000, 0, 1); - ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(ret == -EINVAL, "collision with previous rdist region"); + TEST_ASSERT(ret && errno == EINVAL, "collision with previous rdist region"); - ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x100, "no redist region attached to vcpu #1 yet, last cannot be returned"); - ret = access_redist_reg(gicv3_fd, 2, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x200, "no redist region attached to vcpu #2, last cannot be returned"); addr = REDIST_REGION_ATTR_ADDR(10, 0x20000, 0, 1); - ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(!ret, "second rdist region"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1"); - ret = access_redist_reg(gicv3_fd, 2, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x210, "read typer of rdist #1, last properly returned"); - close(gicv3_fd); - kvm_vm_free(vm); + vm_gic_destroy(&v); } /** @@ -442,127 +420,116 @@ static void test_typer_accesses(void) static void test_last_bit_redist_regions(void) { uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; - int ret, gicv3_fd; + struct vm_gic v; uint64_t addr; - struct kvm_vm *vm; uint32_t val; + int ret; - vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); + v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); - gicv3_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); - TEST_ASSERT(gicv3_fd >= 0, "VGIC_V3 device created"); + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); - ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); - TEST_ASSERT(!ret, "init the vgic after the vcpu creations"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); addr = REDIST_REGION_ATTR_ADDR(2, 0x100000, 0, 0); - ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(!ret, "rdist region #0 (2 rdist)"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); addr = REDIST_REGION_ATTR_ADDR(2, 0x240000, 0, 1); - ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(!ret, "rdist region #1 (1 rdist) contiguous with #2"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 2); - ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - TEST_ASSERT(!ret, "rdist region #2 with a capacity of 2 rdists"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); - ret = access_redist_reg(gicv3_fd, 0, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0"); - ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1"); - ret = access_redist_reg(gicv3_fd, 2, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x200, "read typer of rdist #2"); - ret = access_redist_reg(gicv3_fd, 3, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #3"); - ret = access_redist_reg(gicv3_fd, 5, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #5"); - ret = access_redist_reg(gicv3_fd, 4, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 4, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x410, "read typer of rdist #4"); - close(gicv3_fd); - kvm_vm_free(vm); + vm_gic_destroy(&v); } /* Test last bit with legacy region */ static void test_last_bit_single_rdist(void) { uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; - int ret, gicv3_fd; + struct vm_gic v; uint64_t addr; - struct kvm_vm *vm; uint32_t val; + int ret; - vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); + v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); - gicv3_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); - TEST_ASSERT(gicv3_fd >= 0, "VGIC_V3 device created"); + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); - ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); - TEST_ASSERT(!ret, "init the vgic after the vcpu creations"); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); addr = 0x10000; - ret = _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, - KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); - ret = access_redist_reg(gicv3_fd, 0, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0"); - ret = access_redist_reg(gicv3_fd, 3, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x300, "read typer of rdist #1"); - ret = access_redist_reg(gicv3_fd, 5, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #2"); - ret = access_redist_reg(gicv3_fd, 1, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #3"); - ret = access_redist_reg(gicv3_fd, 2, GICR_TYPER, &val, false); + ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false); TEST_ASSERT(!ret && val == 0x210, "read typer of rdist #3"); - close(gicv3_fd); - kvm_vm_free(vm); + vm_gic_destroy(&v); } void test_kvm_device(void) { struct vm_gic v; - int ret; + int ret, fd; v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL); /* try to create a non existing KVM device */ - ret = _kvm_create_device(v.vm, 0, true); - TEST_ASSERT(ret == -ENODEV, "unsupported device"); + ret = _kvm_create_device(v.vm, 0, true, &fd); + TEST_ASSERT(ret && errno == ENODEV, "unsupported device"); /* trial mode with VGIC_V3 device */ - ret = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true); + ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true, &fd); if (ret) { print_skip("GICv3 not supported"); exit(KSFT_SKIP); } v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); - TEST_ASSERT(v.gic_fd, "create the GICv3 device"); - ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); - TEST_ASSERT(ret == -EEXIST, "create GICv3 device twice"); + ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false, &fd); + TEST_ASSERT(ret && errno == EEXIST, "create GICv3 device twice"); - ret = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true); - TEST_ASSERT(!ret, "create GICv3 in test mode while the same already is created"); + kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true); - if (!_kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, true)) { - ret = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, false); - TEST_ASSERT(ret == -EINVAL, "create GICv2 while v3 exists"); + if (!_kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, true, &fd)) { + ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, false, &fd); + TEST_ASSERT(ret && errno == EINVAL, "create GICv2 while v3 exists"); } vm_gic_destroy(&v); diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 13594b8ae5c5..bea4644d645d 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -225,7 +225,7 @@ void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid); int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr); -int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test); +int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd); int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test); int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, void *val, bool write); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 05037ecaa8e5..5067d041fb00 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1739,22 +1739,19 @@ int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) .attr = attr, .flags = 0, }; - int ret = ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute); - if (ret == -1) - return -errno; - return 0; + return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute); } int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) { int ret = _kvm_device_check_attr(dev_fd, group, attr); - TEST_ASSERT(ret >= 0, "KVM_HAS_DEVICE_ATTR failed, errno: %i", errno); + TEST_ASSERT(ret >= 0, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); return ret; } -int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test) +int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd) { struct kvm_create_device create_dev; int ret; @@ -1763,17 +1760,21 @@ int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test) create_dev.fd = -1; create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0; ret = ioctl(vm_get_fd(vm), KVM_CREATE_DEVICE, &create_dev); - if (ret == -1) - return -errno; - return test ? 0 : create_dev.fd; + *fd = create_dev.fd; + return ret; } int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test) { - int ret = _kvm_create_device(vm, type, test); + int fd, ret; - TEST_ASSERT(ret >= 0, "KVM_CREATE_DEVICE IOCTL failed,\n" - " errno: %i", errno); + ret = _kvm_create_device(vm, type, test, &fd); + + if (!test) { + TEST_ASSERT(ret >= 0, + "KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", ret, errno); + return fd; + } return ret; } @@ -1790,8 +1791,6 @@ int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, ret = ioctl(dev_fd, write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR, &kvmattr); - if (ret < 0) - return -errno; return ret; } @@ -1800,8 +1799,7 @@ int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, { int ret = _kvm_device_access(dev_fd, group, attr, val, write); - TEST_ASSERT(ret >= 0, "KVM_SET|GET_DEVICE_ATTR IOCTL failed,\n" - " errno: %i", errno); + TEST_ASSERT(ret >= 0, "KVM_SET|GET_DEVICE_ATTR IOCTL failed, rc: %i errno: %i", ret, errno); return ret; } -- cgit v1.2.3 From e527db8f39d4c71128b3ef5f6a4e433513f5246b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 6 Apr 2021 14:30:14 -0700 Subject: ACPICA: Tree-wide: fix various typos and spelling mistakes This commit squashes the following: ACPICA commit bc8939e2d902653e71bb1601b129a993c37fcfad ACPICA commit 2d9e5e98e23f2a569e5691e6bed183146e25798d ACPICA commit 937358156631ea7a0eef3569c213c82a031097d5 Fix more spelling issues found using the codespell checker and found without tools. Link: https://github.com/acpica/acpica/commit/bc8939e2 Link: https://github.com/acpica/acpica/commit/2d9e5e98 Link: https://github.com/acpica/acpica/commit/93735815 Signed-off-by: Colin Ian King Signed-off-by: Christophe JAILLET Signed-off-by: Bhaskar Chowdhury Signed-off-by: Bob Moore Signed-off-by: Erik Kaneda Signed-off-by: Rafael J. Wysocki --- include/acpi/acoutput.h | 2 +- include/acpi/platform/acgcc.h | 2 +- tools/power/acpi/common/cmfsize.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h index 1538a6853822..1b4c45815695 100644 --- a/include/acpi/acoutput.h +++ b/include/acpi/acoutput.h @@ -362,7 +362,7 @@ * * A less-safe version of the macros is provided for optional use if the * compiler uses excessive CPU stack (for example, this may happen in the - * debug case if code optimzation is disabled.) + * debug case if code optimization is disabled.) */ /* Exit trace helper macro */ diff --git a/include/acpi/platform/acgcc.h b/include/acpi/platform/acgcc.h index 0cd4f61d4248..f6656be81760 100644 --- a/include/acpi/platform/acgcc.h +++ b/include/acpi/platform/acgcc.h @@ -61,7 +61,7 @@ typedef __builtin_va_list va_list; #endif /* - * Explictly mark intentional explicit fallthrough to silence + * Explicitly mark intentional explicit fallthrough to silence * -Wimplicit-fallthrough in GCC 7.1+. */ diff --git a/tools/power/acpi/common/cmfsize.c b/tools/power/acpi/common/cmfsize.c index 9ea2c0aeb86c..185b8c588e1d 100644 --- a/tools/power/acpi/common/cmfsize.c +++ b/tools/power/acpi/common/cmfsize.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /****************************************************************************** * - * Module Name: cfsize - Common get file size function + * Module Name: cmfsize - Common get file size function * * Copyright (C) 2000 - 2021, Intel Corp. * -- cgit v1.2.3 From 2e70b710f36c80b6e78cf32a5c30b46dbb72213c Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 4 Mar 2021 17:31:49 -0800 Subject: tools/power/x86/intel-speed-select: Increase string size The current string size to print cpulist can accommodate upto 80 logical CPUs per package. But this limit is not enough. So increase the string size. Also prevent buffer overflow, if the string size reaches limit. Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-display.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index 8e54ce47648e..3bf1820c0da1 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -25,10 +25,14 @@ static void printcpulist(int str_len, char *str, int mask_size, index = snprintf(&str[curr_index], str_len - curr_index, ","); curr_index += index; + if (curr_index >= str_len) + break; } index = snprintf(&str[curr_index], str_len - curr_index, "%d", i); curr_index += index; + if (curr_index >= str_len) + break; first = 0; } } @@ -64,10 +68,14 @@ static void printcpumask(int str_len, char *str, int mask_size, index = snprintf(&str[curr_index], str_len - curr_index, "%08x", mask[i]); curr_index += index; + if (curr_index >= str_len) + break; if (i) { strncat(&str[curr_index], ",", str_len - curr_index); curr_index++; } + if (curr_index >= str_len) + break; } free(mask); @@ -185,7 +193,7 @@ static void _isst_pbf_display_information(int cpu, FILE *outf, int level, int disp_level) { char header[256]; - char value[256]; + char value[512]; snprintf(header, sizeof(header), "speed-select-base-freq-properties"); format_and_print(outf, disp_level, header, NULL); @@ -349,7 +357,7 @@ void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev) { char header[256]; - char value[256]; + char value[512]; static int level; int i; -- cgit v1.2.3 From b84733a1c52c2f93897a3cbbc1745c06250a4432 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 4 Mar 2021 17:45:14 -0800 Subject: tools/power/x86/intel-speed-select: Process mailbox read error for core-power Some older kernels don't support reading core-power status. In that case mailbox command fails. So, display core-power status as "unknown" instead of supported. Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-config.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 582feb88eca3..7b98a4a52d9e 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -959,6 +959,10 @@ static void isst_print_extended_platform_info(void) fprintf(outf, "Intel(R) SST-BF (feature base-freq) is not supported\n"); ret = isst_read_pm_config(i, &cp_state, &cp_cap); + if (ret) { + fprintf(outf, "Intel(R) SST-CP (feature core-power) status is unknown\n"); + return; + } if (cp_cap) fprintf(outf, "Intel(R) SST-CP (feature core-power) is supported\n"); else -- cgit v1.2.3 From 0d3dfd75708117cedf0cea200e9c6fa266129fb5 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 29 Mar 2021 10:24:41 -0700 Subject: tools/power/x86/intel-speed-select: Add options to force online It is possible that users manually offlined CPUs via sysfs interface and then started this utility. In this case we will not be able to get package and die id of the those CPUs. So add an option to force online if required for some commands. Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-config.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 7b98a4a52d9e..398938879e9f 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -381,6 +381,18 @@ static void set_cpu_online_offline(int cpu, int state) close(fd); } +static void force_all_cpus_online(void) +{ + int i; + + fprintf(stderr, "Forcing all CPUs online\n"); + + for (i = 0; i < topo_max_cpus; ++i) + set_cpu_online_offline(i, 1); + + unlink("/var/run/isst_cpu_topology.dat"); +} + #define MAX_PACKAGE_COUNT 8 #define MAX_DIE_PER_PACKAGE 2 static void for_each_online_package_in_set(void (*callback)(int, void *, void *, @@ -2767,6 +2779,7 @@ static void usage(void) printf("\t[-f|--format] : output format [json|text]. Default: text\n"); printf("\t[-h|--help] : Print help\n"); printf("\t[-i|--info] : Print platform information\n"); + printf("\t[-a|--all-cpus-online] : Force online every CPU in the system\n"); printf("\t[-o|--out] : Output file\n"); printf("\t\t\tDefault : stderr\n"); printf("\t[-p|--pause] : Delay between two mail box commands in milliseconds\n"); @@ -2804,11 +2817,12 @@ static void cmdline(int argc, char **argv) const char *pathname = "/dev/isst_interface"; char *ptr; FILE *fp; - int opt; + int opt, force_cpus_online = 0; int option_index = 0; int ret; static struct option long_options[] = { + { "all-cpus-online", no_argument, 0, 'a' }, { "cpu", required_argument, 0, 'c' }, { "debug", no_argument, 0, 'd' }, { "format", required_argument, 0, 'f' }, @@ -2844,9 +2858,12 @@ static void cmdline(int argc, char **argv) } progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+c:df:hio:v", long_options, + while ((opt = getopt_long_only(argc, argv, "+c:df:hio:va", long_options, &option_index)) != -1) { switch (opt) { + case 'a': + force_cpus_online = 1; + break; case 'c': parse_cpu_command(optarg); break; @@ -2896,6 +2913,8 @@ static void cmdline(int argc, char **argv) exit(0); } set_max_cpu_num(); + if (force_cpus_online) + force_all_cpus_online(); store_cpu_topology(); set_cpu_present_cpu_mask(); set_cpu_target_cpu_mask(); -- cgit v1.2.3 From 17de9a5654f577356f7384a5676e82e544bb2ca1 Mon Sep 17 00:00:00 2001 From: Antonio Terceiro Date: Fri, 12 Mar 2021 10:57:46 -0300 Subject: tools/power/x86/intel-speed-select: Drop __DATE__ and __TIME__ macros These macros introduce nondeterminism in builds, and break reproducible builds. Signed-off-by: Antonio Terceiro Cc: Srinivas Pandruvada Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-config.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 398938879e9f..aa1ad99299a7 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -2808,7 +2808,6 @@ static void usage(void) static void print_version(void) { fprintf(outf, "Version %s\n", version_str); - fprintf(outf, "Build date %s time %s\n", __DATE__, __TIME__); exit(0); } -- cgit v1.2.3 From 61ce18ff01ec17de2b89bbab319e6974d3a3231c Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 4 Mar 2021 17:53:48 -0800 Subject: tools/power/x86/intel-speed-select: v1.9 release This release adds following changes: - Support increased number of CPUs - Return error when mailbox commmand fails to enable core-power - Option to online all CPUs - Removes build date and time print Signed-off-by: Srinivas Pandruvada Signed-off-by: Hans de Goede --- tools/power/x86/intel-speed-select/isst-config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index aa1ad99299a7..ab940c508ef0 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -15,7 +15,7 @@ struct process_cmd_struct { int arg; }; -static const char *version_str = "v1.8"; +static const char *version_str = "v1.9"; static const int supported_api_ver = 1; static struct isst_if_platform_info isst_platform_info; static char *progname; -- cgit v1.2.3 From c3eaa5f667cb60ea23c1bbc59e83c4f71de18e48 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 6 Apr 2021 17:16:03 -0700 Subject: selftests: mptcp: add the net device name testcase This patch added a new testcase for setting the net device name. In it, pass the net device name to pm_nl_ctl to set the ifindex field of struct mptcp_pm_addr_entry. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index abeb24b7f8ec..fd99485cf2a4 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -777,6 +777,14 @@ subflows_tests() ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow run_tests $ns1 $ns2 10.0.1.1 chk_join_nr "multiple subflows, limited by server" 2 2 1 + + # single subflow, dev + reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow dev ns2eth3 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "single subflow, dev" 1 1 1 } signal_address_tests() -- cgit v1.2.3 From e75074781f1735c1976bc551e29ccf2ba9a4b17f Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 7 Apr 2021 19:57:28 +0000 Subject: selftests/resctrl: Change a few printed messages Change a few printed messages to report test progress more clearly. Add a missing "\n" at the end of one printed message. Suggested-by: Shuah Khan Signed-off-by: Fenghua Yu Signed-off-by: Shuah Khan --- tools/testing/selftests/resctrl/cache.c | 2 +- tools/testing/selftests/resctrl/mba_test.c | 6 +++--- tools/testing/selftests/resctrl/mbm_test.c | 2 +- tools/testing/selftests/resctrl/resctrlfs.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index 362e3a418caa..68ff856d36f0 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -301,7 +301,7 @@ int show_cache_info(unsigned long sum_llc_val, int no_of_bits, ret = platform && abs((int)diff_percent) > max_diff_percent && (cmt ? (abs(avg_diff) > max_diff) : true); - ksft_print_msg("%s cache miss rate within %d%%\n", + ksft_print_msg("%s Check cache miss rate within %d%%\n", ret ? "Fail:" : "Pass:", max_diff_percent); ksft_print_msg("Percent diff=%d\n", abs((int)diff_percent)); diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index 26f12ad4c663..1a1bdb6180cf 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -80,7 +80,7 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) avg_diff = (float)labs(avg_bw_resc - avg_bw_imc) / avg_bw_imc; avg_diff_per = (int)(avg_diff * 100); - ksft_print_msg("%s MBA: diff within %d%% for schemata %u\n", + ksft_print_msg("%s Check MBA diff within %d%% for schemata %u\n", avg_diff_per > MAX_DIFF_PERCENT ? "Fail:" : "Pass:", MAX_DIFF_PERCENT, @@ -93,10 +93,10 @@ static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc) failed = true; } - ksft_print_msg("%s schemata change using MBA\n", + ksft_print_msg("%s Check schemata change using MBA\n", failed ? "Fail:" : "Pass:"); if (failed) - ksft_print_msg("At least one test failed"); + ksft_print_msg("At least one test failed\n"); } static int check_results(void) diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 02b1ed03f1e5..8392e5c55ed0 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -37,7 +37,7 @@ show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span) avg_diff_per = (int)(avg_diff * 100); ret = avg_diff_per > MAX_DIFF_PERCENT; - ksft_print_msg("%s MBM: diff within %d%%\n", + ksft_print_msg("%s Check MBM diff within %d%%\n", ret ? "Fail:" : "Pass:", MAX_DIFF_PERCENT); ksft_print_msg("avg_diff_per: %d%%\n", avg_diff_per); ksft_print_msg("Span (MB): %d\n", span); diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index ade5f2b8b843..5f5a166ade60 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -570,14 +570,14 @@ bool check_resctrlfs_support(void) fclose(inf); - ksft_print_msg("%s kernel supports resctrl filesystem\n", + ksft_print_msg("%s Check kernel supports resctrl filesystem\n", ret ? "Pass:" : "Fail:"); if (!ret) return ret; dp = opendir(RESCTRL_PATH); - ksft_print_msg("%s resctrl mountpoint \"%s\" exists\n", + ksft_print_msg("%s Check resctrl mountpoint \"%s\" exists\n", dp ? "Pass:" : "Fail:", RESCTRL_PATH); if (dp) closedir(dp); -- cgit v1.2.3 From 29e3ea8cbd2958cf237b84652ec236803f2c6202 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Mon, 8 Feb 2021 14:29:57 +1100 Subject: selftests/powerpc: Test for spurious kernel memory faults on radix Previously when mapping kernel memory on radix, no ptesync was included which would periodically lead to unhandled spurious faults. Mapping kernel memory is used when code patching with Strict RWX enabled. As suggested by Chris Riedl, turning ftrace on and off does a large amount of code patching so is a convenient way to see this kind of fault. Add a selftest to try and trigger this kind of a spurious fault. It tests for 30 seconds which is usually long enough for the issue to show up. Signed-off-by: Jordan Niethe [mpe: Rename it to better reflect what it does, rather than the symptom] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210208032957.1232102-2-jniethe5@gmail.com --- tools/testing/selftests/powerpc/mm/Makefile | 1 + .../selftests/powerpc/mm/stress_code_patching.sh | 49 ++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100755 tools/testing/selftests/powerpc/mm/stress_code_patching.sh (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index defe488d6bf1..40253abc6208 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -5,6 +5,7 @@ noarg: TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \ large_vm_fork_separation bad_accesses pkey_exec_prot \ pkey_siginfo stack_expansion_signal stack_expansion_ldst +TEST_PROGS := stress_code_patching.sh TEST_GEN_PROGS_EXTENDED := tlbie_test TEST_GEN_FILES := tempfile diff --git a/tools/testing/selftests/powerpc/mm/stress_code_patching.sh b/tools/testing/selftests/powerpc/mm/stress_code_patching.sh new file mode 100755 index 000000000000..e454509659f6 --- /dev/null +++ b/tools/testing/selftests/powerpc/mm/stress_code_patching.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +TIMEOUT=30 + +DEBUFS_DIR=`cat /proc/mounts | grep debugfs | awk '{print $2}'` +if [ ! -e "$DEBUFS_DIR" ] +then + echo "debugfs not found, skipping" 1>&2 + exit 4 +fi + +if [ ! -e "$DEBUFS_DIR/tracing/current_tracer" ] +then + echo "Tracing files not found, skipping" 1>&2 + exit 4 +fi + + +echo "Testing for spurious faults when mapping kernel memory..." + +if grep -q "FUNCTION TRACING IS CORRUPTED" "$DEBUFS_DIR/tracing/trace" +then + echo "FAILED: Ftrace already dead. Probably due to a spurious fault" 1>&2 + exit 1 +fi + +dmesg -C +START_TIME=`date +%s` +END_TIME=`expr $START_TIME + $TIMEOUT` +while [ `date +%s` -lt $END_TIME ] +do + echo function > $DEBUFS_DIR/tracing/current_tracer + echo nop > $DEBUFS_DIR/tracing/current_tracer + if dmesg | grep -q 'ftrace bug' + then + break + fi +done + +echo nop > $DEBUFS_DIR/tracing/current_tracer +if dmesg | grep -q 'ftrace bug' +then + echo "FAILED: Mapping kernel memory causes spurious faults" 1>&2 + exit 1 +else + echo "OK: Mapping kernel memory does not cause spurious faults" + exit 0 +fi -- cgit v1.2.3 From 812aa68ef7d4d71bed996468ead665092a3f8de9 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Thu, 25 Feb 2021 14:21:07 +1100 Subject: selftests/powerpc: Suggest memtrace instead of /dev/mem for ci memory The suggested alternative for getting cache-inhibited memory with 'mem=' and /dev/mem is pretty hacky. Also, PAPR guests do not allow system memory to be mapped cache-inhibited so despite /dev/mem being available this will not work which can cause confusion. Instead recommend using the memtrace buffers. memtrace is only available on powernv so there will not be any chance of trying to do this in a guest. Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210225032108.1458352-2-jniethe5@gmail.com --- tools/testing/selftests/powerpc/alignment/alignment_handler.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/alignment/alignment_handler.c b/tools/testing/selftests/powerpc/alignment/alignment_handler.c index c25cf7cd45e9..33ee34fc0828 100644 --- a/tools/testing/selftests/powerpc/alignment/alignment_handler.c +++ b/tools/testing/selftests/powerpc/alignment/alignment_handler.c @@ -10,16 +10,7 @@ * * We create two sets of source and destination buffers, one in regular memory, * the other cache-inhibited (by default we use /dev/fb0 for this, but an - * alterative path for cache-inhibited memory may be provided). - * - * One way to get cache-inhibited memory is to use the "mem" kernel parameter - * to limit the kernel to less memory than actually exists. Addresses above - * the limit may still be accessed but will be treated as cache-inhibited. For - * example, if there is actually 4GB of memory and the parameter "mem=3GB" is - * used, memory from address 0xC0000000 onwards is treated as cache-inhibited. - * To access this region /dev/mem is used. The kernel should be configured - * without CONFIG_STRICT_DEVMEM. In this case use: - * ./alignment_handler /dev/mem 0xc0000000 + * alterative path for cache-inhibited memory may be provided, e.g. memtrace). * * We initialise the source buffers, then use whichever set of load/store * instructions is under test to copy bytes from the source buffers to the -- cgit v1.2.3 From 68ef8735d253f3d840082b78f996bf2d89ee6e5f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 1 Apr 2021 16:23:47 -0700 Subject: lkdtm: Add REPORT_STACK for checking stack offsets For validating the stack offset behavior, report the offset from a given process's first seen stack address. Add s script to calculate the results to the LKDTM kselftests. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210401232347.2791257-7-keescook@chromium.org --- drivers/misc/lkdtm/bugs.c | 17 ++++++++++++ drivers/misc/lkdtm/core.c | 1 + drivers/misc/lkdtm/lkdtm.h | 1 + tools/testing/selftests/lkdtm/.gitignore | 1 + tools/testing/selftests/lkdtm/Makefile | 1 + tools/testing/selftests/lkdtm/stack-entropy.sh | 36 ++++++++++++++++++++++++++ 6 files changed, 57 insertions(+) create mode 100755 tools/testing/selftests/lkdtm/stack-entropy.sh (limited to 'tools') diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index 110f5a8538e9..0e8254d0cf0b 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -134,6 +134,23 @@ noinline void lkdtm_CORRUPT_STACK_STRONG(void) __lkdtm_CORRUPT_STACK((void *)&data); } +static pid_t stack_pid; +static unsigned long stack_addr; + +void lkdtm_REPORT_STACK(void) +{ + volatile uintptr_t magic; + pid_t pid = task_pid_nr(current); + + if (pid != stack_pid) { + pr_info("Starting stack offset tracking for pid %d\n", pid); + stack_pid = pid; + stack_addr = (uintptr_t)&magic; + } + + pr_info("Stack offset: %d\n", (int)(stack_addr - (uintptr_t)&magic)); +} + void lkdtm_UNALIGNED_LOAD_STORE_WRITE(void) { static u8 data[5] __attribute__((aligned(4))) = {1, 2, 3, 4, 5}; diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c index b2aff4d87c01..8024b6a5cc7f 100644 --- a/drivers/misc/lkdtm/core.c +++ b/drivers/misc/lkdtm/core.c @@ -110,6 +110,7 @@ static const struct crashtype crashtypes[] = { CRASHTYPE(EXHAUST_STACK), CRASHTYPE(CORRUPT_STACK), CRASHTYPE(CORRUPT_STACK_STRONG), + CRASHTYPE(REPORT_STACK), CRASHTYPE(CORRUPT_LIST_ADD), CRASHTYPE(CORRUPT_LIST_DEL), CRASHTYPE(STACK_GUARD_PAGE_LEADING), diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h index 5ae48c64df24..99f90d3e5e9c 100644 --- a/drivers/misc/lkdtm/lkdtm.h +++ b/drivers/misc/lkdtm/lkdtm.h @@ -17,6 +17,7 @@ void lkdtm_LOOP(void); void lkdtm_EXHAUST_STACK(void); void lkdtm_CORRUPT_STACK(void); void lkdtm_CORRUPT_STACK_STRONG(void); +void lkdtm_REPORT_STACK(void); void lkdtm_UNALIGNED_LOAD_STORE_WRITE(void); void lkdtm_SOFTLOCKUP(void); void lkdtm_HARDLOCKUP(void); diff --git a/tools/testing/selftests/lkdtm/.gitignore b/tools/testing/selftests/lkdtm/.gitignore index f26212605b6b..d4b0be857deb 100644 --- a/tools/testing/selftests/lkdtm/.gitignore +++ b/tools/testing/selftests/lkdtm/.gitignore @@ -1,2 +1,3 @@ *.sh !run.sh +!stack-entropy.sh diff --git a/tools/testing/selftests/lkdtm/Makefile b/tools/testing/selftests/lkdtm/Makefile index 1bcc9ee990eb..c71109ceeb2d 100644 --- a/tools/testing/selftests/lkdtm/Makefile +++ b/tools/testing/selftests/lkdtm/Makefile @@ -5,6 +5,7 @@ include ../lib.mk # NOTE: $(OUTPUT) won't get default value if used before lib.mk TEST_FILES := tests.txt +TEST_PROGS := stack-entropy.sh TEST_GEN_PROGS = $(patsubst %,$(OUTPUT)/%.sh,$(shell awk '{print $$1}' tests.txt | sed -e 's/\#//')) all: $(TEST_GEN_PROGS) diff --git a/tools/testing/selftests/lkdtm/stack-entropy.sh b/tools/testing/selftests/lkdtm/stack-entropy.sh new file mode 100755 index 000000000000..b1b8a5097cbb --- /dev/null +++ b/tools/testing/selftests/lkdtm/stack-entropy.sh @@ -0,0 +1,36 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Measure kernel stack entropy by sampling via LKDTM's REPORT_STACK test. +set -e +samples="${1:-1000}" + +# Capture dmesg continuously since it may fill up depending on sample size. +log=$(mktemp -t stack-entropy-XXXXXX) +dmesg --follow >"$log" & pid=$! +report=-1 +for i in $(seq 1 $samples); do + echo "REPORT_STACK" >/sys/kernel/debug/provoke-crash/DIRECT + if [ -t 1 ]; then + percent=$(( 100 * $i / $samples )) + if [ "$percent" -ne "$report" ]; then + /bin/echo -en "$percent%\r" + report="$percent" + fi + fi +done +kill "$pid" + +# Count unique offsets since last run. +seen=$(tac "$log" | grep -m1 -B"$samples"0 'Starting stack offset' | \ + grep 'Stack offset' | awk '{print $NF}' | sort | uniq -c | wc -l) +bits=$(echo "obase=2; $seen" | bc | wc -L) +echo "Bits of stack entropy: $bits" +rm -f "$log" + +# We would expect any functional stack randomization to be at least 5 bits. +if [ "$bits" -lt 5 ]; then + exit 1 +else + exit 0 +fi -- cgit v1.2.3 From dedb76d3598618e67b3a9af89bf4f418430acbe4 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 Apr 2021 18:32:45 +0800 Subject: perf metricgroup: Make find_metric() public with name change Function find_metric() is required for the metric processing in the pmu-events testcase, so make it public. Also change the name to include "metricgroup". Tested-by: Paul A. Clarke Reviewed-by: Kajol Jain Signed-off-by: John Garry Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Shaokun Zhang Cc: Will Deacon Cc: linuxarm@huawei.com Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/1617791570-165223-2-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 5 +++-- tools/perf/util/metricgroup.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 6acb44ad439b..37fe34a5d93d 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -900,7 +900,8 @@ static int __add_metric(struct list_head *metric_list, (match_metric(__pe->metric_group, __metric) || \ match_metric(__pe->metric_name, __metric))) -static struct pmu_event *find_metric(const char *metric, struct pmu_events_map *map) +struct pmu_event *metricgroup__find_metric(const char *metric, + struct pmu_events_map *map) { struct pmu_event *pe; int i; @@ -985,7 +986,7 @@ static int __resolve_metric(struct metric *m, struct expr_id *parent; struct pmu_event *pe; - pe = find_metric(cur->key, map); + pe = metricgroup__find_metric(cur->key, map); if (!pe) continue; diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h index 026bbf416c48..cc4a92492a61 100644 --- a/tools/perf/util/metricgroup.h +++ b/tools/perf/util/metricgroup.h @@ -43,7 +43,8 @@ int metricgroup__parse_groups(const struct option *opt, bool metric_no_group, bool metric_no_merge, struct rblist *metric_events); - +struct pmu_event *metricgroup__find_metric(const char *metric, + struct pmu_events_map *map); int metricgroup__parse_groups_test(struct evlist *evlist, struct pmu_events_map *map, const char *str, -- cgit v1.2.3 From a48a995edcde832f2d4c4ec1bfb73e0da93810fb Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 Apr 2021 18:32:46 +0800 Subject: perf test: Handle metric reuse in pmu-events parsing test The pmu-events parsing test does not handle metric reuse at all. Introduce some simple handling to resolve metrics who reference other metrics. Reviewed-by: Kajol Jain Signed-off-by: John Garry Tested-by: Paul A. Clarke Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Shaokun Zhang Cc: Will Deacon Cc: linuxarm@huawei.com Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/1617791570-165223-3-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pmu-events.c | 81 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index 0ca6a5a53523..cb5b25d2fb27 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -12,6 +12,7 @@ #include "util/evlist.h" #include "util/expr.h" #include "util/parse-events.h" +#include "metricgroup.h" struct perf_pmu_test_event { /* used for matching against events from generated pmu-events.c */ @@ -471,6 +472,71 @@ static void expr_failure(const char *msg, pr_debug("On expression %s\n", pe->metric_expr); } +struct metric { + struct list_head list; + struct metric_ref metric_ref; +}; + +static int resolve_metric_simple(struct expr_parse_ctx *pctx, + struct list_head *compound_list, + struct pmu_events_map *map, + const char *metric_name) +{ + struct hashmap_entry *cur, *cur_tmp; + struct metric *metric, *tmp; + size_t bkt; + bool all; + int rc; + + do { + all = true; + hashmap__for_each_entry_safe((&pctx->ids), cur, cur_tmp, bkt) { + struct metric_ref *ref; + struct pmu_event *pe; + + pe = metricgroup__find_metric(cur->key, map); + if (!pe) + continue; + + if (!strcmp(metric_name, (char *)cur->key)) { + pr_warning("Recursion detected for metric %s\n", metric_name); + rc = -1; + goto out_err; + } + + all = false; + + /* The metric key itself needs to go out.. */ + expr__del_id(pctx, cur->key); + + metric = malloc(sizeof(*metric)); + if (!metric) { + rc = -ENOMEM; + goto out_err; + } + + ref = &metric->metric_ref; + ref->metric_name = pe->metric_name; + ref->metric_expr = pe->metric_expr; + list_add_tail(&metric->list, compound_list); + + rc = expr__find_other(pe->metric_expr, NULL, pctx, 0); + if (rc) + goto out_err; + break; /* The hashmap has been modified, so restart */ + } + } while (!all); + + return 0; + +out_err: + list_for_each_entry_safe(metric, tmp, compound_list, list) + free(metric); + + return rc; + +} + static int test_parsing(void) { struct pmu_events_map *cpus_map = perf_pmu__find_map(NULL); @@ -488,7 +554,9 @@ static int test_parsing(void) break; j = 0; for (;;) { + struct metric *metric, *tmp; struct hashmap_entry *cur; + LIST_HEAD(compound_list); size_t bkt; pe = &map->table[j++]; @@ -504,6 +572,13 @@ static int test_parsing(void) continue; } + if (resolve_metric_simple(&ctx, &compound_list, map, + pe->metric_name)) { + expr_failure("Could not resolve metrics", map, pe); + ret++; + goto exit; /* Don't tolerate errors due to severity */ + } + /* * Add all ids with a made up value. The value may * trigger divide by zero when subtracted and so try to @@ -519,6 +594,11 @@ static int test_parsing(void) ret++; } + list_for_each_entry_safe(metric, tmp, &compound_list, list) { + expr__add_ref(&ctx, &metric->metric_ref); + free(metric); + } + if (expr__parse(&result, &ctx, pe->metric_expr, 0)) { expr_failure("Parse failed", map, pe); ret++; @@ -527,6 +607,7 @@ static int test_parsing(void) } } /* TODO: fail when not ok */ +exit: return ret == 0 ? TEST_OK : TEST_SKIP; } -- cgit v1.2.3 From e126bef55f1dfb44440d632f9aae66af3240a435 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 Apr 2021 18:32:47 +0800 Subject: perf pmu: Add pmu_events_map__find() function to find the common PMU map for the system Add a function to find the common PMU map for the system. For arm64, a special variant is added. This is because arm64 supports heterogeneous CPU systems. As such, it cannot be guaranteed that the cpumap is same for all CPUs. So in case of heterogeneous systems, don't return a cpumap. Reviewed-by: Kajol Jain Signed-off-by: John Garry Tested-by: Paul A. Clarke Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Shaokun Zhang Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/1617791570-165223-4-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/Build | 1 + tools/perf/arch/arm64/util/pmu.c | 25 +++++++++++++++++++++++++ tools/perf/tests/pmu-events.c | 2 +- tools/perf/util/metricgroup.c | 7 +++---- tools/perf/util/pmu.c | 5 +++++ tools/perf/util/pmu.h | 1 + tools/perf/util/s390-sample-raw.c | 4 +--- 7 files changed, 37 insertions(+), 8 deletions(-) create mode 100644 tools/perf/arch/arm64/util/pmu.c (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build index ead2f2275eee..9fcb4e68add9 100644 --- a/tools/perf/arch/arm64/util/Build +++ b/tools/perf/arch/arm64/util/Build @@ -2,6 +2,7 @@ perf-y += header.o perf-y += machine.o perf-y += perf_regs.o perf-y += tsc.o +perf-y += pmu.o perf-y += kvm-stat.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c new file mode 100644 index 000000000000..d3259d61ca75 --- /dev/null +++ b/tools/perf/arch/arm64/util/pmu.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "../../util/cpumap.h" +#include "../../util/pmu.h" + +struct pmu_events_map *pmu_events_map__find(void) +{ + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmu__scan(pmu))) { + if (!is_pmu_core(pmu->name)) + continue; + + /* + * The cpumap should cover all CPUs. Otherwise, some CPUs may + * not support some events or have different event IDs. + */ + if (pmu->cpus->nr != cpu__max_cpu()) + return NULL; + + return perf_pmu__find_map(pmu); + } + + return NULL; +} diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index cb5b25d2fb27..b8aff8fb50d8 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -539,7 +539,7 @@ out_err: static int test_parsing(void) { - struct pmu_events_map *cpus_map = perf_pmu__find_map(NULL); + struct pmu_events_map *cpus_map = pmu_events_map__find(); struct pmu_events_map *map; struct pmu_event *pe; int i, j, k; diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 37fe34a5d93d..8336dd8e8098 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -618,7 +618,7 @@ static int metricgroup__print_sys_event_iter(struct pmu_event *pe, void *data) void metricgroup__print(bool metrics, bool metricgroups, char *filter, bool raw, bool details) { - struct pmu_events_map *map = perf_pmu__find_map(NULL); + struct pmu_events_map *map = pmu_events_map__find(); struct pmu_event *pe; int i; struct rblist groups; @@ -1254,8 +1254,7 @@ int metricgroup__parse_groups(const struct option *opt, struct rblist *metric_events) { struct evlist *perf_evlist = *(struct evlist **)opt->value; - struct pmu_events_map *map = perf_pmu__find_map(NULL); - + struct pmu_events_map *map = pmu_events_map__find(); return parse_groups(perf_evlist, str, metric_no_group, metric_no_merge, NULL, metric_events, map); @@ -1274,7 +1273,7 @@ int metricgroup__parse_groups_test(struct evlist *evlist, bool metricgroup__has_metric(const char *metric) { - struct pmu_events_map *map = perf_pmu__find_map(NULL); + struct pmu_events_map *map = pmu_events_map__find(); struct pmu_event *pe; int i; diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 4e52b94f1ac6..286d5e415bdc 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -717,6 +717,11 @@ struct pmu_events_map *perf_pmu__find_map(struct perf_pmu *pmu) return map; } +struct pmu_events_map *__weak pmu_events_map__find(void) +{ + return perf_pmu__find_map(NULL); +} + bool pmu_uncore_alias_match(const char *pmu_name, const char *name) { char *tmp = NULL, *tok, *str; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 160b0f561771..1f1749ba830f 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -114,6 +114,7 @@ void pmu_add_cpu_aliases_map(struct list_head *head, struct perf_pmu *pmu, struct pmu_events_map *map); struct pmu_events_map *perf_pmu__find_map(struct perf_pmu *pmu); +struct pmu_events_map *pmu_events_map__find(void); bool pmu_uncore_alias_match(const char *pmu_name, const char *name); void perf_pmu_free_alias(struct perf_pmu_alias *alias); diff --git a/tools/perf/util/s390-sample-raw.c b/tools/perf/util/s390-sample-raw.c index cfcf8d534d76..08ec3c3ae0ee 100644 --- a/tools/perf/util/s390-sample-raw.c +++ b/tools/perf/util/s390-sample-raw.c @@ -160,11 +160,9 @@ static void s390_cpumcfdg_dump(struct perf_sample *sample) const char *color = PERF_COLOR_BLUE; struct cf_ctrset_entry *cep, ce; struct pmu_events_map *map; - struct perf_pmu pmu; u64 *p; - memset(&pmu, 0, sizeof(pmu)); - map = perf_pmu__find_map(&pmu); + map = pmu_events_map__find(); while (offset < len) { cep = (struct cf_ctrset_entry *)(buf + offset); -- cgit v1.2.3 From c4e1dc4a94931805fd4c69de71117dc040d8db2a Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 Apr 2021 18:32:48 +0800 Subject: perf vendor events arm64: Add Hisi hip08 L1 metrics Add L1 metrics. Formula is as consistent as possible with MAN pages description for these metrics. Reviewed-by: Kajol Jain Signed-off-by: John Garry Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Paul Clarke Cc: Peter Zijlstra Cc: Shaokun Zhang Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/1617791570-165223-5-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/arm64/hisilicon/hip08/metrics.json | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json new file mode 100644 index 000000000000..dc5ff3051639 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json @@ -0,0 +1,30 @@ +[ + { + "MetricExpr": "FETCH_BUBBLE / (4 * CPU_CYCLES)", + "PublicDescription": "Frontend bound L1 topdown metric", + "BriefDescription": "Frontend bound L1 topdown metric", + "MetricGroup": "TopDownL1", + "MetricName": "frontend_bound" + }, + { + "MetricExpr": "(INST_SPEC - INST_RETIRED) / (4 * CPU_CYCLES)", + "PublicDescription": "Bad Speculation L1 topdown metric", + "BriefDescription": "Bad Speculation L1 topdown metric", + "MetricGroup": "TopDownL1", + "MetricName": "bad_speculation" + }, + { + "MetricExpr": "INST_RETIRED / (CPU_CYCLES * 4)", + "PublicDescription": "Retiring L1 topdown metric", + "BriefDescription": "Retiring L1 topdown metric", + "MetricGroup": "TopDownL1", + "MetricName": "retiring" + }, + { + "MetricExpr": "1 - (frontend_bound + bad_speculation + retiring)", + "PublicDescription": "Backend Bound L1 topdown metric", + "BriefDescription": "Backend Bound L1 topdown metric", + "MetricGroup": "TopDownL1", + "MetricName": "backend_bound" + }, +] -- cgit v1.2.3 From 03837173487a1c664b71f047e97209112be37dd5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 Apr 2021 18:32:49 +0800 Subject: perf vendor events arm64: Add Hisi hip08 L2 metrics Add L2 metrics. Reviewed-by: Kajol Jain Signed-off-by: John Garry Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Paul Clarke Cc: Peter Zijlstra Cc: Shaokun Zhang Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/1617791570-165223-6-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/arm64/hisilicon/hip08/metrics.json | 42 ++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json index dc5ff3051639..dda898d23c2d 100644 --- a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json +++ b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json @@ -27,4 +27,46 @@ "MetricGroup": "TopDownL1", "MetricName": "backend_bound" }, + { + "MetricExpr": "armv8_pmuv3_0@event\\=0x201d@ / CPU_CYCLES", + "PublicDescription": "Fetch latency bound L2 topdown metric", + "BriefDescription": "Fetch latency bound L2 topdown metric", + "MetricGroup": "TopDownL2", + "MetricName": "fetch_latency_bound" + }, + { + "MetricExpr": "frontend_bound - fetch_latency_bound", + "PublicDescription": "Fetch bandwidth bound L2 topdown metric", + "BriefDescription": "Fetch bandwidth bound L2 topdown metric", + "MetricGroup": "TopDownL2", + "MetricName": "fetch_bandwidth_bound" + }, + { + "MetricExpr": "(bad_speculation * BR_MIS_PRED) / (BR_MIS_PRED + armv8_pmuv3_0@event\\=0x2013@)", + "PublicDescription": "Branch mispredicts L2 topdown metric", + "BriefDescription": "Branch mispredicts L2 topdown metric", + "MetricGroup": "TopDownL2", + "MetricName": "branch_mispredicts" + }, + { + "MetricExpr": "bad_speculation - branch_mispredicts", + "PublicDescription": "Machine clears L2 topdown metric", + "BriefDescription": "Machine clears L2 topdown metric", + "MetricGroup": "TopDownL2", + "MetricName": "machine_clears" + }, + { + "MetricExpr": "(EXE_STALL_CYCLE - (MEM_STALL_ANYLOAD + armv8_pmuv3_0@event\\=0x7005@)) / CPU_CYCLES", + "PublicDescription": "Core bound L2 topdown metric", + "BriefDescription": "Core bound L2 topdown metric", + "MetricGroup": "TopDownL2", + "MetricName": "core_bound" + }, + { + "MetricExpr": "(MEM_STALL_ANYLOAD + armv8_pmuv3_0@event\\=0x7005@) / CPU_CYCLES", + "PublicDescription": "Memory bound L2 topdown metric", + "BriefDescription": "Memory bound L2 topdown metric", + "MetricGroup": "TopDownL2", + "MetricName": "memory_bound" + }, ] -- cgit v1.2.3 From 0cc177cfc95d565e1a458136a592b0bd6d487db0 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 Apr 2021 18:32:50 +0800 Subject: perf vendor events arm64: Add Hisi hip08 L3 metrics Add L3 metrics. Reviewed-by: Kajol Jain Signed-off-by: John Garry Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Paul Clarke Cc: Peter Zijlstra Cc: Shaokun Zhang Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/1617791570-165223-7-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/arm64/hisilicon/hip08/metrics.json | 161 +++++++++++++++++++++ 1 file changed, 161 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json index dda898d23c2d..dda8e59149d2 100644 --- a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json +++ b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json @@ -69,4 +69,165 @@ "MetricGroup": "TopDownL2", "MetricName": "memory_bound" }, + { + "MetricExpr": "(((L2I_TLB - L2I_TLB_REFILL) * 15) + (L2I_TLB_REFILL * 100)) / CPU_CYCLES", + "PublicDescription": "Idle by itlb miss L3 topdown metric", + "BriefDescription": "Idle by itlb miss L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "idle_by_itlb_miss" + }, + { + "MetricExpr": "(((L2I_CACHE - L2I_CACHE_REFILL) * 15) + (L2I_CACHE_REFILL * 100)) / CPU_CYCLES", + "PublicDescription": "Idle by icache miss L3 topdown metric", + "BriefDescription": "Idle by icache miss L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "idle_by_icache_miss" + }, + { + "MetricExpr": "(BR_MIS_PRED * 5) / CPU_CYCLES", + "PublicDescription": "BP misp flush L3 topdown metric", + "BriefDescription": "BP misp flush L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "bp_misp_flush" + }, + { + "MetricExpr": "(armv8_pmuv3_0@event\\=0x2013@ * 5) / CPU_CYCLES", + "PublicDescription": "OOO flush L3 topdown metric", + "BriefDescription": "OOO flush L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "ooo_flush" + }, + { + "MetricExpr": "(armv8_pmuv3_0@event\\=0x1001@ * 5) / CPU_CYCLES", + "PublicDescription": "Static predictor flush L3 topdown metric", + "BriefDescription": "Static predictor flush L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "sp_flush" + }, + { + "MetricExpr": "armv8_pmuv3_0@event\\=0x1010@ / BR_MIS_PRED", + "PublicDescription": "Indirect branch L3 topdown metric", + "BriefDescription": "Indirect branch L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "indirect_branch" + }, + { + "MetricExpr": "(armv8_pmuv3_0@event\\=0x1014@ + armv8_pmuv3_0@event\\=0x1018@) / BR_MIS_PRED", + "PublicDescription": "Push branch L3 topdown metric", + "BriefDescription": "Push branch L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "push_branch" + }, + { + "MetricExpr": "armv8_pmuv3_0@event\\=0x100c@ / BR_MIS_PRED", + "PublicDescription": "Pop branch L3 topdown metric", + "BriefDescription": "Pop branch L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "pop_branch" + }, + { + "MetricExpr": "(BR_MIS_PRED - armv8_pmuv3_0@event\\=0x1010@ - armv8_pmuv3_0@event\\=0x1014@ - armv8_pmuv3_0@event\\=0x1018@ - armv8_pmuv3_0@event\\=0x100c@) / BR_MIS_PRED", + "PublicDescription": "Other branch L3 topdown metric", + "BriefDescription": "Other branch L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "other_branch" + }, + { + "MetricExpr": "armv8_pmuv3_0@event\\=0x2012@ / armv8_pmuv3_0@event\\=0x2013@", + "PublicDescription": "Nuke flush L3 topdown metric", + "BriefDescription": "Nuke flush L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "nuke_flush" + }, + { + "MetricExpr": "1 - nuke_flush", + "PublicDescription": "Other flush L3 topdown metric", + "BriefDescription": "Other flush L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "other_flush" + }, + { + "MetricExpr": "armv8_pmuv3_0@event\\=0x2010@ / CPU_CYCLES", + "PublicDescription": "Sync stall L3 topdown metric", + "BriefDescription": "Sync stall L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "sync_stall" + }, + { + "MetricExpr": "armv8_pmuv3_0@event\\=0x2004@ / CPU_CYCLES", + "PublicDescription": "Rob stall L3 topdown metric", + "BriefDescription": "Rob stall L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "rob_stall" + }, + { + "MetricExpr": "(armv8_pmuv3_0@event\\=0x2006@ + armv8_pmuv3_0@event\\=0x2007@ + armv8_pmuv3_0@event\\=0x2008@) / CPU_CYCLES", + "PublicDescription": "Ptag stall L3 topdown metric", + "BriefDescription": "Ptag stall L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "ptag_stall" + }, + { + "MetricExpr": "armv8_pmuv3_0@event\\=0x201e@ / CPU_CYCLES", + "PublicDescription": "SaveOpQ stall L3 topdown metric", + "BriefDescription": "SaveOpQ stall L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "saveopq_stall" + }, + { + "MetricExpr": "armv8_pmuv3_0@event\\=0x2005@ / CPU_CYCLES", + "PublicDescription": "PC buffer stall L3 topdown metric", + "BriefDescription": "PC buffer stall L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "pc_buffer_stall" + }, + { + "MetricExpr": "armv8_pmuv3_0@event\\=0x7002@ / CPU_CYCLES", + "PublicDescription": "Divider L3 topdown metric", + "BriefDescription": "Divider L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "divider" + }, + { + "MetricExpr": "armv8_pmuv3_0@event\\=0x7003@ / CPU_CYCLES", + "PublicDescription": "FSU stall L3 topdown metric", + "BriefDescription": "FSU stall L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "fsu_stall" + }, + { + "MetricExpr": "core_bound - divider - fsu_stall", + "PublicDescription": "EXE ports util L3 topdown metric", + "BriefDescription": "EXE ports util L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "exe_ports_util" + }, + { + "MetricExpr": "(MEM_STALL_ANYLOAD - MEM_STALL_L1MISS) / CPU_CYCLES", + "PublicDescription": "L1 bound L3 topdown metric", + "BriefDescription": "L1 bound L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "l1_bound" + }, + { + "MetricExpr": "(MEM_STALL_L1MISS - MEM_STALL_L2MISS) / CPU_CYCLES", + "PublicDescription": "L2 bound L3 topdown metric", + "BriefDescription": "L2 bound L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "l2_bound" + }, + { + "MetricExpr": "MEM_STALL_L2MISS / CPU_CYCLES", + "PublicDescription": "Mem bound L3 topdown metric", + "BriefDescription": "Mem bound L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "mem_bound" + }, + { + "MetricExpr": "armv8_pmuv3_0@event\\=0x7005@ / CPU_CYCLES", + "PublicDescription": "Store bound L3 topdown metric", + "BriefDescription": "Store bound L3 topdown metric", + "MetricGroup": "TopDownL3", + "MetricName": "store_bound" + }, ] -- cgit v1.2.3 From 86c2bc3da769124e3e856b6e9457be3667c30919 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 6 Apr 2021 16:59:41 -0500 Subject: perf vendor events amd: Fix broken L2 Cache Hits from L2 HWPF metric MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 08ed77e414ab2342 ("perf vendor events amd: Add recommended events") added the hits event "L2 Cache Hits from L2 HWPF" with the same metric expression as the accesses event "L2 Cache Accesses from L2 HWPF": $ perf list --details ... l2_cache_accesses_from_l2_hwpf [L2 Cache Accesses from L2 HWPF] [l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3] l2_cache_hits_from_l2_hwpf [L2 Cache Hits from L2 HWPF] [l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3] ... This was wrong and led to counting hits the same as accesses. Section 2.1.15.2 "Performance Measurement" of "PPR for AMD Family 17h Model 31h B0 - 55803 Rev 0.54 - Sep 12, 2019", documents the hits event with EventCode 0x70 which is the same as l2_pf_hit_l2. Fix this, and massage the description for l2_pf_hit_l2 as the hits event is now the duplicate of l2_pf_hit_l2. AMD recommends using the recommended event over other events if the duplicate exists and maintain both for consistency. Hence, l2_cache_hits_from_l2_hwpf should override l2_pf_hit_l2. Before: # perf stat -M l2_cache_accesses_from_l2_hwpf,l2_cache_hits_from_l2_hwpf sleep 1 Performance counter stats for 'sleep 1': 1,436 l2_pf_miss_l2_l3 # 11114.00 l2_cache_accesses_from_l2_hwpf # 11114.00 l2_cache_hits_from_l2_hwpf 4,482 l2_pf_hit_l2 5,196 l2_pf_miss_l2_hit_l3 1.001765339 seconds time elapsed After: # perf stat -M l2_cache_accesses_from_l2_hwpf sleep 1 Performance counter stats for 'sleep 1': 1,477 l2_pf_miss_l2_l3 # 10442.00 l2_cache_accesses_from_l2_hwpf 3,978 l2_pf_hit_l2 4,987 l2_pf_miss_l2_hit_l3 1.001491186 seconds time elapsed # perf stat -e l2_cache_hits_from_l2_hwpf sleep 1 Performance counter stats for 'sleep 1': 3,983 l2_cache_hits_from_l2_hwpf 1.001329970 seconds time elapsed Note the difference in performance counter values for the accesses versus the hits after the fix, and the hits event now counting the same as l2_pf_hit_l2. Fixes: 08ed77e414ab ("perf vendor events amd: Add recommended events") Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=206537 Reviewed-by: Robert Richter Signed-off-by: Smita Koralahalli Tested-by: Arnaldo Carvalho de Melo # On a 3900X Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kim Phillips Cc: Mark Rutland Cc: Martin Liška Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Vijay Thakkar Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20210406215944.113332-2-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/amdzen1/cache.json | 2 +- tools/perf/pmu-events/arch/x86/amdzen1/recommended.json | 6 +++--- tools/perf/pmu-events/arch/x86/amdzen2/cache.json | 2 +- tools/perf/pmu-events/arch/x86/amdzen2/recommended.json | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/cache.json b/tools/perf/pmu-events/arch/x86/amdzen1/cache.json index 4ea7ec4f496e..008f1683e540 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen1/cache.json +++ b/tools/perf/pmu-events/arch/x86/amdzen1/cache.json @@ -275,7 +275,7 @@ { "EventName": "l2_pf_hit_l2", "EventCode": "0x70", - "BriefDescription": "L2 prefetch hit in L2.", + "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.", "UMask": "0xff" }, { diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json index 2cfe2d2f3bfd..3c954543d1ae 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json +++ b/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json @@ -79,10 +79,10 @@ "UMask": "0x70" }, { - "MetricName": "l2_cache_hits_from_l2_hwpf", + "EventName": "l2_cache_hits_from_l2_hwpf", + "EventCode": "0x70", "BriefDescription": "L2 Cache Hits from L2 HWPF", - "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3", - "MetricGroup": "l2_cache" + "UMask": "0xff" }, { "EventName": "l3_accesses", diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/cache.json b/tools/perf/pmu-events/arch/x86/amdzen2/cache.json index f61b982f83ca..8ba84a48188d 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/cache.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/cache.json @@ -205,7 +205,7 @@ { "EventName": "l2_pf_hit_l2", "EventCode": "0x70", - "BriefDescription": "L2 prefetch hit in L2.", + "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.", "UMask": "0xff" }, { diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json index 2ef91e25e661..1c624cee9ef4 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json @@ -79,10 +79,10 @@ "UMask": "0x70" }, { - "MetricName": "l2_cache_hits_from_l2_hwpf", + "EventName": "l2_cache_hits_from_l2_hwpf", + "EventCode": "0x70", "BriefDescription": "L2 Cache Hits from L2 HWPF", - "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3", - "MetricGroup": "l2_cache" + "UMask": "0xff" }, { "EventName": "l3_accesses", -- cgit v1.2.3 From ff64c98195c5c48c4cd98ff1347543cdb0631433 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 6 Apr 2021 16:59:42 -0500 Subject: perf vendor events amd: Use lowercases for all the eventcodes and umasks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The values of event codes and umasks are inconsistent with letter cases. Enforce a unique style and default everything to lower case as this helps in tracking changes of automatically generated event tables. Reviewed-by: Robert Richter Signed-off-by: Smita Koralahalli Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kim Phillips Cc: Mark Rutland Cc: Martin Liška Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Vijay Thakkar Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20210406215944.113332-3-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/amdzen2/branch.json | 2 +- tools/perf/pmu-events/arch/x86/amdzen2/cache.json | 2 +- tools/perf/pmu-events/arch/x86/amdzen2/memory.json | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/branch.json b/tools/perf/pmu-events/arch/x86/amdzen2/branch.json index ef4166a66288..f5d16846aa1d 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/branch.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/branch.json @@ -24,7 +24,7 @@ "EventName": "bp_l1_tlb_fetch_hit", "EventCode": "0x94", "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB.", - "UMask": "0xFF" + "UMask": "0xff" }, { "EventName": "bp_l1_tlb_fetch_hit.if1g", diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/cache.json b/tools/perf/pmu-events/arch/x86/amdzen2/cache.json index 8ba84a48188d..899ccc81263c 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/cache.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/cache.json @@ -353,7 +353,7 @@ }, { "EventName": "xi_ccx_sdp_req1.all_l3_miss_req_typs", - "EventCode": "0x9A", + "EventCode": "0x9a", "BriefDescription": "All L3 Miss Request Types. Ignores SliceMask and ThreadMask.", "UMask": "0x3f", "Unit": "L3PMC" diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/memory.json b/tools/perf/pmu-events/arch/x86/amdzen2/memory.json index 715046b339cb..609d9e3da3f7 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/memory.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/memory.json @@ -60,17 +60,17 @@ }, { "EventName": "ls_smi_rx", - "EventCode": "0x2B", + "EventCode": "0x2b", "BriefDescription": "Number of SMIs received." }, { "EventName": "ls_int_taken", - "EventCode": "0x2C", + "EventCode": "0x2c", "BriefDescription": "Number of interrupts taken." }, { "EventName": "ls_rdtsc", - "EventCode": "0x2D", + "EventCode": "0x2d", "BriefDescription": "Number of reads of the TSC (RDTSC instructions). The count is speculative." }, { @@ -300,31 +300,31 @@ }, { "EventName": "ls_hw_pf_dc_fill.ls_mabresp_rmt_dram", - "EventCode": "0x5A", + "EventCode": "0x5a", "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM (home node remote).", "UMask": "0x40" }, { "EventName": "ls_hw_pf_dc_fill.ls_mabresp_rmt_cache", - "EventCode": "0x5A", + "EventCode": "0x5a", "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From another cache (home node remote).", "UMask": "0x10" }, { "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_dram", - "EventCode": "0x5A", + "EventCode": "0x5a", "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM (home node local).", "UMask": "0x8" }, { "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_cache", - "EventCode": "0x5A", + "EventCode": "0x5a", "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From another cache (home node local).", "UMask": "0x2" }, { "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_l2", - "EventCode": "0x5A", + "EventCode": "0x5a", "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. Local L2 hit.", "UMask": "0x1" }, -- cgit v1.2.3 From e5f2b4e1b8b1c709d32e895c9ca77845b8e71ee3 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 6 Apr 2021 16:59:43 -0500 Subject: perf vendor events amd: Use 0x%02x format for event code and umask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use 0x%02x format for all event codes and umasks as this helps in tracking changes of automatically generated event tables. Reviewed-by: Robert Richter Signed-off-by: Smita Koralahalli Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kim Phillips Cc: Mark Rutland Cc: Martin Liška Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Vijay Thakkar Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20210406215944.113332-4-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/amdzen1/cache.json | 46 +++++++------- tools/perf/pmu-events/arch/x86/amdzen1/core.json | 12 ++-- .../arch/x86/amdzen1/floating-point.json | 42 ++++++------- tools/perf/pmu-events/arch/x86/amdzen1/memory.json | 42 ++++++------- tools/perf/pmu-events/arch/x86/amdzen1/other.json | 12 ++-- .../pmu-events/arch/x86/amdzen1/recommended.json | 2 +- tools/perf/pmu-events/arch/x86/amdzen2/branch.json | 6 +- tools/perf/pmu-events/arch/x86/amdzen2/cache.json | 56 ++++++++--------- tools/perf/pmu-events/arch/x86/amdzen2/core.json | 12 ++-- .../arch/x86/amdzen2/floating-point.json | 42 ++++++------- tools/perf/pmu-events/arch/x86/amdzen2/memory.json | 70 +++++++++++----------- tools/perf/pmu-events/arch/x86/amdzen2/other.json | 20 +++---- .../pmu-events/arch/x86/amdzen2/recommended.json | 2 +- 13 files changed, 182 insertions(+), 182 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/cache.json b/tools/perf/pmu-events/arch/x86/amdzen1/cache.json index 008f1683e540..0d46cb82bd52 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen1/cache.json +++ b/tools/perf/pmu-events/arch/x86/amdzen1/cache.json @@ -38,31 +38,31 @@ "EventName": "ic_fetch_stall.ic_stall_any", "EventCode": "0x87", "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ic_fetch_stall.ic_stall_dq_empty", "EventCode": "0x87", "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ic_fetch_stall.ic_stall_back_pressure", "EventCode": "0x87", "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ic_cache_inval.l2_invalidating_probe", "EventCode": "0x8c", "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ic_cache_inval.fill_invalidated", "EventCode": "0x8c", "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "bp_tlb_rel", @@ -97,25 +97,25 @@ "EventName": "l2_request_g1.change_to_x", "EventCode": "0x60", "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "l2_request_g1.prefetch_l2_cmd", "EventCode": "0x60", "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "l2_request_g1.l2_hw_pf", "EventCode": "0x60", "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "l2_request_g1.group2", "EventCode": "0x60", "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_request_g1.all_no_prefetch", @@ -150,31 +150,31 @@ "EventName": "l2_request_g2.ic_rd_sized_nc", "EventCode": "0x61", "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "l2_request_g2.smc_inval", "EventCode": "0x61", "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "l2_request_g2.bus_locks_originator", "EventCode": "0x61", "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "l2_request_g2.bus_locks_responses", "EventCode": "0x61", "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_latency.l2_cycles_waiting_on_fills", "EventCode": "0x62", "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_wcb_req.wcb_write", @@ -192,13 +192,13 @@ "EventName": "l2_wcb_req.zero_byte_store", "EventCode": "0x63", "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "l2_wcb_req.cl_zero", "EventCode": "0x63", "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_cache_req_stat.ls_rd_blk_cs", @@ -228,37 +228,37 @@ "EventName": "l2_cache_req_stat.ls_rd_blk_c", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types).", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "l2_cache_req_stat.ic_fill_hit_x", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "l2_cache_req_stat.ic_fill_hit_s", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit clean line in L2.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "l2_cache_req_stat.ic_fill_miss", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_cache_req_stat.ic_access_in_l2", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.", - "UMask": "0x7" + "UMask": "0x07" }, { "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).", - "UMask": "0x9" + "UMask": "0x09" }, { "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2", @@ -270,7 +270,7 @@ "EventName": "l2_fill_pending.l2_fill_busy", "EventCode": "0x6d", "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_pf_hit_l2", diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/core.json b/tools/perf/pmu-events/arch/x86/amdzen1/core.json index 653b11b23399..4dceeabc4a9f 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen1/core.json +++ b/tools/perf/pmu-events/arch/x86/amdzen1/core.json @@ -68,21 +68,21 @@ "EventCode": "0xcb", "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).", "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ex_ret_mmx_fp_instr.mmx_instr", "EventCode": "0xcb", "BriefDescription": "MMX instructions.", "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ex_ret_mmx_fp_instr.x87_instr", "EventCode": "0xcb", "BriefDescription": "x87 instructions.", "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ex_ret_cond", @@ -103,19 +103,19 @@ "EventName": "ex_tagged_ibs_ops.ibs_count_rollover", "EventCode": "0x1cf", "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret", "EventCode": "0x1cf", "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops", "EventCode": "0x1cf", "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ex_ret_fus_brnch_inst", diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json index a35542bd3b36..3995b528ebd6 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json @@ -39,35 +39,35 @@ "EventCode": "0x00", "BriefDescription": "Total number uOps assigned to all fpu pipes.", "PublicDescription": "The number of operations (uOps) and dual-pipe uOps dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to all pipes.", - "UMask": "0xf" + "UMask": "0x0f" }, { "EventName": "fpu_pipe_assignment.total3", "EventCode": "0x00", "BriefDescription": "Total number of fp uOps on pipe 3.", "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "fpu_pipe_assignment.total2", "EventCode": "0x00", "BriefDescription": "Total number of fp uOps on pipe 2.", "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "fpu_pipe_assignment.total1", "EventCode": "0x00", "BriefDescription": "Total number of fp uOps on pipe 1.", "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "fpu_pipe_assignment.total0", "EventCode": "0x00", "BriefDescription": "Total number of fp uOps on pipe 0.", "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "fp_sched_empty", @@ -79,28 +79,28 @@ "EventCode": "0x02", "BriefDescription": "All Ops.", "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8.", - "UMask": "0x7" + "UMask": "0x07" }, { "EventName": "fp_retx87_fp_ops.div_sqr_r_ops", "EventCode": "0x02", "BriefDescription": "Divide and square root Ops.", "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Divide and square root Ops.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "fp_retx87_fp_ops.mul_ops", "EventCode": "0x02", "BriefDescription": "Multiply Ops.", "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Multiply Ops.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "fp_retx87_fp_ops.add_sub_ops", "EventCode": "0x02", "BriefDescription": "Add/subtract Ops.", "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Add/subtract Ops.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "fp_ret_sse_avx_ops.all", @@ -142,83 +142,83 @@ "EventCode": "0x03", "BriefDescription": "Single precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.", "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "fp_ret_sse_avx_ops.sp_div_flops", "EventCode": "0x03", "BriefDescription": "Single-precision divide/square root FLOPS.", "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision divide/square root FLOPS.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "fp_ret_sse_avx_ops.sp_mult_flops", "EventCode": "0x03", "BriefDescription": "Single-precision multiply FLOPS.", "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision multiply FLOPS.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "fp_ret_sse_avx_ops.sp_add_sub_flops", "EventCode": "0x03", "BriefDescription": "Single-precision add/subtract FLOPS.", "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision add/subtract FLOPS.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "fp_num_mov_elim_scal_op.optimized", "EventCode": "0x04", "BriefDescription": "Number of Scalar Ops optimized.", "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of Scalar Ops optimized.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "fp_num_mov_elim_scal_op.opt_potential", "EventCode": "0x04", "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass).", "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of Ops that are candidates for optimization (have Z-bit either set or pass).", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim", "EventCode": "0x04", "BriefDescription": "Number of SSE Move Ops eliminated.", "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of SSE Move Ops eliminated.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops", "EventCode": "0x04", "BriefDescription": "Number of SSE Move Ops.", "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of SSE Move Ops.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "fp_retired_ser_ops.x87_ctrl_ret", "EventCode": "0x05", "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits.", "PublicDescription": "The number of serializing Ops retired. x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "fp_retired_ser_ops.x87_bot_ret", "EventCode": "0x05", "BriefDescription": "x87 bottom-executing uOps retired.", "PublicDescription": "The number of serializing Ops retired. x87 bottom-executing uOps retired.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "fp_retired_ser_ops.sse_ctrl_ret", "EventCode": "0x05", "BriefDescription": "SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.", "PublicDescription": "The number of serializing Ops retired. SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "fp_retired_ser_ops.sse_bot_ret", "EventCode": "0x05", "BriefDescription": "SSE bottom-executing uOps retired.", "PublicDescription": "The number of serializing Ops retired. SSE bottom-executing uOps retired.", - "UMask": "0x1" + "UMask": "0x01" } ] diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/memory.json b/tools/perf/pmu-events/arch/x86/amdzen1/memory.json index b33a3c308019..385022fb026e 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen1/memory.json +++ b/tools/perf/pmu-events/arch/x86/amdzen1/memory.json @@ -3,25 +3,25 @@ "EventName": "ls_locks.bus_lock", "EventCode": "0x25", "BriefDescription": "Bus lock when a locked operations crosses a cache boundary or is done on an uncacheable memory type.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_dispatch.ld_st_dispatch", "EventCode": "0x29", "BriefDescription": "Counts the number of operations dispatched to the LS unit. Unit Masks ADDed. Load-op-Stores.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ls_dispatch.store_dispatch", "EventCode": "0x29", "BriefDescription": "Counts the number of stores dispatched to the LS unit. Unit Masks ADDed.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_dispatch.ld_dispatch", "EventCode": "0x29", "BriefDescription": "Counts the number of loads dispatched to the LS unit. Unit Masks ADDed.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_stlf", @@ -37,13 +37,13 @@ "EventName": "ls_mab_alloc.dc_prefetcher", "EventCode": "0x41", "BriefDescription": "LS MAB allocates by type - DC prefetcher.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "ls_mab_alloc.stores", "EventCode": "0x41", "BriefDescription": "LS MAB allocates by type - stores.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_mab_alloc.loads", @@ -85,61 +85,61 @@ "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit", "EventCode": "0x45", "BriefDescription": "L1 DTLB Reload of a page of 1G size.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit", "EventCode": "0x45", "BriefDescription": "L1 DTLB Reload of a page of 2M size.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ls_l1_d_tlb_miss.tlb_reload_32k_l2_hit", "EventCode": "0x45", "BriefDescription": "L1 DTLB Reload of a page of 32K size.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit", "EventCode": "0x45", "BriefDescription": "L1 DTLB Reload of a page of 4K size.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_tablewalker.iside", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks on I-side.", - "UMask": "0xc" + "UMask": "0x0c" }, { "EventName": "ls_tablewalker.ic_type1", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks IC Type 1.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "ls_tablewalker.ic_type0", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks IC Type 0.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ls_tablewalker.dside", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks on D-side.", - "UMask": "0x3" + "UMask": "0x03" }, { "EventName": "ls_tablewalker.dc_type1", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks DC Type 1.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_tablewalker.dc_type0", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks DC Type 0.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_misal_accesses", @@ -150,31 +150,31 @@ "EventName": "ls_pref_instr_disp.prefetch_nta", "EventCode": "0x4b", "BriefDescription": "Software Prefetch Instructions (PREFETCHNTA instruction) Dispatched.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ls_pref_instr_disp.store_prefetch_w", "EventCode": "0x4b", "BriefDescription": "Software Prefetch Instructions (3DNow PREFETCHW instruction) Dispatched.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_pref_instr_disp.load_prefetch_w", "EventCode": "0x4b", "BriefDescription": "Software Prefetch Instructions Dispatched. Prefetch, Prefetch_T0_T1_T2.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_inef_sw_pref.mab_mch_cnt", "EventCode": "0x52", "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit", "EventCode": "0x52", "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_not_halted_cyc", diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/other.json b/tools/perf/pmu-events/arch/x86/amdzen1/other.json index ff780098d36e..7626986ce1fb 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen1/other.json +++ b/tools/perf/pmu-events/arch/x86/amdzen1/other.json @@ -3,13 +3,13 @@ "EventName": "ic_oc_mode_switch.oc_ic_mode_switch", "EventCode": "0x28a", "BriefDescription": "OC Mode Switch. OC to IC mode switch.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ic_oc_mode_switch.ic_oc_mode_switch", "EventCode": "0x28a", "BriefDescription": "OC Mode Switch. IC to OC mode switch.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "de_dis_dispatch_token_stalls0.retire_token_stall", @@ -33,24 +33,24 @@ "EventName": "de_dis_dispatch_token_stalls0.alsq3_0_token_stall", "EventCode": "0xaf", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 3_0 Tokens unavailable.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "de_dis_dispatch_token_stalls0.alsq3_token_stall", "EventCode": "0xaf", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 3 Tokens unavailable.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "de_dis_dispatch_token_stalls0.alsq2_token_stall", "EventCode": "0xaf", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 2 Tokens unavailable.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "de_dis_dispatch_token_stalls0.alsq1_token_stall", "EventCode": "0xaf", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 1 Tokens unavailable.", - "UMask": "0x1" + "UMask": "0x01" } ] diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json index 3c954543d1ae..bf5083c1c260 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json +++ b/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json @@ -10,7 +10,7 @@ "EventName": "all_dc_accesses", "EventCode": "0x29", "BriefDescription": "All L1 Data Cache Accesses", - "UMask": "0x7" + "UMask": "0x07" }, { "MetricName": "all_l2_cache_accesses", diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/branch.json b/tools/perf/pmu-events/arch/x86/amdzen2/branch.json index f5d16846aa1d..84fb43fa59ad 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/branch.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/branch.json @@ -30,19 +30,19 @@ "EventName": "bp_l1_tlb_fetch_hit.if1g", "EventCode": "0x94", "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 1GB page.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "bp_l1_tlb_fetch_hit.if2m", "EventCode": "0x94", "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 2MB page.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "bp_l1_tlb_fetch_hit.if4k", "EventCode": "0x94", "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 4KB page.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "bp_tlb_rel", diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/cache.json b/tools/perf/pmu-events/arch/x86/amdzen2/cache.json index 899ccc81263c..c858fb9477e3 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/cache.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/cache.json @@ -27,25 +27,25 @@ "EventName": "l2_request_g1.change_to_x", "EventCode": "0x60", "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "l2_request_g1.prefetch_l2_cmd", "EventCode": "0x60", "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "l2_request_g1.l2_hw_pf", "EventCode": "0x60", "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "l2_request_g1.group2", "EventCode": "0x60", "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_request_g1.all_no_prefetch", @@ -80,31 +80,31 @@ "EventName": "l2_request_g2.ic_rd_sized_nc", "EventCode": "0x61", "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "l2_request_g2.smc_inval", "EventCode": "0x61", "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "l2_request_g2.bus_locks_originator", "EventCode": "0x61", "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "l2_request_g2.bus_locks_responses", "EventCode": "0x61", "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_latency.l2_cycles_waiting_on_fills", "EventCode": "0x62", "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_wcb_req.wcb_write", @@ -122,13 +122,13 @@ "EventName": "l2_wcb_req.zero_byte_store", "EventCode": "0x63", "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "l2_wcb_req.cl_zero", "EventCode": "0x63", "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_cache_req_stat.ls_rd_blk_cs", @@ -158,37 +158,37 @@ "EventName": "l2_cache_req_stat.ls_rd_blk_c", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types).", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "l2_cache_req_stat.ic_fill_hit_x", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "l2_cache_req_stat.ic_fill_hit_s", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit clean line in L2.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "l2_cache_req_stat.ic_fill_miss", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_cache_req_stat.ic_access_in_l2", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.", - "UMask": "0x7" + "UMask": "0x07" }, { "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2", "EventCode": "0x64", "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).", - "UMask": "0x9" + "UMask": "0x09" }, { "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2", @@ -200,7 +200,7 @@ "EventName": "l2_fill_pending.l2_fill_busy", "EventCode": "0x6d", "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l2_pf_hit_l2", @@ -255,19 +255,19 @@ "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g", "EventCode": "0x85", "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 1GB page.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m", "EventCode": "0x85", "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 2MB page.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k", "EventCode": "0x85", "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 4KB page.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "bp_snp_re_sync", @@ -278,43 +278,43 @@ "EventName": "ic_fetch_stall.ic_stall_any", "EventCode": "0x87", "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ic_fetch_stall.ic_stall_dq_empty", "EventCode": "0x87", "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ic_fetch_stall.ic_stall_back_pressure", "EventCode": "0x87", "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ic_cache_inval.l2_invalidating_probe", "EventCode": "0x8c", "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ic_cache_inval.fill_invalidated", "EventCode": "0x8c", "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ic_oc_mode_switch.oc_ic_mode_switch", "EventCode": "0x28a", "BriefDescription": "OC Mode Switch. OC to IC mode switch.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ic_oc_mode_switch.ic_oc_mode_switch", "EventCode": "0x28a", "BriefDescription": "OC Mode Switch. IC to OC mode switch.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "l3_request_g1.caching_l3_cache_accesses", diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/core.json b/tools/perf/pmu-events/arch/x86/amdzen2/core.json index 4b75183da94a..bed14829f0bc 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/core.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/core.json @@ -68,21 +68,21 @@ "EventCode": "0xcb", "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).", "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ex_ret_mmx_fp_instr.mmx_instr", "EventCode": "0xcb", "BriefDescription": "MMX instructions.", "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ex_ret_mmx_fp_instr.x87_instr", "EventCode": "0xcb", "BriefDescription": "x87 instructions.", "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ex_ret_cond", @@ -108,19 +108,19 @@ "EventName": "ex_tagged_ibs_ops.ibs_count_rollover", "EventCode": "0x1cf", "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret", "EventCode": "0x1cf", "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops", "EventCode": "0x1cf", "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ex_ret_fus_brnch_inst", diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json index 622a0c420e46..91ed96f2580b 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json @@ -4,35 +4,35 @@ "EventCode": "0x00", "BriefDescription": "Total number of fp uOps.", "PublicDescription": "Total number of fp uOps. The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS.", - "UMask": "0xf" + "UMask": "0x0f" }, { "EventName": "fpu_pipe_assignment.total3", "EventCode": "0x00", "BriefDescription": "Total number uOps assigned to pipe 3.", "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "fpu_pipe_assignment.total2", "EventCode": "0x00", "BriefDescription": "Total number uOps assigned to pipe 2.", "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "fpu_pipe_assignment.total1", "EventCode": "0x00", "BriefDescription": "Total number uOps assigned to pipe 1.", "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "fpu_pipe_assignment.total0", "EventCode": "0x00", "BriefDescription": "Total number of fp uOps on pipe 0.", "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "fp_ret_sse_avx_ops.all", @@ -45,96 +45,96 @@ "EventCode": "0x03", "BriefDescription": "Multiply-add FLOPS. Multiply-add counts as 2 FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.", "PublicDescription": "", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "fp_ret_sse_avx_ops.div_flops", "EventCode": "0x03", "BriefDescription": "Divide/square root FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "fp_ret_sse_avx_ops.mult_flops", "EventCode": "0x03", "BriefDescription": "Multiply FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "fp_ret_sse_avx_ops.add_sub_flops", "EventCode": "0x03", "BriefDescription": "Add/subtract FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "fp_num_mov_elim_scal_op.optimized", "EventCode": "0x04", "BriefDescription": "Number of Scalar Ops optimized. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "fp_num_mov_elim_scal_op.opt_potential", "EventCode": "0x04", "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass). This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim", "EventCode": "0x04", "BriefDescription": "Number of SSE Move Ops eliminated. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops", "EventCode": "0x04", "BriefDescription": "Number of SSE Move Ops. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "fp_retired_ser_ops.sse_bot_ret", "EventCode": "0x05", "BriefDescription": "SSE bottom-executing uOps retired. The number of serializing Ops retired.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "fp_retired_ser_ops.sse_ctrl_ret", "EventCode": "0x05", "BriefDescription": "The number of serializing Ops retired. SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "fp_retired_ser_ops.x87_bot_ret", "EventCode": "0x05", "BriefDescription": "x87 bottom-executing uOps retired. The number of serializing Ops retired.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "fp_retired_ser_ops.x87_ctrl_ret", "EventCode": "0x05", "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits. The number of serializing Ops retired.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "fp_disp_faults.ymm_spill_fault", "EventCode": "0x0e", "BriefDescription": "Floating Point Dispatch Faults. YMM spill fault.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "fp_disp_faults.ymm_fill_fault", "EventCode": "0x0e", "BriefDescription": "Floating Point Dispatch Faults. YMM fill fault.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "fp_disp_faults.xmm_fill_fault", "EventCode": "0x0e", "BriefDescription": "Floating Point Dispatch Faults. XMM fill fault.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "fp_disp_faults.x87_fill_fault", "EventCode": "0x0e", "BriefDescription": "Floating Point Dispatch Faults. x87 fill fault.", - "UMask": "0x1" + "UMask": "0x01" } ] diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/memory.json b/tools/perf/pmu-events/arch/x86/amdzen2/memory.json index 609d9e3da3f7..89822b9ddb79 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/memory.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/memory.json @@ -4,31 +4,31 @@ "EventCode": "0x24", "BriefDescription": "Non-forwardable conflict; used to reduce STLI's via software. All reasons. Store To Load Interlock (STLI) are loads that were unable to complete because of a possible match with an older store, and the older store could not do STLF for some reason.", "PublicDescription" : "Store-to-load conflicts: A load was unable to complete due to a non-forwardable conflict with an older store. Most commonly, a load's address range partially but not completely overlaps with an uncompleted older store. Software can avoid this problem by using same-size and same-alignment loads and stores when accessing the same data. Vector/SIMD code is particularly susceptible to this problem; software should construct wide vector stores by manipulating vector elements in registers using shuffle/blend/swap instructions prior to storing to memory, instead of using narrow element-by-element stores.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_locks.spec_lock_hi_spec", "EventCode": "0x25", "BriefDescription": "Retired lock instructions. High speculative cacheable lock speculation succeeded.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "ls_locks.spec_lock_lo_spec", "EventCode": "0x25", "BriefDescription": "Retired lock instructions. Low speculative cacheable lock speculation succeeded.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ls_locks.non_spec_lock", "EventCode": "0x25", "BriefDescription": "Retired lock instructions. Non-speculative lock succeeded.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_locks.bus_lock", "EventCode": "0x25", "BriefDescription": "Retired lock instructions. Bus lock when a locked operations crosses a cache boundary or is done on an uncacheable memory type. Comparable to legacy bus lock.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_ret_cl_flush", @@ -44,19 +44,19 @@ "EventName": "ls_dispatch.ld_st_dispatch", "EventCode": "0x29", "BriefDescription": "Dispatch of a single op that performs a load from and store to the same memory address. Number of single ops that do load/store to an address.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ls_dispatch.store_dispatch", "EventCode": "0x29", "BriefDescription": "Number of stores dispatched. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_dispatch.ld_dispatch", "EventCode": "0x29", "BriefDescription": "Number of loads dispatched. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_smi_rx", @@ -93,19 +93,19 @@ "EventName": "ls_mab_alloc.dc_prefetcher", "EventCode": "0x41", "BriefDescription": "LS MAB Allocates by Type. DC prefetcher.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "ls_mab_alloc.stores", "EventCode": "0x41", "BriefDescription": "LS MAB Allocates by Type. Stores.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_mab_alloc.loads", "EventCode": "0x41", "BriefDescription": "LS MAB Allocates by Type. Loads.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_refills_from_sys.ls_mabresp_rmt_dram", @@ -123,19 +123,19 @@ "EventName": "ls_refills_from_sys.ls_mabresp_lcl_dram", "EventCode": "0x43", "BriefDescription": "Demand Data Cache Fills by Data Source. DRAM or IO from this thread's die.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "ls_refills_from_sys.ls_mabresp_lcl_cache", "EventCode": "0x43", "BriefDescription": "Demand Data Cache Fills by Data Source. Hit in cache; local CCX (not Local L2), or Remote CCX and the address's Home Node is on this thread's die.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_refills_from_sys.ls_mabresp_lcl_l2", "EventCode": "0x43", "BriefDescription": "Demand Data Cache Fills by Data Source. Local L2 hit.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_l1_d_tlb_miss.all", @@ -171,61 +171,61 @@ "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit", "EventCode": "0x45", "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that hit in the L2 TLB.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit", "EventCode": "0x45", "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that hit in the L2 TLB.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit", "EventCode": "0x45", "BriefDescription": "L1 DTLB Miss. DTLB reload hit a coalesced page.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit", "EventCode": "0x45", "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that hit in the L2 TLB.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_tablewalker.iside", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks on I-side.", - "UMask": "0xc" + "UMask": "0x0c" }, { "EventName": "ls_tablewalker.ic_type1", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks IC Type 1.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "ls_tablewalker.ic_type0", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks IC Type 0.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ls_tablewalker.dside", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks on D-side.", - "UMask": "0x3" + "UMask": "0x03" }, { "EventName": "ls_tablewalker.dc_type1", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks DC Type 1.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_tablewalker.dc_type0", "EventCode": "0x46", "BriefDescription": "Total Page Table Walks DC Type 0.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_misal_accesses", @@ -242,31 +242,31 @@ "EventName": "ls_pref_instr_disp.prefetch_nta", "EventCode": "0x4b", "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchNTA instruction. See docAPM3 PREFETCHlevel.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "ls_pref_instr_disp.prefetch_w", "EventCode": "0x4b", "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). See docAPM3 PREFETCHW.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_pref_instr_disp.prefetch", "EventCode": "0x4b", "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). Prefetch_T0_T1_T2. PrefetchT0, T1 and T2 instructions. See docAPM3 PREFETCHlevel.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_inef_sw_pref.mab_mch_cnt", "EventCode": "0x52", "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit", "EventCode": "0x52", "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_sw_pf_dc_fill.ls_mabresp_rmt_dram", @@ -284,19 +284,19 @@ "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_dram", "EventCode": "0x59", "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. DRAM or IO from this thread's die. From DRAM (home node local).", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_cache", "EventCode": "0x59", "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From another cache (home node local).", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_l2", "EventCode": "0x59", "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. Local L2 hit.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_hw_pf_dc_fill.ls_mabresp_rmt_dram", @@ -314,19 +314,19 @@ "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_dram", "EventCode": "0x5a", "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM (home node local).", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_cache", "EventCode": "0x5a", "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From another cache (home node local).", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_l2", "EventCode": "0x5a", "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. Local L2 hit.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "ls_not_halted_cyc", diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/other.json b/tools/perf/pmu-events/arch/x86/amdzen2/other.json index e94994d4a60e..1bdf106ca785 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/other.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/other.json @@ -14,13 +14,13 @@ "EventName": "de_dis_uops_from_decoder.opcache_dispatched", "EventCode": "0xaa", "BriefDescription": "Count of dispatched Ops from OpCache.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "de_dis_uops_from_decoder.decoder_dispatched", "EventCode": "0xaa", "BriefDescription": "Count of dispatched Ops from Decoder.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "de_dis_dispatch_token_stalls1.fp_misc_rsrc_stall", @@ -50,25 +50,25 @@ "EventName": "de_dis_dispatch_token_stalls1.int_sched_misc_token_stall", "EventCode": "0xae", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Scheduler miscellaneous resource stall.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "de_dis_dispatch_token_stalls1.store_queue_token_stall", "EventCode": "0xae", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Store queue resource stall. Applies to all ops with store semantics.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "de_dis_dispatch_token_stalls1.load_queue_token_stall", "EventCode": "0xae", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Load queue resource stall. Applies to all ops with load semantics.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "de_dis_dispatch_token_stalls1.int_phy_reg_file_token_stall", "EventCode": "0xae", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Physical Register File resource stall. Applies to all ops that have an integer destination register.", - "UMask": "0x1" + "UMask": "0x01" }, { "EventName": "de_dis_dispatch_token_stalls0.sc_agu_dispatch_stall", @@ -92,24 +92,24 @@ "EventName": "de_dis_dispatch_token_stalls0.alu_token_stall", "EventCode": "0xaf", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALU tokens total unavailable.", - "UMask": "0x8" + "UMask": "0x08" }, { "EventName": "de_dis_dispatch_token_stalls0.alsq3_0_token_stall", "EventCode": "0xaf", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ3_0_TokenStall.", - "UMask": "0x4" + "UMask": "0x04" }, { "EventName": "de_dis_dispatch_token_stalls0.alsq2_token_stall", "EventCode": "0xaf", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 2 Tokens unavailable.", - "UMask": "0x2" + "UMask": "0x02" }, { "EventName": "de_dis_dispatch_token_stalls0.alsq1_token_stall", "EventCode": "0xaf", "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 1 Tokens unavailable.", - "UMask": "0x1" + "UMask": "0x01" } ] diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json index 1c624cee9ef4..a71694a043ba 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json @@ -10,7 +10,7 @@ "EventName": "all_dc_accesses", "EventCode": "0x29", "BriefDescription": "All L1 Data Cache Accesses", - "UMask": "0x7" + "UMask": "0x07" }, { "MetricName": "all_l2_cache_accesses", -- cgit v1.2.3 From da66658638c947cab0fb157289f03698453ff8d5 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Tue, 6 Apr 2021 16:59:44 -0500 Subject: perf vendor events amd: Add Zen3 events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add PMU events for AMD Zen3 processors as documented in the AMD Processor Programming Reference for Family 19h and Model 01h [1]. Below are the events which are new on Zen3: PMCx041 ls_mab_alloc.{all_allocations|hardware_prefetcher_allocations|load_store_allocations} PMCx043 ls_dmnd_fills_from_sys.ext_cache_local PMCx044 ls_any_fills_from_sys.{mem_io_remote|ext_cache_remote|mem_io_local|ext_cache_local|int_cache|lcl_l2} PMCx047 ls_misal_loads.{ma4k|ma64} PMCx059 ls_sw_pf_dc_fills.ext_cache_local PMCx05a ls_hw_pf_dc_fills.ext_cache_local PMCx05f ls_alloc_mab_count PMCx085 bp_l1_tlb_miss_l2_tlb_miss.coalesced_4k PMCx0ab de_dis_cops_from_decoder.disp_op_type.{any_integer_dispatch|any_fp_dispatch} PMCx0cc ex_ret_ind_brch_instr PMCx18e ic_tag_hit_miss.{all_instruction_cache_accesses|instruction_cache_miss|instruction_cache_hit} PMCx1c7 ex_ret_msprd_brnch_instr_dir_msmtch PMCx28f op_cache_hit_miss.{all_op_cache_accesses|op_cache_miss|op_cache_hit} Section 2.1.17.2 "Performance Measurement" of "PPR for AMD Family 19h, Model 01h, Revision B1 Processors - 55898 Rev 0.35 - Feb 5, 2021." lists new metrics. Add them. Preserve the events for Zen3 if they are measurable and non-zero as taken from Zen2 directory even if the PPR of Zen3 [1] omits them. Those events are the following: PMCx000 fpu_pipe_assignment.{total|total0|total1|total2|total3} PMCx004 fp_num_mov_elim_scal_op.{optimized|opt_potential|sse_mov_ops_elim|sse_mov_ops} PMCx02D ls_rdtsc PMCx040 ls_dc_accesses PMCx046 ls_tablewalker.{iside|ic_type1|ic_type0|dside|dc_type1|dc_type0} PMCx061 l2_request_g2.{group1|ls_rd_sized|ls_rd_sized_nc|ic_rd_sized|ic_rd_sized_nc|smc_inval|bus_lock_originator|bus_locks_responses} PMCx062 l2_latency.l2_cycles_waiting_on_fills PMCx063 l2_wcb_req.{wcb_write|wcb_close|zero_byte_store|cl_zero} PMCx06d l2_fill_pending.l2_fill_busy PMCx080 ic_fw32 PMCx081 ic_fw32_miss PMCx086 bp_snp_re_sync PMCx087 ic_fetch_stall.{ic_stall_any|ic_stall_dq_empty|ic_stall_back_pressure} PMCx08a bp_l1_btb_correct PMCx08c ic_cache_inval.{l2_invalidating_probe|fill_invalidated} PMCx099 bp_tlb_rel PMCx0a9 de_dis_uop_queue_empty_di0 PMCx0c7 ex_ret_brn_resync PMCx28a ic_oc_mode_switch.{oc_ic_mode_switch|ic_oc_mode_switch} L3PMCx01 l3_request_g1.caching_l3_cache_accesses L3PMCx06 l3_comb_clstr_state.{other_l3_miss_typs|request_miss} [1] Processor Programming Reference (PPR) for AMD Family 19h, Model 01h, Revision B1 Processors - 55898 Rev 0.35 - Feb 5, 2021. [2] Processor Programming Reference (PPR) for AMD Family 17h Model 71h, Revision B0 Processors, 56176 Rev 3.06 - Jul 17, 2019. [3] Processor Programming Reference (PPR) for AMD Family 17h Models 01h,08h, Revision B2 Processors, 54945 Rev 3.03 - Jun 14, 2019. All of the PPRs can be found at: https://bugzilla.kernel.org/show_bug.cgi?id=206537 Reviewed-by: Robert Richter Signed-off-by: Smita Koralahalli Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kim Phillips Cc: Mark Rutland Cc: Martin Liška Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Vijay Thakkar Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20210406215944.113332-5-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/amdzen3/branch.json | 53 +++ tools/perf/pmu-events/arch/x86/amdzen3/cache.json | 402 +++++++++++++++++++ tools/perf/pmu-events/arch/x86/amdzen3/core.json | 137 +++++++ .../pmu-events/arch/x86/amdzen3/data-fabric.json | 98 +++++ .../arch/x86/amdzen3/floating-point.json | 139 +++++++ tools/perf/pmu-events/arch/x86/amdzen3/memory.json | 428 +++++++++++++++++++++ tools/perf/pmu-events/arch/x86/amdzen3/other.json | 103 +++++ .../pmu-events/arch/x86/amdzen3/recommended.json | 214 +++++++++++ tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +- 9 files changed, 1575 insertions(+), 1 deletion(-) create mode 100644 tools/perf/pmu-events/arch/x86/amdzen3/branch.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen3/cache.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen3/core.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen3/memory.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen3/other.json create mode 100644 tools/perf/pmu-events/arch/x86/amdzen3/recommended.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/branch.json b/tools/perf/pmu-events/arch/x86/amdzen3/branch.json new file mode 100644 index 000000000000..018a7fe94fb9 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen3/branch.json @@ -0,0 +1,53 @@ +[ + { + "EventName": "bp_l1_btb_correct", + "EventCode": "0x8a", + "BriefDescription": "L1 Branch Prediction Overrides Existing Prediction (speculative)." + }, + { + "EventName": "bp_l2_btb_correct", + "EventCode": "0x8b", + "BriefDescription": "L2 Branch Prediction Overrides Existing Prediction (speculative)." + }, + { + "EventName": "bp_dyn_ind_pred", + "EventCode": "0x8e", + "BriefDescription": "Dynamic Indirect Predictions.", + "PublicDescription": "The number of times a branch used the indirect predictor to make a prediction." + }, + { + "EventName": "bp_de_redirect", + "EventCode": "0x91", + "BriefDescription": "Decode Redirects", + "PublicDescription": "The number of times the instruction decoder overrides the predicted target." + }, + { + "EventName": "bp_l1_tlb_fetch_hit", + "EventCode": "0x94", + "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB.", + "UMask": "0xff" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.if1g", + "EventCode": "0x94", + "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instruction TLB hit (1G page size).", + "UMask": "0x04" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.if2m", + "EventCode": "0x94", + "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instruction TLB hit (2M page size).", + "UMask": "0x02" + }, + { + "EventName": "bp_l1_tlb_fetch_hit.if4k", + "EventCode": "0x94", + "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instrcution TLB hit (4K or 16K page size).", + "UMask": "0x01" + }, + { + "EventName": "bp_tlb_rel", + "EventCode": "0x99", + "BriefDescription": "The number of ITLB reload requests." + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/cache.json b/tools/perf/pmu-events/arch/x86/amdzen3/cache.json new file mode 100644 index 000000000000..fa1d7499a2e3 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen3/cache.json @@ -0,0 +1,402 @@ +[ + { + "EventName": "l2_request_g1.rd_blk_l", + "EventCode": "0x60", + "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache reads (including hardware and software prefetch).", + "UMask": "0x80" + }, + { + "EventName": "l2_request_g1.rd_blk_x", + "EventCode": "0x60", + "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache stores.", + "UMask": "0x40" + }, + { + "EventName": "l2_request_g1.ls_rd_blk_c_s", + "EventCode": "0x60", + "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache shared reads.", + "UMask": "0x20" + }, + { + "EventName": "l2_request_g1.cacheable_ic_read", + "EventCode": "0x60", + "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Instruction cache reads.", + "UMask": "0x10" + }, + { + "EventName": "l2_request_g1.change_to_x", + "EventCode": "0x60", + "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.", + "UMask": "0x08" + }, + { + "EventName": "l2_request_g1.prefetch_l2_cmd", + "EventCode": "0x60", + "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.", + "UMask": "0x04" + }, + { + "EventName": "l2_request_g1.l2_hw_pf", + "EventCode": "0x60", + "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.", + "UMask": "0x02" + }, + { + "EventName": "l2_request_g1.group2", + "EventCode": "0x60", + "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).", + "UMask": "0x01" + }, + { + "EventName": "l2_request_g1.all_no_prefetch", + "EventCode": "0x60", + "UMask": "0xf9" + }, + { + "EventName": "l2_request_g2.group1", + "EventCode": "0x61", + "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g1 (PMCx060).", + "UMask": "0x80" + }, + { + "EventName": "l2_request_g2.ls_rd_sized", + "EventCode": "0x61", + "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Data cache read sized.", + "UMask": "0x40" + }, + { + "EventName": "l2_request_g2.ls_rd_sized_nc", + "EventCode": "0x61", + "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Data cache read sized non-cacheable.", + "UMask": "0x20" + }, + { + "EventName": "l2_request_g2.ic_rd_sized", + "EventCode": "0x61", + "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized.", + "UMask": "0x10" + }, + { + "EventName": "l2_request_g2.ic_rd_sized_nc", + "EventCode": "0x61", + "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.", + "UMask": "0x08" + }, + { + "EventName": "l2_request_g2.smc_inval", + "EventCode": "0x61", + "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.", + "UMask": "0x04" + }, + { + "EventName": "l2_request_g2.bus_locks_originator", + "EventCode": "0x61", + "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.", + "UMask": "0x02" + }, + { + "EventName": "l2_request_g2.bus_locks_responses", + "EventCode": "0x61", + "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.", + "UMask": "0x01" + }, + { + "EventName": "l2_latency.l2_cycles_waiting_on_fills", + "EventCode": "0x62", + "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.", + "UMask": "0x01" + }, + { + "EventName": "l2_wcb_req.wcb_write", + "EventCode": "0x63", + "BriefDescription": "LS to L2 WCB write requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) write requests.", + "UMask": "0x40" + }, + { + "EventName": "l2_wcb_req.wcb_close", + "EventCode": "0x63", + "BriefDescription": "LS to L2 WCB close requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) close requests.", + "UMask": "0x20" + }, + { + "EventName": "l2_wcb_req.zero_byte_store", + "EventCode": "0x63", + "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.", + "UMask": "0x04" + }, + { + "EventName": "l2_wcb_req.cl_zero", + "EventCode": "0x63", + "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.", + "UMask": "0x01" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_cs", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache shared read hit in L2", + "UMask": "0x80" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache read hit in L2. Modifiable.", + "UMask": "0x40" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_s", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache read hit non-modifiable line in L2.", + "UMask": "0x20" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache store or state change hit in L2.", + "UMask": "0x10" + }, + { + "EventName": "l2_cache_req_stat.ls_rd_blk_c", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types). Use l2_cache_misses_from_dc_misses instead.", + "UMask": "0x08" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_hit_x", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.", + "UMask": "0x04" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_hit_s", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit non-modifiable line in L2.", + "UMask": "0x02" + }, + { + "EventName": "l2_cache_req_stat.ic_fill_miss", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2. Use l2_cache_misses_from_ic_miss instead.", + "UMask": "0x01" + }, + { + "EventName": "l2_cache_req_stat.ic_access_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.", + "UMask": "0x07" + }, + { + "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).", + "UMask": "0x09" + }, + { + "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2", + "EventCode": "0x64", + "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request hit in L2 and Data cache request hit in L2 (all types).", + "UMask": "0xf6" + }, + { + "EventName": "l2_fill_pending.l2_fill_busy", + "EventCode": "0x6d", + "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.", + "UMask": "0x01" + }, + { + "EventName": "l2_pf_hit_l2", + "EventCode": "0x70", + "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.", + "UMask": "0xff" + }, + { + "EventName": "l2_pf_miss_l2_hit_l3", + "EventCode": "0x71", + "BriefDescription": "L2 prefetcher hits in L3. Counts all L2 prefetches accepted by the L2 pipeline which miss the L2 cache and hit the L3.", + "UMask": "0xff" + }, + { + "EventName": "l2_pf_miss_l2_l3", + "EventCode": "0x72", + "BriefDescription": "L2 prefetcher misses in L3. Counts all L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3 caches.", + "UMask": "0xff" + }, + { + "EventName": "ic_fw32", + "EventCode": "0x80", + "BriefDescription": "The number of 32B fetch windows transferred from IC pipe to DE instruction decoder (includes non-cacheable and cacheable fill responses)." + }, + { + "EventName": "ic_fw32_miss", + "EventCode": "0x81", + "BriefDescription": "The number of 32B fetch windows tried to read the L1 IC and missed in the full tag." + }, + { + "EventName": "ic_cache_fill_l2", + "EventCode": "0x82", + "BriefDescription": "Instruction Cache Refills from L2. The number of 64 byte instruction cache line was fulfilled from the L2 cache." + }, + { + "EventName": "ic_cache_fill_sys", + "EventCode": "0x83", + "BriefDescription": "Instruction Cache Refills from System. The number of 64 byte instruction cache line fulfilled from system memory or another cache." + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_hit", + "EventCode": "0x84", + "BriefDescription": "L1 ITLB Miss, L2 ITLB Hit. The number of instruction fetches that miss in the L1 ITLB but hit in the L2 ITLB." + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss", + "EventCode": "0x85", + "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs.", + "UMask": "0xff" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.coalesced_4k", + "EventCode": "0x85", + "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for >4K Coalesced page.", + "UMask": "0x08" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g", + "EventCode": "0x85", + "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for 1G page.", + "UMask": "0x04" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m", + "EventCode": "0x85", + "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for 2M page.", + "UMask": "0x02" + }, + { + "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k", + "EventCode": "0x85", + "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk to 4K page.", + "UMask": "0x01" + }, + { + "EventName": "bp_snp_re_sync", + "EventCode": "0x86", + "BriefDescription": "The number of pipeline restarts caused by invalidating probes that hit on the instruction stream currently being executed. This would happen if the active instruction stream was being modified by another processor in an MP system - typically a highly unlikely event." + }, + { + "EventName": "ic_fetch_stall.ic_stall_any", + "EventCode": "0x87", + "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).", + "UMask": "0x04" + }, + { + "EventName": "ic_fetch_stall.ic_stall_dq_empty", + "EventCode": "0x87", + "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.", + "UMask": "0x02" + }, + { + "EventName": "ic_fetch_stall.ic_stall_back_pressure", + "EventCode": "0x87", + "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.", + "UMask": "0x01" + }, + { + "EventName": "ic_cache_inval.l2_invalidating_probe", + "EventCode": "0x8c", + "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.", + "UMask": "0x02" + }, + { + "EventName": "ic_cache_inval.fill_invalidated", + "EventCode": "0x8c", + "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.", + "UMask": "0x01" + }, + { + "EventName": "ic_tag_hit_miss.all_instruction_cache_accesses", + "EventCode": "0x18e", + "BriefDescription": "All Instruction Cache Accesses. Counts various IC tag related hit and miss events.", + "UMask": "0x1f" + }, + { + "EventName": "ic_tag_hit_miss.instruction_cache_miss", + "EventCode": "0x18e", + "BriefDescription": "Instruction Cache Miss. Counts various IC tag related hit and miss events.", + "UMask": "0x18" + }, + { + "EventName": "ic_tag_hit_miss.instruction_cache_hit", + "EventCode": "0x18e", + "BriefDescription": "Instruction Cache Hit. Counts various IC tag related hit and miss events.", + "UMask": "0x07" + }, + { + "EventName": "ic_oc_mode_switch.oc_ic_mode_switch", + "EventCode": "0x28a", + "BriefDescription": "OC Mode Switch. OC to IC mode switch.", + "UMask": "0x02" + }, + { + "EventName": "ic_oc_mode_switch.ic_oc_mode_switch", + "EventCode": "0x28a", + "BriefDescription": "OC Mode Switch. IC to OC mode switch.", + "UMask": "0x01" + }, + { + "EventName": "op_cache_hit_miss.all_op_cache_accesses", + "EventCode": "0x28f", + "BriefDescription": "All Op Cache accesses. Counts Op Cache micro-tag hit/miss events", + "UMask": "0x07" + }, + { + "EventName": "op_cache_hit_miss.op_cache_miss", + "EventCode": "0x28f", + "BriefDescription": "Op Cache Miss. Counts Op Cache micro-tag hit/miss events", + "UMask": "0x04" + }, + { + "EventName": "op_cache_hit_miss.op_cache_hit", + "EventCode": "0x28f", + "BriefDescription": "Op Cache Hit. Counts Op Cache micro-tag hit/miss events", + "UMask": "0x03" + }, + { + "EventName": "l3_request_g1.caching_l3_cache_accesses", + "EventCode": "0x01", + "BriefDescription": "Caching: L3 cache accesses", + "UMask": "0x80", + "Unit": "L3PMC" + }, + { + "EventName": "l3_lookup_state.all_l3_req_typs", + "EventCode": "0x04", + "BriefDescription": "All L3 Request Types. All L3 cache Requests", + "UMask": "0xff", + "Unit": "L3PMC" + }, + { + "EventName": "l3_comb_clstr_state.other_l3_miss_typs", + "EventCode": "0x06", + "BriefDescription": "Other L3 Miss Request Types", + "UMask": "0xfe", + "Unit": "L3PMC" + }, + { + "EventName": "l3_comb_clstr_state.request_miss", + "EventCode": "0x06", + "BriefDescription": "L3 cache misses", + "UMask": "0x01", + "Unit": "L3PMC" + }, + { + "EventName": "xi_sys_fill_latency", + "EventCode": "0x90", + "BriefDescription": "L3 Cache Miss Latency. Total cycles for all transactions divided by 16. Ignores SliceMask and ThreadMask.", + "Unit": "L3PMC" + }, + { + "EventName": "xi_ccx_sdp_req1", + "EventCode": "0x9a", + "BriefDescription": "L3 Misses by Request Type. Ignores SliceID, EnAllSlices, CoreID, EnAllCores and ThreadMask. Requires unit mask 0xFF to engage event for counting.", + "UMask": "0xff", + "Unit": "L3PMC" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/core.json b/tools/perf/pmu-events/arch/x86/amdzen3/core.json new file mode 100644 index 000000000000..4e27a2be359e --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen3/core.json @@ -0,0 +1,137 @@ +[ + { + "EventName": "ex_ret_instr", + "EventCode": "0xc0", + "BriefDescription": "Retired Instructions." + }, + { + "EventName": "ex_ret_ops", + "EventCode": "0xc1", + "BriefDescription": "Retired Ops. Use macro_ops_retired instead.", + "PublicDescription": "The number of macro-ops retired." + }, + { + "EventName": "ex_ret_brn", + "EventCode": "0xc2", + "BriefDescription": "Retired Branch Instructions.", + "PublicDescription": "The number of branch instructions retired. This includes all types of architectural control flow changes, including exceptions and interrupts." + }, + { + "EventName": "ex_ret_brn_misp", + "EventCode": "0xc3", + "BriefDescription": "Retired Branch Instructions Mispredicted.", + "PublicDescription": "The number of retired branch instructions, that were mispredicted." + }, + { + "EventName": "ex_ret_brn_tkn", + "EventCode": "0xc4", + "BriefDescription": "Retired Taken Branch Instructions.", + "PublicDescription": "The number of taken branches that were retired. This includes all types of architectural control flow changes, including exceptions and interrupts." + }, + { + "EventName": "ex_ret_brn_tkn_misp", + "EventCode": "0xc5", + "BriefDescription": "Retired Taken Branch Instructions Mispredicted.", + "PublicDescription": "The number of retired taken branch instructions that were mispredicted." + }, + { + "EventName": "ex_ret_brn_far", + "EventCode": "0xc6", + "BriefDescription": "Retired Far Control Transfers.", + "PublicDescription": "The number of far control transfers retired including far call/jump/return, IRET, SYSCALL and SYSRET, plus exceptions and interrupts. Far control transfers are not subject to branch prediction." + }, + { + "EventName": "ex_ret_brn_resync", + "EventCode": "0xc7", + "BriefDescription": "Retired Branch Resyncs.", + "PublicDescription": "The number of resync branches. These reflect pipeline restarts due to certain microcode assists and events such as writes to the active instruction stream, among other things. Each occurrence reflects a restart penalty similar to a branch mispredict. This is relatively rare." + }, + { + "EventName": "ex_ret_near_ret", + "EventCode": "0xc8", + "BriefDescription": "Retired Near Returns.", + "PublicDescription": "The number of near return instructions (RET or RET Iw) retired." + }, + { + "EventName": "ex_ret_near_ret_mispred", + "EventCode": "0xc9", + "BriefDescription": "Retired Near Returns Mispredicted.", + "PublicDescription": "The number of near returns retired that were not correctly predicted by the return address predictor. Each such mispredict incurs the same penalty as a mispredicted conditional branch instruction." + }, + { + "EventName": "ex_ret_brn_ind_misp", + "EventCode": "0xca", + "BriefDescription": "Retired Indirect Branch Instructions Mispredicted.", + "PublicDescription": "The number of indirect branches retired that were not correctly predicted. Each such mispredict incurs the same penalty as a mispredicted conditional branch instruction. Note that only EX mispredicts are counted." + }, + { + "EventName": "ex_ret_mmx_fp_instr.sse_instr", + "EventCode": "0xcb", + "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).", + "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS.", + "UMask": "0x04" + }, + { + "EventName": "ex_ret_mmx_fp_instr.mmx_instr", + "EventCode": "0xcb", + "BriefDescription": "MMX instructions.", + "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.", + "UMask": "0x02" + }, + { + "EventName": "ex_ret_mmx_fp_instr.x87_instr", + "EventCode": "0xcb", + "BriefDescription": "x87 instructions.", + "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.", + "UMask": "0x01" + }, + { + "EventName": "ex_ret_ind_brch_instr", + "EventCode": "0xcc", + "BriefDescription": "Retired Indirect Branch Instructions. The number of indirect branches retired." + }, + { + "EventName": "ex_ret_cond", + "EventCode": "0xd1", + "BriefDescription": "Retired Conditional Branch Instructions." + }, + { + "EventName": "ex_div_busy", + "EventCode": "0xd3", + "BriefDescription": "Div Cycles Busy count." + }, + { + "EventName": "ex_div_count", + "EventCode": "0xd4", + "BriefDescription": "Div Op Count." + }, + { + "EventName": "ex_ret_msprd_brnch_instr_dir_msmtch", + "EventCode": "0x1c7", + "BriefDescription": "Retired Mispredicted Branch Instructions due to Direction Mismatch", + "PublicDescription": "The number of retired conditional branch instructions that were not correctly predicted because of a branch direction mismatch." + }, + { + "EventName": "ex_tagged_ibs_ops.ibs_count_rollover", + "EventCode": "0x1cf", + "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.", + "UMask": "0x04" + }, + { + "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret", + "EventCode": "0x1cf", + "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.", + "UMask": "0x02" + }, + { + "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops", + "EventCode": "0x1cf", + "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.", + "UMask": "0x01" + }, + { + "EventName": "ex_ret_fused_instr", + "EventCode": "0x1d0", + "BriefDescription": "Counts retired Fused Instructions." + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json b/tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json new file mode 100644 index 000000000000..40271df40015 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json @@ -0,0 +1,98 @@ +[ + { + "EventName": "remote_outbound_data_controller_0", + "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 0", + "EventCode": "0x7c7", + "UMask": "0x02", + "PerPkg": "1", + "Unit": "DFPMC" + }, + { + "EventName": "remote_outbound_data_controller_1", + "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 1", + "EventCode": "0x807", + "UMask": "0x02", + "PerPkg": "1", + "Unit": "DFPMC" + }, + { + "EventName": "remote_outbound_data_controller_2", + "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 2", + "EventCode": "0x847", + "UMask": "0x02", + "PerPkg": "1", + "Unit": "DFPMC" + }, + { + "EventName": "remote_outbound_data_controller_3", + "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 3", + "EventCode": "0x887", + "UMask": "0x02", + "PerPkg": "1", + "Unit": "DFPMC" + }, + { + "EventName": "dram_channel_data_controller_0", + "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0", + "EventCode": "0x07", + "UMask": "0x38", + "PerPkg": "1", + "Unit": "DFPMC" + }, + { + "EventName": "dram_channel_data_controller_1", + "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0", + "EventCode": "0x47", + "UMask": "0x38", + "PerPkg": "1", + "Unit": "DFPMC" + }, + { + "EventName": "dram_channel_data_controller_2", + "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0", + "EventCode": "0x87", + "UMask": "0x38", + "PerPkg": "1", + "Unit": "DFPMC" + }, + { + "EventName": "dram_channel_data_controller_3", + "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0", + "EventCode": "0xc7", + "UMask": "0x38", + "PerPkg": "1", + "Unit": "DFPMC" + }, + { + "EventName": "dram_channel_data_controller_4", + "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0", + "EventCode": "0x107", + "UMask": "0x38", + "PerPkg": "1", + "Unit": "DFPMC" + }, + { + "EventName": "dram_channel_data_controller_5", + "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0", + "EventCode": "0x147", + "UMask": "0x38", + "PerPkg": "1", + "Unit": "DFPMC" + }, + { + "EventName": "dram_channel_data_controller_6", + "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0", + "EventCode": "0x187", + "UMask": "0x38", + "PerPkg": "1", + "Unit": "DFPMC" + }, + { + "EventName": "dram_channel_data_controller_7", + "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0", + "EventCode": "0x1c7", + "UMask": "0x38", + "PerPkg": "1", + "Unit": "DFPMC" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json new file mode 100644 index 000000000000..98cfcb9c78ec --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json @@ -0,0 +1,139 @@ +[ + { + "EventName": "fpu_pipe_assignment.total", + "EventCode": "0x00", + "BriefDescription": "Total number of fp uOps.", + "PublicDescription": "Total number of fp uOps. The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS.", + "UMask": "0x0f" + }, + { + "EventName": "fpu_pipe_assignment.total3", + "EventCode": "0x00", + "BriefDescription": "Total number uOps assigned to pipe 3.", + "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.", + "UMask": "0x08" + }, + { + "EventName": "fpu_pipe_assignment.total2", + "EventCode": "0x00", + "BriefDescription": "Total number uOps assigned to pipe 2.", + "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.", + "UMask": "0x04" + }, + { + "EventName": "fpu_pipe_assignment.total1", + "EventCode": "0x00", + "BriefDescription": "Total number uOps assigned to pipe 1.", + "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.", + "UMask": "0x02" + }, + { + "EventName": "fpu_pipe_assignment.total0", + "EventCode": "0x00", + "BriefDescription": "Total number of fp uOps on pipe 0.", + "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.", + "UMask": "0x01" + }, + { + "EventName": "fp_ret_sse_avx_ops.all", + "EventCode": "0x03", + "BriefDescription": "All FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.", + "UMask": "0xff" + }, + { + "EventName": "fp_ret_sse_avx_ops.mac_flops", + "EventCode": "0x03", + "BriefDescription": "Multiply-Accumulate FLOPs. Each MAC operation is counted as 2 FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.", + "UMask": "0x08" + }, + { + "EventName": "fp_ret_sse_avx_ops.div_flops", + "EventCode": "0x03", + "BriefDescription": "Divide/square root FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.", + "UMask": "0x04" + }, + { + "EventName": "fp_ret_sse_avx_ops.mult_flops", + "EventCode": "0x03", + "BriefDescription": "Multiply FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.", + "UMask": "0x02" + }, + { + "EventName": "fp_ret_sse_avx_ops.add_sub_flops", + "EventCode": "0x03", + "BriefDescription": "Add/subtract FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.", + "UMask": "0x01" + }, + { + "EventName": "fp_num_mov_elim_scal_op.optimized", + "EventCode": "0x04", + "BriefDescription": "Number of Scalar Ops optimized. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.", + "UMask": "0x08" + }, + { + "EventName": "fp_num_mov_elim_scal_op.opt_potential", + "EventCode": "0x04", + "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass). This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.", + "UMask": "0x04" + }, + { + "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim", + "EventCode": "0x04", + "BriefDescription": "Number of SSE Move Ops eliminated. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.", + "UMask": "0x02" + }, + { + "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops", + "EventCode": "0x04", + "BriefDescription": "Number of SSE Move Ops. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.", + "UMask": "0x01" + }, + { + "EventName": "fp_retired_ser_ops.sse_bot_ret", + "EventCode": "0x05", + "BriefDescription": "SSE/AVX bottom-executing ops retired. The number of serializing Ops retired.", + "UMask": "0x08" + }, + { + "EventName": "fp_retired_ser_ops.sse_ctrl_ret", + "EventCode": "0x05", + "BriefDescription": "SSE/AVX control word mispredict traps. The number of serializing Ops retired.", + "UMask": "0x04" + }, + { + "EventName": "fp_retired_ser_ops.x87_bot_ret", + "EventCode": "0x05", + "BriefDescription": "x87 bottom-executing ops retired. The number of serializing Ops retired.", + "UMask": "0x02" + }, + { + "EventName": "fp_retired_ser_ops.x87_ctrl_ret", + "EventCode": "0x05", + "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits. The number of serializing Ops retired.", + "UMask": "0x01" + }, + { + "EventName": "fp_disp_faults.ymm_spill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating Point Dispatch Faults. YMM spill fault.", + "UMask": "0x08" + }, + { + "EventName": "fp_disp_faults.ymm_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating Point Dispatch Faults. YMM fill fault.", + "UMask": "0x04" + }, + { + "EventName": "fp_disp_faults.xmm_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating Point Dispatch Faults. XMM fill fault.", + "UMask": "0x02" + }, + { + "EventName": "fp_disp_faults.x87_fill_fault", + "EventCode": "0x0e", + "BriefDescription": "Floating Point Dispatch Faults. x87 fill fault.", + "UMask": "0x01" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/memory.json b/tools/perf/pmu-events/arch/x86/amdzen3/memory.json new file mode 100644 index 000000000000..a2833955dcd2 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen3/memory.json @@ -0,0 +1,428 @@ +[ + { + "EventName": "ls_bad_status2.stli_other", + "EventCode": "0x24", + "BriefDescription": "Non-forwardable conflict; used to reduce STLI's via software. All reasons. Store To Load Interlock (STLI) are loads that were unable to complete because of a possible match with an older store, and the older store could not do STLF for some reason.", + "PublicDescription" : "Store-to-load conflicts: A load was unable to complete due to a non-forwardable conflict with an older store. Most commonly, a load's address range partially but not completely overlaps with an uncompleted older store. Software can avoid this problem by using same-size and same-alignment loads and stores when accessing the same data. Vector/SIMD code is particularly susceptible to this problem; software should construct wide vector stores by manipulating vector elements in registers using shuffle/blend/swap instructions prior to storing to memory, instead of using narrow element-by-element stores.", + "UMask": "0x02" + }, + { + "EventName": "ls_locks.spec_lock_hi_spec", + "EventCode": "0x25", + "BriefDescription": "Retired lock instructions. High speculative cacheable lock speculation succeeded.", + "UMask": "0x08" + }, + { + "EventName": "ls_locks.spec_lock_lo_spec", + "EventCode": "0x25", + "BriefDescription": "Retired lock instructions. Low speculative cacheable lock speculation succeeded.", + "UMask": "0x04" + }, + { + "EventName": "ls_locks.non_spec_lock", + "EventCode": "0x25", + "BriefDescription": "Retired lock instructions. Non-speculative lock succeeded.", + "UMask": "0x02" + }, + { + "EventName": "ls_locks.bus_lock", + "EventCode": "0x25", + "BriefDescription": "Retired lock instructions. Comparable to legacy bus lock.", + "UMask": "0x01" + }, + { + "EventName": "ls_ret_cl_flush", + "EventCode": "0x26", + "BriefDescription": "The number of retired CLFLUSH instructions. This is a non-speculative event." + }, + { + "EventName": "ls_ret_cpuid", + "EventCode": "0x27", + "BriefDescription": "The number of CPUID instructions retired." + }, + { + "EventName": "ls_dispatch.ld_st_dispatch", + "EventCode": "0x29", + "BriefDescription": "Load-op-Store Dispatch. Dispatch of a single op that performs a load from and store to the same memory address. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.", + "UMask": "0x04" + }, + { + "EventName": "ls_dispatch.store_dispatch", + "EventCode": "0x29", + "BriefDescription": "Dispatch of a single op that performs a memory store. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.", + "UMask": "0x02" + }, + { + "EventName": "ls_dispatch.ld_dispatch", + "EventCode": "0x29", + "BriefDescription": "Dispatch of a single op that performs a memory load. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.", + "UMask": "0x01" + }, + { + "EventName": "ls_smi_rx", + "EventCode": "0x2b", + "BriefDescription": "Counts the number of SMIs received." + }, + { + "EventName": "ls_int_taken", + "EventCode": "0x2c", + "BriefDescription": "Counts the number of interrupts taken." + }, + { + "EventName": "ls_rdtsc", + "EventCode": "0x2d", + "BriefDescription": "Number of reads of the TSC (RDTSC instructions). The count is speculative." + }, + { + "EventName": "ls_stlf", + "EventCode": "0x35", + "BriefDescription": "Number of STLF hits." + }, + { + "EventName": "ls_st_commit_cancel2.st_commit_cancel_wcb_full", + "EventCode": "0x37", + "BriefDescription": "A non-cacheable store and the non-cacheable commit buffer is full.", + "UMask": "0x01" + }, + { + "EventName": "ls_dc_accesses", + "EventCode": "0x40", + "BriefDescription": "Number of accesses to the dcache for load/store references.", + "PublicDescription": "The number of accesses to the data cache for load and store references. This may include certain microcode scratchpad accesses, although these are generally rare. Each increment represents an eight-byte access, although the instruction may only be accessing a portion of that. This event is a speculative event." + }, + { + "EventName": "ls_mab_alloc.all_allocations", + "EventCode": "0x41", + "BriefDescription": "All Allocations. Counts when a LS pipe allocates a MAB entry.", + "UMask": "0x7f" + }, + { + "EventName": "ls_mab_alloc.hardware_prefetcher_allocations", + "EventCode": "0x41", + "BriefDescription": "Hardware Prefetcher Allocations. Counts when a LS pipe allocates a MAB entry.", + "UMask": "0x40" + }, + { + "EventName": "ls_mab_alloc.load_store_allocations", + "EventCode": "0x41", + "BriefDescription": "Load Store Allocations. Counts when a LS pipe allocates a MAB entry.", + "UMask": "0x3f" + }, + { + "EventName": "ls_mab_alloc.dc_prefetcher", + "EventCode": "0x41", + "BriefDescription": "LS MAB Allocates by Type. DC prefetcher.", + "UMask": "0x08" + }, + { + "EventName": "ls_mab_alloc.stores", + "EventCode": "0x41", + "BriefDescription": "LS MAB Allocates by Type. Stores.", + "UMask": "0x02" + }, + { + "EventName": "ls_mab_alloc.loads", + "EventCode": "0x41", + "BriefDescription": "LS MAB Allocates by Type. Loads.", + "UMask": "0x01" + }, + { + "EventName": "ls_dmnd_fills_from_sys.mem_io_remote", + "EventCode": "0x43", + "BriefDescription": "Demand Data Cache Fills by Data Source. From DRAM or IO connected in different Node.", + "UMask": "0x40" + }, + { + "EventName": "ls_dmnd_fills_from_sys.ext_cache_remote", + "EventCode": "0x43", + "BriefDescription": "Demand Data Cache Fills by Data Source. From CCX Cache in different Node.", + "UMask": "0x10" + }, + { + "EventName": "ls_dmnd_fills_from_sys.mem_io_local", + "EventCode": "0x43", + "BriefDescription": "Demand Data Cache Fills by Data Source. From DRAM or IO connected in same node.", + "UMask": "0x08" + }, + { + "EventName": "ls_dmnd_fills_from_sys.ext_cache_local", + "EventCode": "0x43", + "BriefDescription": "Demand Data Cache Fills by Data Source. From cache of different CCX in same node.", + "UMask": "0x04" + }, + { + "EventName": "ls_dmnd_fills_from_sys.int_cache", + "EventCode": "0x43", + "BriefDescription": "Demand Data Cache Fills by Data Source. From L3 or different L2 in same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_dmnd_fills_from_sys.lcl_l2", + "EventCode": "0x43", + "BriefDescription": "Demand Data Cache Fills by Data Source. From Local L2 to the core.", + "UMask": "0x01" + }, + { + "EventName": "ls_any_fills_from_sys.mem_io_remote", + "EventCode": "0x44", + "BriefDescription": "Any Data Cache Fills by Data Source. From DRAM or IO connected in different Node.", + "UMask": "0x40" + }, + { + "EventName": "ls_any_fills_from_sys.ext_cache_remote", + "EventCode": "0x44", + "BriefDescription": "Any Data Cache Fills by Data Source. From CCX Cache in different Node.", + "UMask": "0x10" + }, + { + "EventName": "ls_any_fills_from_sys.mem_io_local", + "EventCode": "0x44", + "BriefDescription": "Any Data Cache Fills by Data Source. From DRAM or IO connected in same node.", + "UMask": "0x08" + }, + { + "EventName": "ls_any_fills_from_sys.ext_cache_local", + "EventCode": "0x44", + "BriefDescription": "Any Data Cache Fills by Data Source. From cache of different CCX in same node.", + "UMask": "0x04" + }, + { + "EventName": "ls_any_fills_from_sys.int_cache", + "EventCode": "0x44", + "BriefDescription": "Any Data Cache Fills by Data Source. From L3 or different L2 in same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_any_fills_from_sys.lcl_l2", + "EventCode": "0x44", + "BriefDescription": "Any Data Cache Fills by Data Source. From Local L2 to the core.", + "UMask": "0x01" + }, + { + "EventName": "ls_l1_d_tlb_miss.all", + "EventCode": "0x45", + "BriefDescription": "All L1 DTLB Misses or Reloads. Use l1_dtlb_misses instead.", + "UMask": "0xff" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that also missed in the L2 TLB.", + "UMask": "0x80" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that also missed in the L2 TLB.", + "UMask": "0x40" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss. DTLB reload coalesced page that also missed in the L2 TLB.", + "UMask": "0x20" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that missed the L2 TLB.", + "UMask": "0x10" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that hit in the L2 TLB.", + "UMask": "0x08" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that hit in the L2 TLB.", + "UMask": "0x04" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss. DTLB reload to a coalesced page that hit in the L2 TLB.", + "UMask": "0x02" + }, + { + "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that hit in the L2 TLB.", + "UMask": "0x01" + }, + { + "EventName": "ls_tablewalker.iside", + "EventCode": "0x46", + "BriefDescription": "Total Page Table Walks on I-side.", + "UMask": "0x0c" + }, + { + "EventName": "ls_tablewalker.ic_type1", + "EventCode": "0x46", + "BriefDescription": "Total Page Table Walks IC Type 1.", + "UMask": "0x08" + }, + { + "EventName": "ls_tablewalker.ic_type0", + "EventCode": "0x46", + "BriefDescription": "Total Page Table Walks IC Type 0.", + "UMask": "0x04" + }, + { + "EventName": "ls_tablewalker.dside", + "EventCode": "0x46", + "BriefDescription": "Total Page Table Walks on D-side.", + "UMask": "0x03" + }, + { + "EventName": "ls_tablewalker.dc_type1", + "EventCode": "0x46", + "BriefDescription": "Total Page Table Walks DC Type 1.", + "UMask": "0x02" + }, + { + "EventName": "ls_tablewalker.dc_type0", + "EventCode": "0x46", + "BriefDescription": "Total Page Table Walks DC Type 0.", + "UMask": "0x01" + }, + { + "EventName": "ls_misal_loads.ma4k", + "EventCode": "0x47", + "BriefDescription": "The number of 4KB misaligned (i.e., page crossing) loads.", + "UMask": "0x02" + }, + { + "EventName": "ls_misal_loads.ma64", + "EventCode": "0x47", + "BriefDescription": "The number of 64B misaligned (i.e., cacheline crossing) loads.", + "UMask": "0x01" + }, + { + "EventName": "ls_pref_instr_disp", + "EventCode": "0x4b", + "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative).", + "UMask": "0xff" + }, + { + "EventName": "ls_pref_instr_disp.prefetch_nta", + "EventCode": "0x4b", + "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchNTA instruction. See docAPM3 PREFETCHlevel.", + "UMask": "0x04" + }, + { + "EventName": "ls_pref_instr_disp.prefetch_w", + "EventCode": "0x4b", + "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchW instruction. See docAPM3 PREFETCHW.", + "UMask": "0x02" + }, + { + "EventName": "ls_pref_instr_disp.prefetch", + "EventCode": "0x4b", + "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchT0, T1 and T2 instructions. See docAPM3 PREFETCHlevel.", + "UMask": "0x01" + }, + { + "EventName": "ls_inef_sw_pref.mab_mch_cnt", + "EventCode": "0x52", + "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.", + "UMask": "0x02" + }, + { + "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit", + "EventCode": "0x52", + "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.", + "UMask": "0x01" + }, + { + "EventName": "ls_sw_pf_dc_fills.mem_io_remote", + "EventCode": "0x59", + "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in different Node.", + "UMask": "0x40" + }, + { + "EventName": "ls_sw_pf_dc_fills.ext_cache_remote", + "EventCode": "0x59", + "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From CCX Cache in different Node.", + "UMask": "0x10" + }, + { + "EventName": "ls_sw_pf_dc_fills.mem_io_local", + "EventCode": "0x59", + "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in same node.", + "UMask": "0x08" + }, + { + "EventName": "ls_sw_pf_dc_fills.ext_cache_local", + "EventCode": "0x59", + "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From cache of different CCX in same node.", + "UMask": "0x04" + }, + { + "EventName": "ls_sw_pf_dc_fills.int_cache", + "EventCode": "0x59", + "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From L3 or different L2 in same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_sw_pf_dc_fills.lcl_l2", + "EventCode": "0x59", + "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From Local L2 to the core.", + "UMask": "0x01" + }, + { + "EventName": "ls_hw_pf_dc_fills.mem_io_remote", + "EventCode": "0x5a", + "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in different Node.", + "UMask": "0x40" + }, + { + "EventName": "ls_hw_pf_dc_fills.ext_cache_remote", + "EventCode": "0x5a", + "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From CCX Cache in different Node.", + "UMask": "0x10" + }, + { + "EventName": "ls_hw_pf_dc_fills.mem_io_local", + "EventCode": "0x5a", + "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in same node.", + "UMask": "0x08" + }, + { + "EventName": "ls_hw_pf_dc_fills.ext_cache_local", + "EventCode": "0x5a", + "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From cache of different CCX in same node.", + "UMask": "0x04" + }, + { + "EventName": "ls_hw_pf_dc_fills.int_cache", + "EventCode": "0x5a", + "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From L3 or different L2 in same CCX.", + "UMask": "0x02" + }, + { + "EventName": "ls_hw_pf_dc_fills.lcl_l2", + "EventCode": "0x5a", + "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From Local L2 to the core.", + "UMask": "0x01" + }, + { + "EventName": "ls_alloc_mab_count", + "EventCode": "0x5f", + "BriefDescription": "Count of Allocated Mabs", + "PublicDescription": "This event counts the in-flight L1 data cache misses (allocated Miss Address Buffers) divided by 4 and rounded down each cycle unless used with the MergeEvent functionality. If the MergeEvent is used, it counts the exact number of outstanding L1 data cache misses. See 2.1.17.3 [Large Increment per Cycle Events]." + }, + { + "EventName": "ls_not_halted_cyc", + "EventCode": "0x76", + "BriefDescription": "Cycles not in Halt." + }, + { + "EventName": "ls_tlb_flush.all_tlb_flushes", + "EventCode": "0x78", + "BriefDescription": "All TLB Flushes. Requires unit mask 0xFF to engage event for counting. Use all_tlbs_flushed instead", + "UMask": "0xff" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/other.json b/tools/perf/pmu-events/arch/x86/amdzen3/other.json new file mode 100644 index 000000000000..7da5d0791ea3 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen3/other.json @@ -0,0 +1,103 @@ +[ + { + "EventName": "de_dis_uop_queue_empty_di0", + "EventCode": "0xa9", + "BriefDescription": "Cycles where the Micro-Op Queue is empty." + }, + { + "EventName": "de_dis_cops_from_decoder.disp_op_type.any_integer_dispatch", + "EventCode": "0xab", + "BriefDescription": "Any Integer dispatch. Types of Oops Dispatched from Decoder.", + "UMask": "0x08" + }, + { + "EventName": "de_dis_cops_from_decoder.disp_op_type.any_fp_dispatch", + "EventCode": "0xab", + "BriefDescription": "Any FP dispatch. Types of Oops Dispatched from Decoder.", + "UMask": "0x04" + }, + { + "EventName": "de_dis_dispatch_token_stalls1.fp_flush_recovery_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. FP Flush recovery stall.", + "UMask": "0x80" + }, + { + "EventName": "de_dis_dispatch_token_stalls1.fp_sch_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. FP scheduler resource stall. Applies to ops that use the FP scheduler.", + "UMask": "0x40" + }, + { + "EventName": "de_dis_dispatch_token_stalls1.fp_reg_file_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Floating point register file resource stall. Applies to all FP ops that have a destination register.", + "UMask": "0x20" + }, + { + "EventName": "de_dis_dispatch_token_stalls1.taken_brnch_buffer_rsrc", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Taken branch buffer resource stall.", + "UMask": "0x10" + }, + { + "EventName": "de_dis_dispatch_token_stalls1.int_sched_misc_token_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Scheduler miscellaneous resource stall.", + "UMask": "0x08" + }, + { + "EventName": "de_dis_dispatch_token_stalls1.store_queue_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Store Queue resource stall. Applies to all ops with store semantics.", + "UMask": "0x04" + }, + { + "EventName": "de_dis_dispatch_token_stalls1.load_queue_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Load Queue resource stall. Applies to all ops with load semantics.", + "UMask": "0x02" + }, + { + "EventName": "de_dis_dispatch_token_stalls1.int_phy_reg_file_rsrc_stall", + "EventCode": "0xae", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Integer Physical Register File resource stall. Integer Physical Register File, applies to all ops that have an integer destination register.", + "UMask": "0x01" + }, + { + "EventName": "de_dis_dispatch_token_stalls2.retire_token_stall", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Insufficient Retire Queue tokens available.", + "UMask": "0x20" + }, + { + "EventName": "de_dis_dispatch_token_stalls2.agsq_token_stall", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. AGSQ Tokens unavailable.", + "UMask": "0x10" + }, + { + "EventName": "de_dis_dispatch_token_stalls2.int_sch3_token_stall", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 3 available.", + "UMask": "0x08" + }, + { + "EventName": "de_dis_dispatch_token_stalls2.int_sch2_token_stall", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 2 available.", + "UMask": "0x04" + }, + { + "EventName": "de_dis_dispatch_token_stalls2.int_sch1_token_stall", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 1 available.", + "UMask": "0x02" + }, + { + "EventName": "de_dis_dispatch_token_stalls2.int_sch0_token_stall", + "EventCode": "0xaf", + "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 0 available.", + "UMask": "0x01" + } +] diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json new file mode 100644 index 000000000000..988cf68ae825 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json @@ -0,0 +1,214 @@ +[ + { + "MetricName": "branch_misprediction_ratio", + "BriefDescription": "Execution-Time Branch Misprediction Ratio (Non-Speculative)", + "MetricExpr": "d_ratio(ex_ret_brn_misp, ex_ret_brn)", + "MetricGroup": "branch_prediction", + "ScaleUnit": "100%" + }, + { + "EventName": "all_data_cache_accesses", + "EventCode": "0x29", + "BriefDescription": "All L1 Data Cache Accesses", + "UMask": "0x07" + }, + { + "MetricName": "all_l2_cache_accesses", + "BriefDescription": "All L2 Cache Accesses", + "MetricExpr": "l2_request_g1.all_no_prefetch + l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3", + "MetricGroup": "l2_cache" + }, + { + "EventName": "l2_cache_accesses_from_ic_misses", + "EventCode": "0x60", + "BriefDescription": "L2 Cache Accesses from L1 Instruction Cache Misses (including prefetch)", + "UMask": "0x10" + }, + { + "EventName": "l2_cache_accesses_from_dc_misses", + "EventCode": "0x60", + "BriefDescription": "L2 Cache Accesses from L1 Data Cache Misses (including prefetch)", + "UMask": "0xe8" + }, + { + "MetricName": "l2_cache_accesses_from_l2_hwpf", + "BriefDescription": "L2 Cache Accesses from L2 HWPF", + "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3", + "MetricGroup": "l2_cache" + }, + { + "MetricName": "all_l2_cache_misses", + "BriefDescription": "All L2 Cache Misses", + "MetricExpr": "l2_cache_req_stat.ic_dc_miss_in_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3", + "MetricGroup": "l2_cache" + }, + { + "EventName": "l2_cache_misses_from_ic_miss", + "EventCode": "0x64", + "BriefDescription": "L2 Cache Misses from L1 Instruction Cache Misses", + "UMask": "0x01" + }, + { + "EventName": "l2_cache_misses_from_dc_misses", + "EventCode": "0x64", + "BriefDescription": "L2 Cache Misses from L1 Data Cache Misses", + "UMask": "0x08" + }, + { + "MetricName": "l2_cache_misses_from_l2_hwpf", + "BriefDescription": "L2 Cache Misses from L2 Cache HWPF", + "MetricExpr": "l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3", + "MetricGroup": "l2_cache" + }, + { + "MetricName": "all_l2_cache_hits", + "BriefDescription": "All L2 Cache Hits", + "MetricExpr": "l2_cache_req_stat.ic_dc_hit_in_l2 + l2_pf_hit_l2", + "MetricGroup": "l2_cache" + }, + { + "EventName": "l2_cache_hits_from_ic_misses", + "EventCode": "0x64", + "BriefDescription": "L2 Cache Hits from L1 Instruction Cache Misses", + "UMask": "0x06" + }, + { + "EventName": "l2_cache_hits_from_dc_misses", + "EventCode": "0x64", + "BriefDescription": "L2 Cache Hits from L1 Data Cache Misses", + "UMask": "0xf0" + }, + { + "EventName": "l2_cache_hits_from_l2_hwpf", + "EventCode": "0x70", + "BriefDescription": "L2 Cache Hits from L2 Cache HWPF", + "UMask": "0xff" + }, + { + "EventName": "l3_cache_accesses", + "EventCode": "0x04", + "BriefDescription": "L3 Cache Accesses", + "UMask": "0xff", + "Unit": "L3PMC" + }, + { + "EventName": "l3_misses", + "EventCode": "0x04", + "BriefDescription": "L3 Misses (includes cacheline state change requests)", + "UMask": "0x01", + "Unit": "L3PMC" + }, + { + "MetricName": "l3_read_miss_latency", + "BriefDescription": "Average L3 Read Miss Latency (in core clocks)", + "MetricExpr": "(xi_sys_fill_latency * 16) / xi_ccx_sdp_req1", + "MetricGroup": "l3_cache", + "ScaleUnit": "1core clocks" + }, + { + "MetricName": "op_cache_fetch_miss_ratio", + "BriefDescription": "Op Cache (64B) Fetch Miss Ratio", + "MetricExpr": "d_ratio(op_cache_hit_miss.op_cache_miss, op_cache_hit_miss.all_op_cache_accesses)", + "MetricGroup": "l2_cache" + }, + { + "MetricName": "ic_fetch_miss_ratio", + "BriefDescription": "Instruction Cache (32B) Fetch Miss Ratio", + "MetricExpr": "d_ratio(ic_tag_hit_miss.instruction_cache_miss, ic_tag_hit_miss.all_instruction_cache_accesses)", + "MetricGroup": "l2_cache", + "ScaleUnit": "100%" + }, + { + "EventName": "l1_data_cache_fills_from_memory", + "EventCode": "0x44", + "BriefDescription": "L1 Data Cache Fills: From Memory", + "UMask": "0x48" + }, + { + "EventName": "l1_data_cache_fills_from_remote_node", + "EventCode": "0x44", + "BriefDescription": "L1 Data Cache Fills: From Remote Node", + "UMask": "0x50" + }, + { + "EventName": "l1_data_cache_fills_from_within_same_ccx", + "EventCode": "0x44", + "BriefDescription": "L1 Data Cache Fills: From within same CCX", + "UMask": "0x03" + }, + { + "EventName": "l1_data_cache_fills_from_external_ccx_cache", + "EventCode": "0x44", + "BriefDescription": "L1 Data Cache Fills: From External CCX Cache", + "UMask": "0x14" + }, + { + "EventName": "l1_data_cache_fills_all", + "EventCode": "0x44", + "BriefDescription": "L1 Data Cache Fills: All", + "UMask": "0xff" + }, + { + "MetricName": "l1_itlb_misses", + "BriefDescription": "L1 ITLB Misses", + "MetricExpr": "bp_l1_tlb_miss_l2_tlb_hit + bp_l1_tlb_miss_l2_tlb_miss", + "MetricGroup": "tlb" + }, + { + "EventName": "l2_itlb_misses", + "EventCode": "0x85", + "BriefDescription": "L2 ITLB Misses & Instruction page walks", + "UMask": "0x07" + }, + { + "EventName": "l1_dtlb_misses", + "EventCode": "0x45", + "BriefDescription": "L1 DTLB Misses", + "UMask": "0xff" + }, + { + "EventName": "l2_dtlb_misses", + "EventCode": "0x45", + "BriefDescription": "L2 DTLB Misses & Data page walks", + "UMask": "0xf0" + }, + { + "EventName": "all_tlbs_flushed", + "EventCode": "0x78", + "BriefDescription": "All TLBs Flushed", + "UMask": "0xff" + }, + { + "MetricName": "macro_ops_dispatched", + "BriefDescription": "Macro-ops Dispatched", + "MetricExpr": "de_dis_cops_from_decoder.disp_op_type.any_integer_dispatch + de_dis_cops_from_decoder.disp_op_type.any_fp_dispatch", + "MetricGroup": "decoder" + }, + { + "EventName": "sse_avx_stalls", + "EventCode": "0x0e", + "BriefDescription": "Mixed SSE/AVX Stalls", + "UMask": "0x0e" + }, + { + "EventName": "macro_ops_retired", + "EventCode": "0xc1", + "BriefDescription": "Macro-ops Retired" + }, + { + "MetricName": "all_remote_links_outbound", + "BriefDescription": "Approximate: Outbound data bytes for all Remote Links for a node (die)", + "MetricExpr": "remote_outbound_data_controller_0 + remote_outbound_data_controller_1 + remote_outbound_data_controller_2 + remote_outbound_data_controller_3", + "MetricGroup": "data_fabric", + "PerPkg": "1", + "ScaleUnit": "3e-5MiB" + }, + { + "MetricName": "nps1_die_to_dram", + "BriefDescription": "Approximate: Combined DRAM B/bytes of all channels on a NPS1 node (die) (may need --metric-no-group)", + "MetricExpr": "dram_channel_data_controller_0 + dram_channel_data_controller_1 + dram_channel_data_controller_2 + dram_channel_data_controller_3 + dram_channel_data_controller_4 + dram_channel_data_controller_5 + dram_channel_data_controller_6 + dram_channel_data_controller_7", + "MetricGroup": "data_fabric", + "PerPkg": "1", + "ScaleUnit": "6.1e-5MiB" + } +] diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 2f2a209e87e1..9f97b32533a0 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -38,4 +38,4 @@ GenuineIntel-6-7E,v1,icelake,core GenuineIntel-6-86,v1,tremontx,core AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),v2,amdzen1,core AuthenticAMD-23-[[:xdigit:]]+,v1,amdzen2,core -AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen2,core +AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen3,core -- cgit v1.2.3 From 84de8154c516b821bd60493b90d4782c5a4905ab Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Fri, 2 Apr 2021 16:05:14 +0900 Subject: tools: Fix a typo in kernel-chktaint This patch fixes a spelling typo in kernel-chktaint Signed-off-by: Masanari Iida Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20210402070514.336376-1-standby24x7@gmail.com Signed-off-by: Jonathan Corbet --- tools/debugging/kernel-chktaint | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/debugging/kernel-chktaint b/tools/debugging/kernel-chktaint index 607b2b280945..719f18b1edf0 100755 --- a/tools/debugging/kernel-chktaint +++ b/tools/debugging/kernel-chktaint @@ -25,7 +25,7 @@ if [ "$1"x != "x" ]; then elif [ $1 -ge 0 ] 2>/dev/null ; then taint=$1 else - echo "Error: Parameter '$1' not a positive interger. Aborting." >&2 + echo "Error: Parameter '$1' not a positive integer. Aborting." >&2 exit 1 fi else -- cgit v1.2.3 From 79749ae19de6863d0748419d413b231b0d57e28a Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 7 Apr 2021 18:46:42 +0300 Subject: tc-testing: add simple action test to verify batch add cleanup Verify cleanup of failed actions batch add where second action in batch fails after successful init of first action. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- .../tc-testing/tc-tests/actions/simple.json | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json index e15f708b0fa4..d5bcbb919dcc 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json @@ -175,5 +175,35 @@ "teardown": [ "$TC actions flush action simple" ] + }, + { + "id": "8d07", + "name": "Verify cleanup of failed actions batch add", + "category": [ + "actions", + "simple" + ], + "setup": [ + [ + "$TC actions flush action simple", + 0, + 1, + 255 + ], + "$TC actions add action simple sdata \"2\" index 2", + [ + "$TC actions add action simple sdata \"1\" index 1 action simple sdata \"2\" index 2", + 255 + ], + "$TC actions flush action simple" + ], + "cmdUnderTest": "$TC actions add action simple sdata \"2\" index 2", + "expExitCode": "0", + "verifyCmd": "$TC actions list action simple", + "matchPattern": "action order [0-9]*: Simple <2>.*index 2 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action simple" + ] } ] -- cgit v1.2.3 From 652e3124c3ee60d9770e070afab0e0a862e9bfd7 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 7 Apr 2021 18:46:43 +0300 Subject: tc-testing: add simple action test to verify batch change cleanup Verify cleanup of failed actions batch change where second action in batch fails after successful init of first action. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- .../tc-testing/tc-tests/actions/simple.json | 29 ++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json index d5bcbb919dcc..e0c5f060ccb9 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json @@ -205,5 +205,34 @@ "teardown": [ "$TC actions flush action simple" ] + }, + { + "id": "a68a", + "name": "Verify cleanup of failed actions batch change", + "category": [ + "actions", + "simple" + ], + "setup": [ + [ + "$TC actions flush action simple", + 0, + 1, + 255 + ], + [ + "$TC actions change action simple sdata \"1\" index 1 action simple sdata \"2\" goto chain 42 index 2", + 255 + ], + "$TC actions flush action simple" + ], + "cmdUnderTest": "$TC actions add action simple sdata \"1\" index 1", + "expExitCode": "0", + "verifyCmd": "$TC actions list action simple", + "matchPattern": "action order [0-9]*: Simple <1>.*index 1 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action simple" + ] } ] -- cgit v1.2.3 From ff182bc572cec4ca757775bf2a33a3ce8611227a Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Thu, 8 Apr 2021 09:13:02 +0300 Subject: selftests/bpf: test_progs/sockopt_sk: Remove version As pointed by Andrii Nakryiko, _version is useless now, remove it. Signed-off-by: Yauheni Kaliuta Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210408061310.95877-1-yauheni.kaliuta@redhat.com --- tools/testing/selftests/bpf/progs/sockopt_sk.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index d3597f81e6e9..978a68005966 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -6,7 +6,6 @@ #include char _license[] SEC("license") = "GPL"; -__u32 _version SEC("version") = 1; #ifndef PAGE_SIZE #define PAGE_SIZE 4096 -- cgit v1.2.3 From cad99cce133dd5377f22da1e75d7eb5c4b886d75 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Thu, 8 Apr 2021 09:13:03 +0300 Subject: selftests/bpf: test_progs/sockopt_sk: Convert to use BPF skeleton Switch the test to use BPF skeleton to save some boilerplate and make it easy to access bpf program bss segment. The latter will be used to pass PAGE_SIZE from userspace since there is no convenient way for bpf program to get it from inside of the kernel. Signed-off-by: Yauheni Kaliuta Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210408061310.95877-2-yauheni.kaliuta@redhat.com --- .../testing/selftests/bpf/prog_tests/sockopt_sk.c | 65 ++++++---------------- 1 file changed, 17 insertions(+), 48 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index d5b44b135c00..7274b12abe17 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -3,6 +3,7 @@ #include "cgroup_helpers.h" #include +#include "sockopt_sk.skel.h" #ifndef SOL_TCP #define SOL_TCP IPPROTO_TCP @@ -191,60 +192,28 @@ err: return -1; } -static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title) -{ - enum bpf_attach_type attach_type; - enum bpf_prog_type prog_type; - struct bpf_program *prog; - int err; - - err = libbpf_prog_type_by_name(title, &prog_type, &attach_type); - if (err) { - log_err("Failed to deduct types for %s BPF program", title); - return -1; - } - - prog = bpf_object__find_program_by_title(obj, title); - if (!prog) { - log_err("Failed to find %s BPF program", title); - return -1; - } - - err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, - attach_type, 0); - if (err) { - log_err("Failed to attach %s BPF program", title); - return -1; - } - - return 0; -} - static void run_test(int cgroup_fd) { - struct bpf_prog_load_attr attr = { - .file = "./sockopt_sk.o", - }; - struct bpf_object *obj; - int ignored; - int err; - - err = bpf_prog_load_xattr(&attr, &obj, &ignored); - if (CHECK_FAIL(err)) - return; + struct sockopt_sk *skel; + + skel = sockopt_sk__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; - err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt"); - if (CHECK_FAIL(err)) - goto close_bpf_object; + skel->links._setsockopt = + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link")) + goto cleanup; - err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt"); - if (CHECK_FAIL(err)) - goto close_bpf_object; + skel->links._getsockopt = + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link")) + goto cleanup; - CHECK_FAIL(getsetsockopt()); + ASSERT_OK(getsetsockopt(), "getsetsockopt"); -close_bpf_object: - bpf_object__close(obj); +cleanup: + sockopt_sk__destroy(skel); } void test_sockopt_sk(void) -- cgit v1.2.3 From 361d32028c7d52d28d7f0562193a2f4a41d10351 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Thu, 8 Apr 2021 09:13:04 +0300 Subject: selftests/bpf: Pass page size from userspace in sockopt_sk Since there is no convenient way for bpf program to get PAGE_SIZE from inside of the kernel, pass the value from userspace. Zero-initialize the variable in bpf prog, otherwise it will cause problems on some versions of Clang. Signed-off-by: Yauheni Kaliuta Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210408061310.95877-3-yauheni.kaliuta@redhat.com --- tools/testing/selftests/bpf/prog_tests/sockopt_sk.c | 2 ++ tools/testing/selftests/bpf/progs/sockopt_sk.c | 10 ++++------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index 7274b12abe17..4b937e5dbaca 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -200,6 +200,8 @@ static void run_test(int cgroup_fd) if (!ASSERT_OK_PTR(skel, "skel_load")) goto cleanup; + skel->bss->page_size = getpagesize(); + skel->links._setsockopt = bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link")) diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index 978a68005966..8acdb99b5959 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -7,9 +7,7 @@ char _license[] SEC("license") = "GPL"; -#ifndef PAGE_SIZE -#define PAGE_SIZE 4096 -#endif +int page_size = 0; /* userspace should set it */ #ifndef SOL_TCP #define SOL_TCP IPPROTO_TCP @@ -89,7 +87,7 @@ int _getsockopt(struct bpf_sockopt *ctx) * program can only see the first PAGE_SIZE * bytes of data. */ - if (optval_end - optval != PAGE_SIZE) + if (optval_end - optval != page_size) return 0; /* EPERM, unexpected data size */ return 1; @@ -160,7 +158,7 @@ int _setsockopt(struct bpf_sockopt *ctx) if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { /* Original optlen is larger than PAGE_SIZE. */ - if (ctx->optlen != PAGE_SIZE * 2) + if (ctx->optlen != page_size * 2) return 0; /* EPERM, unexpected data size */ if (optval + 1 > optval_end) @@ -174,7 +172,7 @@ int _setsockopt(struct bpf_sockopt *ctx) * program can only see the first PAGE_SIZE * bytes of data. */ - if (optval_end - optval != PAGE_SIZE) + if (optval_end - optval != page_size) return 0; /* EPERM, unexpected data size */ return 1; -- cgit v1.2.3 From 7a85e4dfa7f5d052ae5016e96f1ee426a8870e78 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Thu, 8 Apr 2021 09:13:05 +0300 Subject: selftests/bpf: Pass page size from userspace in map_ptr Use ASSERT to check result but keep CHECK where format was used to report error. Use bpf_map__set_max_entries() to set map size dynamically from userspace according to page size. Zero-initialize the variable in bpf prog, otherwise it will cause problems on some versions of Clang. Signed-off-by: Yauheni Kaliuta Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210408061310.95877-4-yauheni.kaliuta@redhat.com --- tools/testing/selftests/bpf/prog_tests/map_ptr.c | 15 +++++++++++++-- tools/testing/selftests/bpf/progs/map_ptr_kern.c | 4 ++-- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/map_ptr.c b/tools/testing/selftests/bpf/prog_tests/map_ptr.c index c230a573c373..4972f92205c7 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_ptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_ptr.c @@ -12,11 +12,22 @@ void test_map_ptr(void) __u32 duration = 0, retval; char buf[128]; int err; + int page_size = getpagesize(); - skel = map_ptr_kern__open_and_load(); - if (CHECK(!skel, "skel_open_load", "open_load failed\n")) + skel = map_ptr_kern__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) return; + err = bpf_map__set_max_entries(skel->maps.m_ringbuf, page_size); + if (!ASSERT_OK(err, "bpf_map__set_max_entries")) + goto cleanup; + + err = map_ptr_kern__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + skel->bss->page_size = page_size; + err = bpf_prog_test_run(bpf_program__fd(skel->progs.cg_skb), 1, &pkt_v4, sizeof(pkt_v4), buf, NULL, &retval, NULL); diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index d8850bc6a9f1..d1d304c980f0 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -12,6 +12,7 @@ _Static_assert(MAX_ENTRIES < LOOP_BOUND, "MAX_ENTRIES must be < LOOP_BOUND"); enum bpf_map_type g_map_type = BPF_MAP_TYPE_UNSPEC; __u32 g_line = 0; +int page_size = 0; /* userspace should set it */ #define VERIFY_TYPE(type, func) ({ \ g_map_type = type; \ @@ -635,7 +636,6 @@ struct bpf_ringbuf_map { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } m_ringbuf SEC(".maps"); static inline int check_ringbuf(void) @@ -643,7 +643,7 @@ static inline int check_ringbuf(void) struct bpf_ringbuf_map *ringbuf = (struct bpf_ringbuf_map *)&m_ringbuf; struct bpf_map *map = (struct bpf_map *)&m_ringbuf; - VERIFY(check(&ringbuf->map, map, 0, 0, 1 << 12)); + VERIFY(check(&ringbuf->map, map, 0, 0, page_size)); return 1; } -- cgit v1.2.3 From 34090aaf256e469a39401cc04d5cd43275ccd97d Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Thu, 8 Apr 2021 09:13:06 +0300 Subject: selftests/bpf: mmap: Use runtime page size Replace hardcoded 4096 with runtime value in the userspace part of the test and set bpf table sizes dynamically according to the value. Do not switch to ASSERT macros, keep CHECK, for consistency with the rest of the test. Can be a separate cleanup patch. Signed-off-by: Yauheni Kaliuta Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210408061310.95877-5-yauheni.kaliuta@redhat.com --- tools/testing/selftests/bpf/prog_tests/mmap.c | 24 +++++++++++++++++++----- tools/testing/selftests/bpf/progs/test_mmap.c | 2 -- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 9c3c5c0f068f..37b002ca1167 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -29,22 +29,36 @@ void test_mmap(void) struct test_mmap *skel; __u64 val = 0; - skel = test_mmap__open_and_load(); - if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + skel = test_mmap__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.rdonly_map, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + /* at least 4 pages of data */ + err = bpf_map__set_max_entries(skel->maps.data_map, + 4 * (page_size / sizeof(u64))); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = test_mmap__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + bss_map = skel->maps.bss; data_map = skel->maps.data_map; data_map_fd = bpf_map__fd(data_map); rdmap_fd = bpf_map__fd(skel->maps.rdonly_map); - tmp1 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0); + tmp1 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0); if (CHECK(tmp1 != MAP_FAILED, "rdonly_write_mmap", "unexpected success\n")) { - munmap(tmp1, 4096); + munmap(tmp1, page_size); goto cleanup; } /* now double-check if it's mmap()'able at all */ - tmp1 = mmap(NULL, 4096, PROT_READ, MAP_SHARED, rdmap_fd, 0); + tmp1 = mmap(NULL, page_size, PROT_READ, MAP_SHARED, rdmap_fd, 0); if (CHECK(tmp1 == MAP_FAILED, "rdonly_read_mmap", "failed: %d\n", errno)) goto cleanup; diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c index 4eb42cff5fe9..5a5cc19a15bf 100644 --- a/tools/testing/selftests/bpf/progs/test_mmap.c +++ b/tools/testing/selftests/bpf/progs/test_mmap.c @@ -9,7 +9,6 @@ char _license[] SEC("license") = "GPL"; struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 4096); __uint(map_flags, BPF_F_MMAPABLE | BPF_F_RDONLY_PROG); __type(key, __u32); __type(value, char); @@ -17,7 +16,6 @@ struct { struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 512 * 4); /* at least 4 pages of data */ __uint(map_flags, BPF_F_MMAPABLE); __type(key, __u32); __type(value, __u64); -- cgit v1.2.3 From 23a65766066bb6e42a44e9097ddf79f72292a19f Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Thu, 8 Apr 2021 09:13:07 +0300 Subject: selftests/bpf: ringbuf: Use runtime page size Replace hardcoded 4096 with runtime value in the userspace part of the test and set bpf table sizes dynamically according to the value. Do not switch to ASSERT macros, keep CHECK, for consistency with the rest of the test. Can be a separate cleanup patch. Signed-off-by: Yauheni Kaliuta Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210408061310.95877-6-yauheni.kaliuta@redhat.com --- tools/testing/selftests/bpf/prog_tests/ringbuf.c | 17 +++++++++++++---- tools/testing/selftests/bpf/progs/test_ringbuf.c | 1 - 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index fddbc5db5d6a..de78617f6550 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -87,11 +87,20 @@ void test_ringbuf(void) pthread_t thread; long bg_ret = -1; int err, cnt; + int page_size = getpagesize(); - skel = test_ringbuf__open_and_load(); - if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + skel = test_ringbuf__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.ringbuf, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = test_ringbuf__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + /* only trigger BPF program for current process */ skel->bss->pid = getpid(); @@ -110,9 +119,9 @@ void test_ringbuf(void) CHECK(skel->bss->avail_data != 3 * rec_sz, "err_avail_size", "exp %ld, got %ld\n", 3L * rec_sz, skel->bss->avail_data); - CHECK(skel->bss->ring_size != 4096, + CHECK(skel->bss->ring_size != page_size, "err_ring_size", "exp %ld, got %ld\n", - 4096L, skel->bss->ring_size); + (long)page_size, skel->bss->ring_size); CHECK(skel->bss->cons_pos != 0, "err_cons_pos", "exp %ld, got %ld\n", 0L, skel->bss->cons_pos); diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf.c b/tools/testing/selftests/bpf/progs/test_ringbuf.c index 8ba9959b036b..6b3f288b7c63 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf.c @@ -15,7 +15,6 @@ struct sample { struct { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } ringbuf SEC(".maps"); /* inputs */ -- cgit v1.2.3 From b3278099b2f6e81771c6c2b70fcf9a56e9ba5d93 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 8 Apr 2021 09:13:08 +0300 Subject: libbpf: Add bpf_map__inner_map API The API gives access to inner map for map in map types (array or hash of map). It will be used to dynamically set max_entries in it. Signed-off-by: Yauheni Kaliuta Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210408061310.95877-7-yauheni.kaliuta@redhat.com --- tools/lib/bpf/libbpf.c | 10 ++++++++++ tools/lib/bpf/libbpf.h | 1 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7aad78dbb4b4..ed5586cce227 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2194,6 +2194,7 @@ static int parse_btf_map_def(struct bpf_object *obj, map->inner_map = calloc(1, sizeof(*map->inner_map)); if (!map->inner_map) return -ENOMEM; + map->inner_map->fd = -1; map->inner_map->sec_idx = obj->efile.btf_maps_shndx; map->inner_map->name = malloc(strlen(map->name) + sizeof(".inner") + 1); @@ -3845,6 +3846,14 @@ __u32 bpf_map__max_entries(const struct bpf_map *map) return map->def.max_entries; } +struct bpf_map *bpf_map__inner_map(struct bpf_map *map) +{ + if (!bpf_map_type__is_map_in_map(map->def.type)) + return NULL; + + return map->inner_map; +} + int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) { if (map->fd >= 0) @@ -9476,6 +9485,7 @@ int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd) pr_warn("error: inner_map_fd already specified\n"); return -EINVAL; } + zfree(&map->inner_map); map->inner_map_fd = fd; return 0; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f500621d28e5..bec4e6a6e31d 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -480,6 +480,7 @@ LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd); +LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); LIBBPF_API long libbpf_get_error(const void *ptr); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index f5990f7208ce..b9b29baf1df8 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -359,5 +359,6 @@ LIBBPF_0.4.0 { bpf_linker__finalize; bpf_linker__free; bpf_linker__new; + bpf_map__inner_map; bpf_object__set_kversion; } LIBBPF_0.3.0; -- cgit v1.2.3 From f3f4c23e1238783f3d719cfbda6c5e2fd03a48c4 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Thu, 8 Apr 2021 09:13:09 +0300 Subject: selftests/bpf: ringbuf_multi: Use runtime page size Set bpf table sizes dynamically according to the runtime page size value. Do not switch to ASSERT macros, keep CHECK, for consistency with the rest of the test. Can be a separate cleanup patch. Signed-off-by: Yauheni Kaliuta Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210408061310.95877-8-yauheni.kaliuta@redhat.com --- .../selftests/bpf/prog_tests/ringbuf_multi.c | 23 +++++++++++++++++++--- .../selftests/bpf/progs/test_ringbuf_multi.c | 1 - 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c index d37161e59bb2..159de99621c7 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -41,13 +41,30 @@ static int process_sample(void *ctx, void *data, size_t len) void test_ringbuf_multi(void) { struct test_ringbuf_multi *skel; - struct ring_buffer *ringbuf; + struct ring_buffer *ringbuf = NULL; int err; + int page_size = getpagesize(); - skel = test_ringbuf_multi__open_and_load(); - if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) + skel = test_ringbuf_multi__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) return; + err = bpf_map__set_max_entries(skel->maps.ringbuf1, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = bpf_map__set_max_entries(skel->maps.ringbuf2, page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = bpf_map__set_max_entries(bpf_map__inner_map(skel->maps.ringbuf_arr), page_size); + if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) + goto cleanup; + + err = test_ringbuf_multi__load(skel); + if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + /* only trigger BPF program for current process */ skel->bss->pid = getpid(); diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c index edf3b6953533..055c10b2ff80 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c @@ -15,7 +15,6 @@ struct sample { struct ringbuf_map { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 12); } ringbuf1 SEC(".maps"), ringbuf2 SEC(".maps"); -- cgit v1.2.3 From cfc0889cebccbc398a9d7a14e4e1b4724afe4bb3 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Thu, 8 Apr 2021 09:13:10 +0300 Subject: selftests/bpf: ringbuf_multi: Test bpf_map__set_inner_map_fd Test map__set_inner_map_fd() interaction with map-in-map initialization. Use hashmap of maps just to make it different to existing array of maps. Signed-off-by: Yauheni Kaliuta Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210408061310.95877-9-yauheni.kaliuta@redhat.com --- tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c | 14 ++++++++++++++ tools/testing/selftests/bpf/progs/test_ringbuf_multi.c | 11 +++++++++++ 2 files changed, 25 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c index 159de99621c7..cef63e703924 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -44,6 +44,7 @@ void test_ringbuf_multi(void) struct ring_buffer *ringbuf = NULL; int err; int page_size = getpagesize(); + int proto_fd = -1; skel = test_ringbuf_multi__open(); if (CHECK(!skel, "skel_open", "skeleton open failed\n")) @@ -61,10 +62,21 @@ void test_ringbuf_multi(void) if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) goto cleanup; + proto_fd = bpf_create_map(BPF_MAP_TYPE_RINGBUF, 0, 0, page_size, 0); + if (CHECK(proto_fd == -1, "bpf_create_map", "bpf_create_map failed\n")) + goto cleanup; + + err = bpf_map__set_inner_map_fd(skel->maps.ringbuf_hash, proto_fd); + if (CHECK(err != 0, "bpf_map__set_inner_map_fd", "bpf_map__set_inner_map_fd failed\n")) + goto cleanup; + err = test_ringbuf_multi__load(skel); if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) goto cleanup; + close(proto_fd); + proto_fd = -1; + /* only trigger BPF program for current process */ skel->bss->pid = getpid(); @@ -114,6 +126,8 @@ void test_ringbuf_multi(void) 2L, skel->bss->total); cleanup: + if (proto_fd >= 0) + close(proto_fd); ring_buffer__free(ringbuf); test_ringbuf_multi__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c index 055c10b2ff80..197b86546dca 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c @@ -30,6 +30,17 @@ struct { }, }; +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __uint(max_entries, 1); + __type(key, int); + __array(values, struct ringbuf_map); +} ringbuf_hash SEC(".maps") = { + .values = { + [0] = &ringbuf1, + }, +}; + /* inputs */ int pid = 0; int target_ring = 0; -- cgit v1.2.3 From 1c3cadbe02420e6c85251c416a78a16f17761231 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 9 Apr 2021 13:04:40 +0200 Subject: self-tests: add veth tests Add some basic veth tests, that verify the expected flags and aggregation with different setups (default, xdp, etc...) Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/veth.sh | 177 +++++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+) create mode 100755 tools/testing/selftests/net/veth.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 2d71b283dde3..f4242a961088 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -24,6 +24,7 @@ TEST_PROGS += vrf_route_leaking.sh TEST_PROGS += bareudp.sh TEST_PROGS += unicast_extensions.sh TEST_PROGS += udpgro_fwd.sh +TEST_PROGS += veth.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh new file mode 100755 index 000000000000..2fedc0781ce8 --- /dev/null +++ b/tools/testing/selftests/net/veth.sh @@ -0,0 +1,177 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +readonly STATS="$(mktemp -p /tmp ns-XXXXXX)" +readonly BASE=`basename $STATS` +readonly SRC=2 +readonly DST=1 +readonly DST_NAT=100 +readonly NS_SRC=$BASE$SRC +readonly NS_DST=$BASE$DST + +# "baremetal" network used for raw UDP traffic +readonly BM_NET_V4=192.168.1. +readonly BM_NET_V6=2001:db8:: + +readonly NPROCS=`nproc` +ret=0 + +cleanup() { + local ns + local -r jobs="$(jobs -p)" + [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null + rm -f $STATS + + for ns in $NS_SRC $NS_DST; do + ip netns del $ns 2>/dev/null + done +} + +trap cleanup EXIT + +create_ns() { + local ns + + for ns in $NS_SRC $NS_DST; do + ip netns add $ns + ip -n $ns link set dev lo up + done + + ip link add name veth$SRC type veth peer name veth$DST + + for ns in $SRC $DST; do + ip link set dev veth$ns netns $BASE$ns up + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24 + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad + done + echo "#kernel" > $BASE + chmod go-rw $BASE +} + +__chk_flag() { + local msg="$1" + local target=$2 + local expected=$3 + local flagname=$4 + + local flag=`ip netns exec $BASE$target ethtool -k veth$target |\ + grep $flagname | awk '{print $2}'` + + printf "%-60s" "$msg" + if [ "$flag" = "$expected" ]; then + echo " ok " + else + echo " fail - expected $expected found $flag" + ret=1 + fi +} + +chk_gro_flag() { + __chk_flag "$1" $2 $3 generic-receive-offload +} + +chk_tso_flag() { + __chk_flag "$1" $2 $3 tcp-segmentation-offload +} + +chk_gro() { + local msg="$1" + local expected=$2 + + ip netns exec $BASE$SRC ping -qc 1 $BM_NET_V4$DST >/dev/null + NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat -n + + printf "%-60s" "$msg" + ip netns exec $BASE$DST ./udpgso_bench_rx -C 1000 -R 10 & + local spid=$! + sleep 0.1 + + ip netns exec $NS_SRC ./udpgso_bench_tx -4 -s 13000 -S 1300 -M 1 -D $BM_NET_V4$DST + local retc=$? + wait $spid + local rets=$? + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " fail client exit code $retc, server $rets" + ret=1 + return + fi + + local pkts=`NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat IpInReceives | \ + awk '{print $2}' | tail -n 1` + if [ "$pkts" = "$expected" ]; then + echo " ok " + else + echo " fail - got $pkts packets, expected $expected " + ret=1 + fi +} + +if [ ! -f ../bpf/xdp_dummy.o ]; then + echo "Missing xdp_dummy helper. Build bpf selftest first" + exit -1 +fi + +create_ns +chk_gro_flag "default - gro flag" $SRC off +chk_gro_flag " - peer gro flag" $DST off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +chk_gro " - aggregation" 1 +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +chk_gro " - aggregation with TSO off" 10 +cleanup + +create_ns +ip netns exec $NS_DST ethtool -K veth$DST gro on +chk_gro_flag "with gro on - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on +chk_gro " - aggregation with TSO off" 1 +cleanup + +create_ns +ip -n $NS_DST link set dev veth$DST down +ip netns exec $NS_DST ethtool -K veth$DST gro on +chk_gro_flag "with gro enabled on link down - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip -n $NS_DST link set dev veth$DST up +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on +chk_gro " - aggregation with TSO off" 1 +cleanup + +create_ns +ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null +chk_gro_flag "with xdp attached - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC off +chk_tso_flag " - peer tso flag" $DST on +ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on +chk_gro " - aggregation" 1 + + +ip -n $NS_DST link set dev veth$DST down +ip -n $NS_SRC link set dev veth$SRC down +chk_gro_flag " - after dev off, flag" $DST on +chk_gro_flag " - peer flag" $SRC off + +ip netns exec $NS_DST ethtool -K veth$DST gro on +ip -n $NS_DST link set dev veth$DST xdp off +chk_gro_flag " - after gro on xdp off, gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip -n $NS_DST link set dev veth$DST up +ip -n $NS_SRC link set dev veth$SRC up +chk_gro " - aggregation" 1 + +ip netns exec $NS_DST ethtool -K veth$DST gro off +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +chk_gro "aggregation again with default and TSO off" 10 + +exit $ret -- cgit v1.2.3 From cbaa683bb3923df4d3c12481bff6cb6d8fdbc060 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 12 Apr 2021 17:19:00 +0200 Subject: bpf: Sync bpf headers in tooling infrastucture Synchronize tools/include/uapi/linux/bpf.h which was missing changes from various commits: - f3c45326ee71 ("bpf: Document PROG_TEST_RUN limitations") - e5e35e754c28 ("bpf: BPF-helper for MTU checking add length input") Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 69902603012c..e1ee1be7e49b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -312,6 +312,27 @@ union bpf_iter_link_info { * *ctx_out*, *data_out* (for example, packet data), result of the * execution *retval*, and *duration* of the test run. * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_XDP** + * *ctx_in* and *ctx_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -4578,7 +4599,7 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based + * Check packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. * @@ -4595,6 +4616,14 @@ union bpf_attr { * against the current net device. This is practical if this isn't * used prior to redirect. * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * * The Linux kernel route table can configure MTUs on a more * specific per route level, which is not provided by this helper. * For route level MTU checks use the **bpf_fib_lookup**\ () @@ -4619,11 +4648,9 @@ union bpf_attr { * * On return *mtu_len* pointer contains the MTU value of the net * device. Remember the net device configured MTU is the L3 size, - * which is returned here and XDP and TX length operate at L2. + * which is returned here and XDP and TC length operate at L2. * Helper take this into account for you, but remember when using - * MTU value in your BPF-code. On input *mtu_len* must be a valid - * pointer and be initialized (to zero), else verifier will reject - * BPF program. + * MTU value in your BPF-code. * * Return * * 0 on success, and populate MTU value in *mtu_len* pointer. -- cgit v1.2.3 From 5c507329000e282dce91e6c98ee6ffa61a8a5e49 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Mon, 12 Apr 2021 16:24:32 -0300 Subject: libbpf: Clarify flags in ringbuf helpers In 'bpf_ringbuf_reserve()' we require the flag to '0' at the moment. For 'bpf_ringbuf_{discard,submit,output}' a flag of '0' might send a notification to the process if needed. Signed-off-by: Pedro Tammela Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210412192434.944343-1-pctammela@mojatatu.com --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'tools') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e1ee1be7e49b..85c924bc21b1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4082,12 +4082,20 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. @@ -4099,6 +4107,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4109,6 +4121,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e1ee1be7e49b..85c924bc21b1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4082,12 +4082,20 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. @@ -4099,6 +4107,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4109,6 +4121,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * -- cgit v1.2.3 From 441e8c66b23e027c00ccebd70df9fd933918eefe Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 13 Apr 2021 11:16:06 +0200 Subject: bpf: Return target info when a tracing bpf_link is queried MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is currently no way to discover the target of a tracing program attachment after the fact. Add this information to bpf_link_info and return it when querying the bpf_link fd. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210413091607.58945-1-toke@redhat.com --- include/linux/bpf_verifier.h | 9 +++++++++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 3 +++ tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 16 insertions(+) (limited to 'tools') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 51c2ffa3d901..6023a1367853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -487,6 +487,15 @@ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } +/* unpack the IDs from the key as constructed above */ +static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id) +{ + if (obj_id) + *obj_id = key >> 32; + if (btf_id) + *btf_id = key & 0x7FFFFFFF; +} + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85c924bc21b1..df164a44bb41 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5416,6 +5416,8 @@ struct bpf_link_info { } raw_tracepoint; struct { __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6428634da57e..fd495190115e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2551,6 +2551,9 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, container_of(link, struct bpf_tracing_link, link); info->tracing.attach_type = tr_link->attach_type; + bpf_trampoline_unpack_key(tr_link->trampoline->key, + &info->tracing.target_obj_id, + &info->tracing.target_btf_id); return 0; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 85c924bc21b1..df164a44bb41 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5416,6 +5416,8 @@ struct bpf_link_info { } raw_tracepoint; struct { __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; -- cgit v1.2.3 From 463c2149ede72b696c42b0d6c5a03c061600d04c Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 13 Apr 2021 11:16:07 +0200 Subject: selftests/bpf: Add tests for target information in bpf_link info queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the fexit_bpf2bpf test to check that the info for the bpf_link returned by the kernel matches the expected values. While we're updating the test, change existing uses of CHEC() to use the much easier to read ASSERT_*() macros. v2: - Convert last CHECK() call and get rid of 'duration' var - Split ASSERT_OK_PTR() checks to two separate if statements Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210413091607.58945-2-toke@redhat.com --- .../selftests/bpf/prog_tests/fexit_bpf2bpf.c | 58 ++++++++++++++++------ 1 file changed, 44 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index 5c0448910426..63990842d20f 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -58,42 +58,73 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, test_cb cb) { struct bpf_object *obj = NULL, *tgt_obj; + __u32 retval, tgt_prog_id, info_len; + struct bpf_prog_info prog_info = {}; struct bpf_program **prog = NULL; struct bpf_link **link = NULL; - __u32 duration = 0, retval; int err, tgt_fd, i; + struct btf *btf; err = bpf_prog_load(target_obj_file, BPF_PROG_TYPE_UNSPEC, &tgt_obj, &tgt_fd); - if (CHECK(err, "tgt_prog_load", "file %s err %d errno %d\n", - target_obj_file, err, errno)) + if (!ASSERT_OK(err, "tgt_prog_load")) return; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, .attach_prog_fd = tgt_fd, ); + info_len = sizeof(prog_info); + err = bpf_obj_get_info_by_fd(tgt_fd, &prog_info, &info_len); + if (!ASSERT_OK(err, "tgt_fd_get_info")) + goto close_prog; + + tgt_prog_id = prog_info.id; + btf = bpf_object__btf(tgt_obj); + link = calloc(sizeof(struct bpf_link *), prog_cnt); + if (!ASSERT_OK_PTR(link, "link_ptr")) + goto close_prog; + prog = calloc(sizeof(struct bpf_program *), prog_cnt); - if (CHECK(!link || !prog, "alloc_memory", "failed to alloc memory")) + if (!ASSERT_OK_PTR(prog, "prog_ptr")) goto close_prog; obj = bpf_object__open_file(obj_file, &opts); - if (CHECK(IS_ERR_OR_NULL(obj), "obj_open", - "failed to open %s: %ld\n", obj_file, - PTR_ERR(obj))) + if (!ASSERT_OK_PTR(obj, "obj_open")) goto close_prog; err = bpf_object__load(obj); - if (CHECK(err, "obj_load", "err %d\n", err)) + if (!ASSERT_OK(err, "obj_load")) goto close_prog; for (i = 0; i < prog_cnt; i++) { + struct bpf_link_info link_info; + char *tgt_name; + __s32 btf_id; + + tgt_name = strstr(prog_name[i], "/"); + if (!ASSERT_OK_PTR(tgt_name, "tgt_name")) + goto close_prog; + btf_id = btf__find_by_name_kind(btf, tgt_name + 1, BTF_KIND_FUNC); + prog[i] = bpf_object__find_program_by_title(obj, prog_name[i]); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", prog_name[i])) + if (!ASSERT_OK_PTR(prog[i], prog_name[i])) goto close_prog; + link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) + if (!ASSERT_OK_PTR(link[i], "attach_trace")) goto close_prog; + + info_len = sizeof(link_info); + memset(&link_info, 0, sizeof(link_info)); + err = bpf_obj_get_info_by_fd(bpf_link__fd(link[i]), + &link_info, &info_len); + ASSERT_OK(err, "link_fd_get_info"); + ASSERT_EQ(link_info.tracing.attach_type, + bpf_program__get_expected_attach_type(prog[i]), + "link_attach_type"); + ASSERT_EQ(link_info.tracing.target_obj_id, tgt_prog_id, "link_tgt_obj_id"); + ASSERT_EQ(link_info.tracing.target_btf_id, btf_id, "link_tgt_btf_id"); } if (cb) { @@ -106,10 +137,9 @@ static void test_fexit_bpf2bpf_common(const char *obj_file, goto close_prog; err = bpf_prog_test_run(tgt_fd, 1, &pkt_v6, sizeof(pkt_v6), - NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", - "err %d errno %d retval %d duration %d\n", - err, errno, retval, duration); + NULL, NULL, &retval, NULL); + ASSERT_OK(err, "prog_run"); + ASSERT_EQ(retval, 0, "prog_run_ret"); if (check_data_map(obj, prog_cnt, false)) goto close_prog; -- cgit v1.2.3 From 5676dba708bbb1fc94a9d3b2e9c114db9e4c6699 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Fri, 19 Mar 2021 20:35:27 +0800 Subject: perf annotate: Fix sample events lost in stdio mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In hist__find_annotations(), since different 'struct hist_entry' entries may point to same symbol, we free notes->src to signal already processed this symbol in stdio mode; when annotate, entry will skipped if notes->src is NULL to avoid repeated output. However, there is a problem, for example, run the following command: # perf record -e branch-misses -e branch-instructions -a sleep 1 perf.data file contains different types of sample event. If the same IP sample event exists in branch-misses and branch-instructions, this event uses the same symbol. When annotate branch-misses events, notes->src corresponding to this event is set to null, as a result, when annotate branch-instructions events, this event is skipped and no annotate is output. Solution of this patch is to remove zfree in hists__find_annotations and change sort order to "dso,symbol" to avoid duplicate output when different processes correspond to the same symbol. Signed-off-by: Yang Jihong Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Gustavo A. R. Silva Cc: Jin Yao Cc: Jiri Olsa Cc: Mark Rutland Cc: Martin Liška Cc: Peter Zijlstra Cc: zhangjinhao2@huawei.com Link: http://lore.kernel.org/lkml/20210319123527.173883-1-yangjihong1@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 021d974c978e..524e6f0dff22 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -374,13 +374,6 @@ find_next: } else { hist_entry__tty_annotate(he, evsel, ann); nd = rb_next(nd); - /* - * Since we have a hist_entry per IP for the same - * symbol, free he->ms.sym->src to signal we already - * processed this symbol. - */ - zfree(¬es->src->cycles_hist); - zfree(¬es->src); } } } @@ -623,14 +616,22 @@ int cmd_annotate(int argc, const char **argv) setup_browser(true); - if ((use_browser == 1 || annotate.use_stdio2) && annotate.has_br_stack) { + /* + * Events of different processes may correspond to the same + * symbol, we do not care about the processes in annotate, + * set sort order to avoid repeated output. + */ + sort_order = "dso,symbol"; + + /* + * Set SORT_MODE__BRANCH so that annotate display IPC/Cycle + * if branch info is in perf data in TUI mode. + */ + if ((use_browser == 1 || annotate.use_stdio2) && annotate.has_br_stack) sort__mode = SORT_MODE__BRANCH; - if (setup_sorting(annotate.session->evlist) < 0) - usage_with_options(annotate_usage, options); - } else { - if (setup_sorting(NULL) < 0) - usage_with_options(annotate_usage, options); - } + + if (setup_sorting(NULL) < 0) + usage_with_options(annotate_usage, options); ret = __cmd_annotate(&annotate); -- cgit v1.2.3 From 1969b3c60db675040ec0d1b09698807647aac7ed Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Wed, 14 Apr 2021 17:56:32 +0200 Subject: selftests/bpf: Fix the ASSERT_ERR_PTR macro It is just missing a ';'. This macro is not used by any test yet. Fixes: 22ba36351631 ("selftests/bpf: Move and extend ASSERT_xxx() testing macros") Signed-off-by: Florent Revest Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210414155632.737866-1-revest@chromium.org --- tools/testing/selftests/bpf/test_progs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index e87c8546230e..ee7e3b45182a 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -210,7 +210,7 @@ extern int test__join_cgroup(const char *path); #define ASSERT_ERR_PTR(ptr, name) ({ \ static int duration = 0; \ const void *___res = (ptr); \ - bool ___ok = IS_ERR(___res) \ + bool ___ok = IS_ERR(___res); \ CHECK(!___ok, (name), "unexpected pointer: %p\n", ___res); \ ___ok; \ }) -- cgit v1.2.3 From 069904ce318e0e15dc67f3c2829303237c5e912b Mon Sep 17 00:00:00 2001 From: zuoqilin Date: Wed, 14 Apr 2021 22:16:39 +0800 Subject: tools/testing: Remove unused variable Remove unused variable "ret2". Signed-off-by: zuoqilin Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210414141639.1446-1-zuoqilin1@163.com --- tools/testing/selftests/bpf/progs/test_tunnel_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index ba6eadfec565..e7b673117436 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -396,7 +396,7 @@ int _ip6vxlan_get_tunnel(struct __sk_buff *skb) SEC("geneve_set_tunnel") int _geneve_set_tunnel(struct __sk_buff *skb) { - int ret, ret2; + int ret; struct bpf_tunnel_key key; struct geneve_opt gopt; -- cgit v1.2.3 From 9865ea8ab31f2e56be59125099ee251ce573f293 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 14 Apr 2021 10:08:16 -0300 Subject: perf evlist: Add a method to return the list of evsels as a string Add a 'scnprintf' method to obtain the list of evsels in a evlist as a string, excluding the "dummy" event used for things like receiving metadata events (PERF_RECORD_FORK, MMAP, etc) when synthesizing preexisting threads. Will be used to improve the error message for workload failure in 'perf record. Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lore.kernel.org/lkml/20210414131628.2064862-2-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 19 +++++++++++++++++++ tools/perf/util/evlist.h | 2 ++ 2 files changed, 21 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index f1c79ecf8107..d29a8a118973 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -2138,3 +2138,22 @@ struct evsel *evlist__find_evsel(struct evlist *evlist, int idx) } return NULL; } + +int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf) +{ + struct evsel *evsel; + int printed = 0; + + evlist__for_each_entry(evlist, evsel) { + if (evsel__is_dummy_event(evsel)) + continue; + if (size > (strlen(evsel__name(evsel)) + (printed ? 2 : 1))) { + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "," : "", evsel__name(evsel)); + } else { + printed += scnprintf(bf + printed, size - printed, "%s...", printed ? "," : ""); + break; + } + } + + return printed; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index b695ffaae519..a8b97b50cceb 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -365,4 +365,6 @@ int evlist__ctlfd_ack(struct evlist *evlist); #define EVLIST_DISABLED_MSG "Events disabled\n" struct evsel *evlist__find_evsel(struct evlist *evlist, int idx); + +int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf); #endif /* __PERF_EVLIST_H */ -- cgit v1.2.3 From 3535a6967c0d590381c16d6676c6fdfa60f4d733 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 14 Apr 2021 09:32:14 -0300 Subject: perf record: Improve 'Workload failed' message printing events + what was exec'ed Before: # perf record -a cycles,instructions,cache-misses Workload failed: No such file or directory # After: # perf record -a cycles,instructions,cache-misses Failed to collect 'cycles' for the 'cycles,instructions,cache-misses' workload: No such file or directory # Helps disambiguating other error scenarios: # perf record -a -e cycles,instructions,cache-misses bla Failed to collect 'cycles,instructions,cache-misses' for the 'bla' workload: No such file or directory # perf record -a cycles,instructions,cache-misses sleep 1 Failed to collect 'cycles' for the 'cycles,instructions,cache-misses' workload: No such file or directory # When all goes well we're back to the usual: # perf record -a -e cycles,instructions,cache-misses sleep 1 [ perf record: Woken up 3 times to write data ] [ perf record: Captured and wrote 3.151 MB perf.data (21242 samples) ] # Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lore.kernel.org/lkml/20210414131628.2064862-3-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 35465d1db6dd..5fb9665a2ec2 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1977,9 +1977,13 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) record__auxtrace_snapshot_exit(rec); if (forks && workload_exec_errno) { - char msg[STRERR_BUFSIZE]; + char msg[STRERR_BUFSIZE], strevsels[2048]; const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg)); - pr_err("Workload failed: %s\n", emsg); + + evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels); + + pr_err("Failed to collect '%s' for the '%s' workload: %s\n", + strevsels, argv[0], emsg); err = -1; goto out_child; } -- cgit v1.2.3 From 2e1daee14e67fbf9b27280b974e2c680a22cabea Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Wed, 14 Apr 2021 21:27:23 +0300 Subject: perf beauty: Fix fsconfig generator After gnulib update sed stopped matching `[[:space:]]*+' as before, causing the following compilation error: In file included from builtin-trace.c:719: trace/beauty/generated/fsconfig_arrays.c:2:3: error: expected expression before ']' token 2 | [] = "", | ^ trace/beauty/generated/fsconfig_arrays.c:2:3: error: array index in initializer not of integer type trace/beauty/generated/fsconfig_arrays.c:2:3: note: (near initialization for 'fsconfig_cmds') Fix this by correcting the regular expression used in the generator. Also, clean up the script by removing redundant egrep, xargs, and printf invocations. Committer testing: Continues to work: $ cat tools/perf/trace/beauty/fsconfig.sh #!/bin/sh # SPDX-License-Identifier: LGPL-2.1 if [ $# -ne 1 ] ; then linux_header_dir=tools/include/uapi/linux else linux_header_dir=$1 fi linux_mount=${linux_header_dir}/mount.h printf "static const char *fsconfig_cmds[] = {\n" ms='[[:space:]]*' sed -nr "s/^${ms}FSCONFIG_([[:alnum:]_]+)${ms}=${ms}([[:digit:]]+)${ms},.*/\t[\2] = \"\1\",/p" \ ${linux_mount} printf "};\n" $ tools/perf/trace/beauty/fsconfig.sh static const char *fsconfig_cmds[] = { [0] = "SET_FLAG", [1] = "SET_STRING", [2] = "SET_BINARY", [3] = "SET_PATH", [4] = "SET_PATH_EMPTY", [5] = "SET_FD", [6] = "CMD_CREATE", [7] = "CMD_RECONFIGURE", }; $ Fixes: d35293004a5e4 ("perf beauty: Add generator for fsconfig's 'cmd' arg values") Signed-off-by: Vitaly Chikunov Co-authored-by: Dmitry V. Levin Tested-by: Arnaldo Carvalho de Melo Link: http://lore.kernel.org/lkml/20210414182723.1670663-1-vt@altlinux.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/fsconfig.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/trace/beauty/fsconfig.sh b/tools/perf/trace/beauty/fsconfig.sh index 83fb24df05c9..bc6ef7bb7a5f 100755 --- a/tools/perf/trace/beauty/fsconfig.sh +++ b/tools/perf/trace/beauty/fsconfig.sh @@ -10,8 +10,7 @@ fi linux_mount=${linux_header_dir}/mount.h printf "static const char *fsconfig_cmds[] = {\n" -regex='^[[:space:]]*+FSCONFIG_([[:alnum:]_]+)[[:space:]]*=[[:space:]]*([[:digit:]]+)[[:space:]]*,[[:space:]]*.*' -egrep $regex ${linux_mount} | \ - sed -r "s/$regex/\2 \1/g" | \ - xargs printf "\t[%s] = \"%s\",\n" +ms='[[:space:]]*' +sed -nr "s/^${ms}FSCONFIG_([[:alnum:]_]+)${ms}=${ms}([[:digit:]]+)${ms},.*/\t[\2] = \"\1\",/p" \ + ${linux_mount} printf "};\n" -- cgit v1.2.3 From 2fc83c2cd77703cfcfc1ffaa092614fb1f837292 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 14 Apr 2021 10:54:09 -0500 Subject: tools include: Add an initial math64.h Add an initial math64.h similar to linux/math64.h with functions mul_u64_u64_div64() and mul_u64_u32_shr(). This isn't a direct copy of include/linux/math64.h as that doesn't define mul_u64_u64_div64(). Implementation was written by Peter Zilkstra based on linux/math64.h and div64.h[1]. The original implementation was not optimal on arm64 as __int128 division is not optimal with a call out to __udivti3, so I dropped the __int128 variant of mul_u64_u64_div64(). [1] https://lore.kernel.org/lkml/20200322101848.GF2452@worktop.programming.kicks-ass.net/ Signed-off-by: Rob Herring Acked-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Catalin Marinas Cc: Itaru Kitayama Cc: Mark Rutland Cc: Peter Zijlstra Cc: Will Deacon Link: http://lore.kernel.org/lkml/20210414155412.3697605-2-robh@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/math64.h | 75 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 tools/include/linux/math64.h (limited to 'tools') diff --git a/tools/include/linux/math64.h b/tools/include/linux/math64.h new file mode 100644 index 000000000000..4ad45d5943dc --- /dev/null +++ b/tools/include/linux/math64.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MATH64_H +#define _LINUX_MATH64_H + +#include + +#ifdef __x86_64__ +static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c) +{ + u64 q; + + asm ("mulq %2; divq %3" : "=a" (q) + : "a" (a), "rm" (b), "rm" (c) + : "rdx"); + + return q; +} +#define mul_u64_u64_div64 mul_u64_u64_div64 +#endif + +#ifdef __SIZEOF_INT128__ +static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift) +{ + return (u64)(((unsigned __int128)a * b) >> shift); +} + +#else + +#ifdef __i386__ +static inline u64 mul_u32_u32(u32 a, u32 b) +{ + u32 high, low; + + asm ("mull %[b]" : "=a" (low), "=d" (high) + : [a] "a" (a), [b] "rm" (b) ); + + return low | ((u64)high) << 32; +} +#else +static inline u64 mul_u32_u32(u32 a, u32 b) +{ + return (u64)a * b; +} +#endif + +static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift) +{ + u32 ah, al; + u64 ret; + + al = a; + ah = a >> 32; + + ret = mul_u32_u32(al, b) >> shift; + if (ah) + ret += mul_u32_u32(ah, b) << (32 - shift); + + return ret; +} + +#endif /* __SIZEOF_INT128__ */ + +#ifndef mul_u64_u64_div64 +static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c) +{ + u64 quot, rem; + + quot = a / c; + rem = a % c; + + return quot * b + (rem * b) / c; +} +#endif + +#endif /* _LINUX_MATH64_H */ -- cgit v1.2.3 From 6cd70754f262e593febc06a02d7ea637c927ea42 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 14 Apr 2021 11:07:37 -0500 Subject: libperf: Add evsel mmap support In order to support usersapce access, an event must be mmapped. While there's already mmap support for evlist, the usecase is a bit different than the self monitoring with userspace access. So let's add new perf_evsel__mmap()/perf_evsel_munmap() functions to mmap/munmap an evsel. This allows implementing userspace access as a fastpath for perf_evsel__read(). The mmapped address is returned by perf_evsel__mmap_base() which primarily for users/tests to check if userspace access is enabled. Signed-off-by: Rob Herring Acked-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Catalin Marinas Cc: Itaru Kitayama Cc: Mark Rutland Cc: Peter Zijlstra Cc: Will Deacon Link: http://lore.kernel.org/lkml/20210414155412.3697605-2-robh@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/Documentation/libperf.txt | 3 ++ tools/lib/perf/evsel.c | 76 ++++++++++++++++++++++++++++++++ tools/lib/perf/include/internal/evsel.h | 1 + tools/lib/perf/include/perf/evsel.h | 3 ++ tools/lib/perf/libperf.map | 3 ++ 5 files changed, 86 insertions(+) (limited to 'tools') diff --git a/tools/lib/perf/Documentation/libperf.txt b/tools/lib/perf/Documentation/libperf.txt index 0c74c30ed23a..63ae5e0195ce 100644 --- a/tools/lib/perf/Documentation/libperf.txt +++ b/tools/lib/perf/Documentation/libperf.txt @@ -136,6 +136,9 @@ SYNOPSIS struct perf_thread_map *threads); void perf_evsel__close(struct perf_evsel *evsel); void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu); + int perf_evsel__mmap(struct perf_evsel *evsel, int pages); + void perf_evsel__munmap(struct perf_evsel *evsel); + void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread); int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread, struct perf_counts_values *count); int perf_evsel__enable(struct perf_evsel *evsel); diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index 4dc06289f4c7..cd203998e875 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -11,10 +11,12 @@ #include #include #include +#include #include #include #include #include +#include void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr) { @@ -38,6 +40,7 @@ void perf_evsel__delete(struct perf_evsel *evsel) } #define FD(e, x, y) (*(int *) xyarray__entry(e->fd, x, y)) +#define MMAP(e, x, y) (e->mmap ? ((struct perf_mmap *) xyarray__entry(e->mmap, x, y)) : NULL) int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) { @@ -55,6 +58,13 @@ int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) return evsel->fd != NULL ? 0 : -ENOMEM; } +static int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads) +{ + evsel->mmap = xyarray__new(ncpus, nthreads, sizeof(struct perf_mmap)); + + return evsel->mmap != NULL ? 0 : -ENOMEM; +} + static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, @@ -156,6 +166,72 @@ void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu) perf_evsel__close_fd_cpu(evsel, cpu); } +void perf_evsel__munmap(struct perf_evsel *evsel) +{ + int cpu, thread; + + if (evsel->fd == NULL || evsel->mmap == NULL) + return; + + for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) { + for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) { + int fd = FD(evsel, cpu, thread); + struct perf_mmap *map = MMAP(evsel, cpu, thread); + + if (fd < 0) + continue; + + perf_mmap__munmap(map); + } + } + + xyarray__delete(evsel->mmap); + evsel->mmap = NULL; +} + +int perf_evsel__mmap(struct perf_evsel *evsel, int pages) +{ + int ret, cpu, thread; + struct perf_mmap_param mp = { + .prot = PROT_READ | PROT_WRITE, + .mask = (pages * page_size) - 1, + }; + + if (evsel->fd == NULL || evsel->mmap) + return -EINVAL; + + if (perf_evsel__alloc_mmap(evsel, xyarray__max_x(evsel->fd), xyarray__max_y(evsel->fd)) < 0) + return -ENOMEM; + + for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) { + for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) { + int fd = FD(evsel, cpu, thread); + struct perf_mmap *map = MMAP(evsel, cpu, thread); + + if (fd < 0) + continue; + + perf_mmap__init(map, NULL, false, NULL); + + ret = perf_mmap__mmap(map, &mp, fd, cpu); + if (ret) { + perf_evsel__munmap(evsel); + return ret; + } + } + } + + return 0; +} + +void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread) +{ + if (FD(evsel, cpu, thread) < 0 || MMAP(evsel, cpu, thread) == NULL) + return NULL; + + return MMAP(evsel, cpu, thread)->base; +} + int perf_evsel__read_size(struct perf_evsel *evsel) { u64 read_format = evsel->attr.read_format; diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h index 1ffd083b235e..1c067d088bc6 100644 --- a/tools/lib/perf/include/internal/evsel.h +++ b/tools/lib/perf/include/internal/evsel.h @@ -41,6 +41,7 @@ struct perf_evsel { struct perf_cpu_map *own_cpus; struct perf_thread_map *threads; struct xyarray *fd; + struct xyarray *mmap; struct xyarray *sample_id; u64 *id; u32 ids; diff --git a/tools/lib/perf/include/perf/evsel.h b/tools/lib/perf/include/perf/evsel.h index c82ec39a4ad0..60eae25076d3 100644 --- a/tools/lib/perf/include/perf/evsel.h +++ b/tools/lib/perf/include/perf/evsel.h @@ -27,6 +27,9 @@ LIBPERF_API int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map * struct perf_thread_map *threads); LIBPERF_API void perf_evsel__close(struct perf_evsel *evsel); LIBPERF_API void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu); +LIBPERF_API int perf_evsel__mmap(struct perf_evsel *evsel, int pages); +LIBPERF_API void perf_evsel__munmap(struct perf_evsel *evsel); +LIBPERF_API void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread); LIBPERF_API int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread, struct perf_counts_values *count); LIBPERF_API int perf_evsel__enable(struct perf_evsel *evsel); diff --git a/tools/lib/perf/libperf.map b/tools/lib/perf/libperf.map index 7be1af8a546c..c0c7ceb11060 100644 --- a/tools/lib/perf/libperf.map +++ b/tools/lib/perf/libperf.map @@ -23,6 +23,9 @@ LIBPERF_0.0.1 { perf_evsel__disable; perf_evsel__open; perf_evsel__close; + perf_evsel__mmap; + perf_evsel__munmap; + perf_evsel__mmap_base; perf_evsel__read; perf_evsel__cpus; perf_evsel__threads; -- cgit v1.2.3 From d3003d9e686890a1e9f0cc7c08aa02ef2953b1f0 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 14 Apr 2021 11:07:38 -0500 Subject: libperf tests: Add support for verbose printing Add __T_VERBOSE() so tests can add verbose output. The verbose output is enabled with the '-v' command line option. Running 'make tests V=1' will enable the '-v' option when running the tests. It'll be used in the next patch, for a user space counter access test. Signed-off-by: Rob Herring Acked-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Catalin Marinas Cc: Itaru Kitayama Cc: Mark Rutland Cc: Peter Zijlstra Cc: Will Deacon Link: http://lore.kernel.org/lkml/20210414155412.3697605-3-robh@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/internal/tests.h | 32 ++++++++++++++++++++++++++++++++ tools/lib/perf/tests/Makefile | 6 ++++-- 2 files changed, 36 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/perf/include/internal/tests.h b/tools/lib/perf/include/internal/tests.h index 2093e8868a67..29425c2dabe1 100644 --- a/tools/lib/perf/include/internal/tests.h +++ b/tools/lib/perf/include/internal/tests.h @@ -3,11 +3,32 @@ #define __LIBPERF_INTERNAL_TESTS_H #include +#include int tests_failed; +int tests_verbose; + +static inline int get_verbose(char **argv, int argc) +{ + int c; + int verbose = 0; + + while ((c = getopt(argc, argv, "v")) != -1) { + switch (c) + { + case 'v': + verbose = 1; + break; + default: + break; + } + } + return verbose; +} #define __T_START \ do { \ + tests_verbose = get_verbose(argv, argc); \ fprintf(stdout, "- running %s...", __FILE__); \ fflush(NULL); \ tests_failed = 0; \ @@ -30,4 +51,15 @@ do { } \ } while (0) +#define __T_VERBOSE(...) \ +do { \ + if (tests_verbose) { \ + if (tests_verbose == 1) { \ + fputc('\n', stderr); \ + tests_verbose++; \ + } \ + fprintf(stderr, ##__VA_ARGS__); \ + } \ +} while (0) + #endif /* __LIBPERF_INTERNAL_TESTS_H */ diff --git a/tools/lib/perf/tests/Makefile b/tools/lib/perf/tests/Makefile index 96841775feaf..b536cc9a26dd 100644 --- a/tools/lib/perf/tests/Makefile +++ b/tools/lib/perf/tests/Makefile @@ -5,6 +5,8 @@ TESTS = test-cpumap test-threadmap test-evlist test-evsel TESTS_SO := $(addsuffix -so,$(TESTS)) TESTS_A := $(addsuffix -a,$(TESTS)) +TEST_ARGS := $(if $(V),-v) + # Set compile option CFLAGS ifdef EXTRA_CFLAGS CFLAGS := $(EXTRA_CFLAGS) @@ -28,9 +30,9 @@ all: $(TESTS_A) $(TESTS_SO) run: @echo "running static:" - @for i in $(TESTS_A); do ./$$i; done + @for i in $(TESTS_A); do ./$$i $(TEST_ARGS); done @echo "running dynamic:" - @for i in $(TESTS_SO); do LD_LIBRARY_PATH=../ ./$$i; done + @for i in $(TESTS_SO); do LD_LIBRARY_PATH=../ ./$$i $(TEST_ARGS); done clean: $(call QUIET_CLEAN, tests)$(RM) $(TESTS_A) $(TESTS_SO) -- cgit v1.2.3 From d3d93e34bd98e4dbb002310fed08630f4b549a08 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 15 Apr 2021 07:18:17 -0700 Subject: libbpf: Remove unused field. relo->processed is set, but not used. Remove it. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210415141817.53136-1-alexei.starovoitov@gmail.com --- tools/lib/bpf/libbpf.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ed5586cce227..9cc2d45b0080 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -195,7 +195,6 @@ struct reloc_desc { int insn_idx; int map_idx; int sym_off; - bool processed; }; struct bpf_sec_def; @@ -3499,8 +3498,6 @@ static int bpf_program__record_reloc(struct bpf_program *prog, const char *sym_sec_name; struct bpf_map *map; - reloc_desc->processed = false; - if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) { pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", prog->name, sym_name, insn_idx, insn->code); @@ -6314,13 +6311,11 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) case RELO_LD64: insn[0].src_reg = BPF_PSEUDO_MAP_FD; insn[0].imm = obj->maps[relo->map_idx].fd; - relo->processed = true; break; case RELO_DATA: insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; insn[1].imm = insn[0].imm + relo->sym_off; insn[0].imm = obj->maps[relo->map_idx].fd; - relo->processed = true; break; case RELO_EXTERN_VAR: ext = &obj->externs[relo->sym_off]; @@ -6338,13 +6333,11 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) insn[1].imm = ext->ksym.addr >> 32; } } - relo->processed = true; break; case RELO_EXTERN_FUNC: ext = &obj->externs[relo->sym_off]; insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; insn[0].imm = ext->ksym.kernel_btf_id; - relo->processed = true; break; case RELO_SUBPROG_ADDR: insn[0].src_reg = BPF_PSEUDO_FUNC; @@ -6630,9 +6623,6 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, * different main programs */ insn->imm = subprog->sub_insn_off - (prog->sub_insn_off + insn_idx) - 1; - if (relo) - relo->processed = true; - pr_debug("prog '%s': insn #%zu relocated, imm %d points to subprog '%s' (now at %zu offset)\n", prog->name, insn_idx, insn->imm, subprog->name, subprog->sub_insn_off); } @@ -6725,7 +6715,7 @@ static int bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) { struct bpf_program *subprog; - int i, j, err; + int i, err; /* mark all subprogs as not relocated (yet) within the context of * current main program @@ -6736,9 +6726,6 @@ bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) continue; subprog->sub_insn_off = 0; - for (j = 0; j < subprog->nr_reloc; j++) - if (subprog->reloc_desc[j].type == RELO_CALL) - subprog->reloc_desc[j].processed = false; } err = bpf_object__reloc_code(obj, prog, prog); -- cgit v1.2.3 From 26e6dd1072763cd5696b75994c03982dde952ad9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 13 Apr 2021 08:34:13 -0700 Subject: selftests: Set CC to clang in lib.mk if LLVM is set selftests/bpf/Makefile includes lib.mk. With the following command make -j60 LLVM=1 LLVM_IAS=1 <=== compile kernel make -j60 -C tools/testing/selftests/bpf LLVM=1 LLVM_IAS=1 V=1 some files are still compiled with gcc. This patch fixed lib.mk issue which sets CC to gcc in all cases. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210413153413.3027426-1-yhs@fb.com --- tools/testing/selftests/lib.mk | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index a5ce26d548e4..9a41d8bb9ff1 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -1,6 +1,10 @@ # This mimics the top-level Makefile. We do it explicitly here so that this # Makefile can operate with or without the kbuild infrastructure. +ifneq ($(LLVM),) +CC := clang +else CC := $(CROSS_COMPILE)gcc +endif ifeq (0,$(MAKELEVEL)) ifeq ($(OUTPUT),) -- cgit v1.2.3 From f62700ce63a315b4607cc9e97aa15ea409a677b9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 13 Apr 2021 08:34:19 -0700 Subject: tools: Allow proper CC/CXX/... override with LLVM=1 in Makefile.include selftests/bpf/Makefile includes tools/scripts/Makefile.include. With the following command make -j60 LLVM=1 LLVM_IAS=1 <=== compile kernel make -j60 -C tools/testing/selftests/bpf LLVM=1 LLVM_IAS=1 V=1 some files are still compiled with gcc. This patch fixed the case if CC/AR/LD/CXX/STRIP is allowed to be overridden, it will be written to clang/llvm-ar/..., instead of gcc binaries. The definition of CC_NO_CLANG is also relocated to the place after the above CC is defined. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210413153419.3028165-1-yhs@fb.com --- tools/scripts/Makefile.include | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index a402f32a145c..91130648d8e6 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -39,8 +39,6 @@ EXTRA_WARNINGS += -Wundef EXTRA_WARNINGS += -Wwrite-strings EXTRA_WARNINGS += -Wformat -CC_NO_CLANG := $(shell $(CC) -dM -E -x c /dev/null | grep -Fq "__clang__"; echo $$?) - # Makefiles suck: This macro sets a default value of $(2) for the # variable named by $(1), unless the variable has been set by # environment or command line. This is necessary for CC and AR @@ -52,12 +50,22 @@ define allow-override $(eval $(1) = $(2))) endef +ifneq ($(LLVM),) +$(call allow-override,CC,clang) +$(call allow-override,AR,llvm-ar) +$(call allow-override,LD,ld.lld) +$(call allow-override,CXX,clang++) +$(call allow-override,STRIP,llvm-strip) +else # Allow setting various cross-compile vars or setting CROSS_COMPILE as a prefix. $(call allow-override,CC,$(CROSS_COMPILE)gcc) $(call allow-override,AR,$(CROSS_COMPILE)ar) $(call allow-override,LD,$(CROSS_COMPILE)ld) $(call allow-override,CXX,$(CROSS_COMPILE)g++) $(call allow-override,STRIP,$(CROSS_COMPILE)strip) +endif + +CC_NO_CLANG := $(shell $(CC) -dM -E -x c /dev/null | grep -Fq "__clang__"; echo $$?) ifneq ($(LLVM),) HOSTAR ?= llvm-ar -- cgit v1.2.3 From a22c0c81da644223d911466746ef414a786cb1c8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 13 Apr 2021 08:34:24 -0700 Subject: selftests/bpf: Fix test_cpp compilation failure with clang With clang compiler: make -j60 LLVM=1 LLVM_IAS=1 <=== compile kernel make -j60 -C tools/testing/selftests/bpf LLVM=1 LLVM_IAS=1 the test_cpp build failed due to the failure: warning: treating 'c-header' input as 'c++-header' when in C++ mode, this behavior is deprecated [-Wdeprecated] clang-13: error: cannot specify -o when generating multiple output files test_cpp compilation flag looks like: clang++ -g -Og -rdynamic -Wall -I<...> ... \ -Dbpf_prog_load=bpf_prog_test_load -Dbpf_load_program=bpf_test_load_program \ test_cpp.cpp <...>/test_core_extern.skel.h <...>/libbpf.a <...>/test_stub.o \ -lcap -lelf -lz -lrt -lpthread -o <...>/test_cpp The clang++ compiler complains the header file in the command line and also failed the compilation due to this. Let us remove the header file from the command line which is not intended any way, and this fixed the compilation problem. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210413153424.3028986-1-yhs@fb.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6448c626498f..dcc2dc1f2a86 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -481,7 +481,7 @@ $(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT) # Make sure we are able to include and link libbpf against c++. $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) $(call msg,CXX,,$@) - $(Q)$(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@ + $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ # Benchmark runner $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h -- cgit v1.2.3 From ef9985893caf905b769fae8984780847e8527065 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 13 Apr 2021 08:34:29 -0700 Subject: selftests/bpf: Silence clang compilation warnings With clang compiler: make -j60 LLVM=1 LLVM_IAS=1 <=== compile kernel make -j60 -C tools/testing/selftests/bpf LLVM=1 LLVM_IAS=1 Some linker flags are not used/effective for some binaries and we have warnings like: warning: -lelf: 'linker' input unused [-Wunused-command-line-argument] We also have warnings like: .../selftests/bpf/prog_tests/ns_current_pid_tgid.c:74:57: note: treat the string as an argument to avoid this if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) ^ "%s", .../selftests/bpf/test_progs.h:129:35: note: expanded from macro 'CHECK' _CHECK(condition, tag, duration, format) ^ .../selftests/bpf/test_progs.h:108:21: note: expanded from macro '_CHECK' fprintf(stdout, ##format); \ ^ The first warning can be silenced with clang option -Wno-unused-command-line-argument. For the second warning, source codes are modified as suggested by the compiler to silence the warning. Since gcc does not support the option -Wno-unused-command-line-argument and the warning only happens with clang compiler, the option -Wno-unused-command-line-argument is enabled only when clang compiler is used. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210413153429.3029377-1-yhs@fb.com --- tools/testing/selftests/bpf/Makefile | 5 +++++ tools/testing/selftests/bpf/prog_tests/fexit_sleep.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dcc2dc1f2a86..c45ae13b88a0 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -28,6 +28,11 @@ CFLAGS += -g -Og -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS) \ -Dbpf_load_program=bpf_test_load_program LDLIBS += -lcap -lelf -lz -lrt -lpthread +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_verifier_log test_dev_cgroup \ diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c index 6c4d42a2386f..ccc7e8a34ab6 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c @@ -39,7 +39,7 @@ void test_fexit_sleep(void) goto cleanup; cpid = clone(do_sleep, child_stack + STACK_SIZE, CLONE_FILES | SIGCHLD, fexit_skel); - if (CHECK(cpid == -1, "clone", strerror(errno))) + if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) goto cleanup; /* wait until first sys_nanosleep ends and second sys_nanosleep starts */ @@ -65,7 +65,7 @@ void test_fexit_sleep(void) /* kill the thread to unwind sys_nanosleep stack through the trampoline */ kill(cpid, 9); - if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) goto cleanup; if (CHECK(WEXITSTATUS(wstatus) != 0, "exitstatus", "failed")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c index 31a3114906e2..2535788e135f 100644 --- a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c +++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c @@ -68,10 +68,10 @@ static void test_ns_current_pid_tgid_new_ns(void) cpid = clone(test_current_pid_tgid, child_stack + STACK_SIZE, CLONE_NEWPID | SIGCHLD, NULL); - if (CHECK(cpid == -1, "clone", strerror(errno))) + if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno))) return; - if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno))) return; if (CHECK(WEXITSTATUS(wstatus) != 0, "newns_pidtgid", "failed")) -- cgit v1.2.3 From 8af50142763c6e70d426e45278b23d7103e5b7a7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 13 Apr 2021 08:34:35 -0700 Subject: bpftool: Fix a clang compilation warning With clang compiler: make -j60 LLVM=1 LLVM_IAS=1 <=== compile kernel # build selftests/bpf or bpftool make -j60 -C tools/testing/selftests/bpf LLVM=1 LLVM_IAS=1 make -j60 -C tools/bpf/bpftool LLVM=1 LLVM_IAS=1 the following compilation warning showed up, net.c:160:37: warning: comparison of integers of different signs: '__u32' (aka 'unsigned int') and 'int' [-Wsign-compare] for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); ^~~~~~~~~~~~~~~~~ .../tools/include/uapi/linux/netlink.h:99:24: note: expanded from macro 'NLMSG_OK' (nlh)->nlmsg_len <= (len)) ~~~~~~~~~~~~~~~~ ^ ~~~ In this particular case, "len" is defined as "int" and (nlh)->nlmsg_len is "unsigned int". The macro NLMSG_OK is defined as below in uapi/linux/netlink.h. #define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \ (nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \ (nlh)->nlmsg_len <= (len)) The clang compiler complains the comparision "(nlh)->nlmsg_len <= (len))", but in bpftool/net.c, it is already ensured that "len > 0" must be true. So theoretically the compiler could deduce that comparison of "(nlh)->nlmsg_len" and "len" is okay, but this really depends on compiler internals. Let us add an explicit type conversion (from "int" to "unsigned int") for "len" in NLMSG_OK to silence this warning right now. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210413153435.3029635-1-yhs@fb.com --- tools/bpf/bpftool/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index ff3aa0cf3997..f836d115d7d6 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -157,7 +157,7 @@ static int netlink_recv(int sock, __u32 nl_pid, __u32 seq, if (len == 0) break; - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, (unsigned int)len); nh = NLMSG_NEXT(nh, len)) { if (nh->nlmsg_pid != nl_pid) { ret = -LIBBPF_ERRNO__WRNGPID; -- cgit v1.2.3 From 874fc35cdd55e2d46161901de43ec58ca2efc5fe Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Apr 2021 18:49:55 +0300 Subject: perf intel-pt: Use aux_watermark Turns out, the default setting of attr.aux_watermark to half of the total buffer size is not very useful, especially with smaller buffers. The problem is that, after half of the buffer is filled up, the kernel updates ->aux_head and sets up the next "transaction", while observing that ->aux_tail is still zero (as userspace haven't had the chance to update it), meaning that the trace will have to stop at the end of this second "transaction". This means, for example, that the second PERF_RECORD_AUX in every trace comes with TRUNCATED flag set. Setting attr.aux_watermark to quarter of the buffer gives enough space for the ->aux_tail update to be observed and prevents the data loss. The obligatory before/after showcase: > # perf_before record -e intel_pt//u -m,8 uname > Linux > [ perf record: Woken up 6 times to write data ] > Warning: > AUX data lost 4 times out of 10! > > [ perf record: Captured and wrote 0.099 MB perf.data ] > # perf record -e intel_pt//u -m,8 uname > Linux > [ perf record: Woken up 4 times to write data ] > [ perf record: Captured and wrote 0.039 MB perf.data ] The effect is still visible with large workloads and large buffers, although less pronounced. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210414154955.49603-3-alexander.shishkin@linux.intel.com --- tools/perf/arch/x86/util/intel-pt.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index a6420c647959..6df0dc00d73a 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -776,6 +776,12 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, } } + if (!opts->auxtrace_snapshot_mode && !opts->auxtrace_sample_mode) { + u32 aux_watermark = opts->auxtrace_mmap_pages * page_size / 4; + + intel_pt_evsel->core.attr.aux_watermark = aux_watermark; + } + intel_pt_parse_terms(intel_pt_pmu->name, &intel_pt_pmu->format, "tsc", &tsc_bit); -- cgit v1.2.3 From f2c3c32f45002de19c6dec33f32fd259e82f2557 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 8 Apr 2021 12:36:02 +0200 Subject: selftests/perf_events: Add kselftest for process-wide sigtrap handling Add a kselftest for testing process-wide perf events with synchronous SIGTRAP on events (using breakpoints). In particular, we want to test that changes to the event propagate to all children, and the SIGTRAPs are in fact synchronously sent to the thread where the event occurred. Note: The "signal_stress" test case is also added later in the series to perf tool's built-in tests. The test here is more elaborate in that respect, which on one hand avoids bloating the perf tool unnecessarily, but we also benefit from structured tests with TAP-compliant output that the kselftest framework provides. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210408103605.1676875-8-elver@google.com --- tools/testing/selftests/perf_events/.gitignore | 2 + tools/testing/selftests/perf_events/Makefile | 6 + tools/testing/selftests/perf_events/config | 1 + tools/testing/selftests/perf_events/settings | 1 + .../selftests/perf_events/sigtrap_threads.c | 210 +++++++++++++++++++++ 5 files changed, 220 insertions(+) create mode 100644 tools/testing/selftests/perf_events/.gitignore create mode 100644 tools/testing/selftests/perf_events/Makefile create mode 100644 tools/testing/selftests/perf_events/config create mode 100644 tools/testing/selftests/perf_events/settings create mode 100644 tools/testing/selftests/perf_events/sigtrap_threads.c (limited to 'tools') diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore new file mode 100644 index 000000000000..4dc43e1bd79c --- /dev/null +++ b/tools/testing/selftests/perf_events/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +sigtrap_threads diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile new file mode 100644 index 000000000000..973a2c39ca83 --- /dev/null +++ b/tools/testing/selftests/perf_events/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -Wl,-no-as-needed -Wall -I../../../../usr/include +LDFLAGS += -lpthread + +TEST_GEN_PROGS := sigtrap_threads +include ../lib.mk diff --git a/tools/testing/selftests/perf_events/config b/tools/testing/selftests/perf_events/config new file mode 100644 index 000000000000..ba58ff2203e4 --- /dev/null +++ b/tools/testing/selftests/perf_events/config @@ -0,0 +1 @@ +CONFIG_PERF_EVENTS=y diff --git a/tools/testing/selftests/perf_events/settings b/tools/testing/selftests/perf_events/settings new file mode 100644 index 000000000000..6091b45d226b --- /dev/null +++ b/tools/testing/selftests/perf_events/settings @@ -0,0 +1 @@ +timeout=120 diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c b/tools/testing/selftests/perf_events/sigtrap_threads.c new file mode 100644 index 000000000000..9c0fd442da60 --- /dev/null +++ b/tools/testing/selftests/perf_events/sigtrap_threads.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for perf events with SIGTRAP across all threads. + * + * Copyright (C) 2021, Google LLC. + */ + +#define _GNU_SOURCE + +/* We need the latest siginfo from the kernel repo. */ +#include +#include +#define __have_siginfo_t 1 +#define __have_sigval_t 1 +#define __have_sigevent_t 1 +#define __siginfo_t_defined +#define __sigval_t_defined +#define __sigevent_t_defined +#define _BITS_SIGINFO_CONSTS_H 1 +#define _BITS_SIGEVENT_CONSTS_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#define NUM_THREADS 5 + +/* Data shared between test body, threads, and signal handler. */ +static struct { + int tids_want_signal; /* Which threads still want a signal. */ + int signal_count; /* Sanity check number of signals received. */ + volatile int iterate_on; /* Variable to set breakpoint on. */ + siginfo_t first_siginfo; /* First observed siginfo_t. */ +} ctx; + +/* Unique value to check si_perf is correctly set from perf_event_attr::sig_data. */ +#define TEST_SIG_DATA(addr) (~(uint64_t)(addr)) + +static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_BREAKPOINT, + .size = sizeof(attr), + .sample_period = 1, + .disabled = !enabled, + .bp_addr = (unsigned long)addr, + .bp_type = HW_BREAKPOINT_RW, + .bp_len = HW_BREAKPOINT_LEN_1, + .inherit = 1, /* Children inherit events ... */ + .inherit_thread = 1, /* ... but only cloned with CLONE_THREAD. */ + .remove_on_exec = 1, /* Required by sigtrap. */ + .sigtrap = 1, /* Request synchronous SIGTRAP on event. */ + .sig_data = TEST_SIG_DATA(addr), + }; + return attr; +} + +static void sigtrap_handler(int signum, siginfo_t *info, void *ucontext) +{ + if (info->si_code != TRAP_PERF) { + fprintf(stderr, "%s: unexpected si_code %d\n", __func__, info->si_code); + return; + } + + /* + * The data in siginfo_t we're interested in should all be the same + * across threads. + */ + if (!__atomic_fetch_add(&ctx.signal_count, 1, __ATOMIC_RELAXED)) + ctx.first_siginfo = *info; + __atomic_fetch_sub(&ctx.tids_want_signal, syscall(__NR_gettid), __ATOMIC_RELAXED); +} + +static void *test_thread(void *arg) +{ + pthread_barrier_t *barrier = (pthread_barrier_t *)arg; + pid_t tid = syscall(__NR_gettid); + int iter; + int i; + + pthread_barrier_wait(barrier); + + __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED); + iter = ctx.iterate_on; /* read */ + for (i = 0; i < iter - 1; i++) { + __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED); + ctx.iterate_on = iter; /* idempotent write */ + } + + return NULL; +} + +FIXTURE(sigtrap_threads) +{ + struct sigaction oldact; + pthread_t threads[NUM_THREADS]; + pthread_barrier_t barrier; + int fd; +}; + +FIXTURE_SETUP(sigtrap_threads) +{ + struct perf_event_attr attr = make_event_attr(false, &ctx.iterate_on); + struct sigaction action = {}; + int i; + + memset(&ctx, 0, sizeof(ctx)); + + /* Initialize sigtrap handler. */ + action.sa_flags = SA_SIGINFO | SA_NODEFER; + action.sa_sigaction = sigtrap_handler; + sigemptyset(&action.sa_mask); + ASSERT_EQ(sigaction(SIGTRAP, &action, &self->oldact), 0); + + /* Initialize perf event. */ + self->fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC); + ASSERT_NE(self->fd, -1); + + /* Spawn threads inheriting perf event. */ + pthread_barrier_init(&self->barrier, NULL, NUM_THREADS + 1); + for (i = 0; i < NUM_THREADS; i++) + ASSERT_EQ(pthread_create(&self->threads[i], NULL, test_thread, &self->barrier), 0); +} + +FIXTURE_TEARDOWN(sigtrap_threads) +{ + pthread_barrier_destroy(&self->barrier); + close(self->fd); + sigaction(SIGTRAP, &self->oldact, NULL); +} + +static void run_test_threads(struct __test_metadata *_metadata, + FIXTURE_DATA(sigtrap_threads) *self) +{ + int i; + + pthread_barrier_wait(&self->barrier); + for (i = 0; i < NUM_THREADS; i++) + ASSERT_EQ(pthread_join(self->threads[i], NULL), 0); +} + +TEST_F(sigtrap_threads, remain_disabled) +{ + run_test_threads(_metadata, self); + EXPECT_EQ(ctx.signal_count, 0); + EXPECT_NE(ctx.tids_want_signal, 0); +} + +TEST_F(sigtrap_threads, enable_event) +{ + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + run_test_threads(_metadata, self); + + EXPECT_EQ(ctx.signal_count, NUM_THREADS); + EXPECT_EQ(ctx.tids_want_signal, 0); + EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); + EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); + + /* Check enabled for parent. */ + ctx.iterate_on = 0; + EXPECT_EQ(ctx.signal_count, NUM_THREADS + 1); +} + +/* Test that modification propagates to all inherited events. */ +TEST_F(sigtrap_threads, modify_and_enable_event) +{ + struct perf_event_attr new_attr = make_event_attr(true, &ctx.iterate_on); + + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &new_attr), 0); + run_test_threads(_metadata, self); + + EXPECT_EQ(ctx.signal_count, NUM_THREADS); + EXPECT_EQ(ctx.tids_want_signal, 0); + EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); + EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); + + /* Check enabled for parent. */ + ctx.iterate_on = 0; + EXPECT_EQ(ctx.signal_count, NUM_THREADS + 1); +} + +/* Stress test event + signal handling. */ +TEST_F(sigtrap_threads, signal_stress) +{ + ctx.iterate_on = 3000; + + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + run_test_threads(_metadata, self); + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_DISABLE, 0), 0); + + EXPECT_EQ(ctx.signal_count, NUM_THREADS * ctx.iterate_on); + EXPECT_EQ(ctx.tids_want_signal, 0); + EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); + EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From 6216798bf98e82c382922f1b71ecc4a13d6e65cb Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 8 Apr 2021 12:36:03 +0200 Subject: selftests/perf_events: Add kselftest for remove_on_exec Add kselftest to test that remove_on_exec removes inherited events from child tasks. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210408103605.1676875-9-elver@google.com --- tools/testing/selftests/perf_events/.gitignore | 1 + tools/testing/selftests/perf_events/Makefile | 2 +- .../testing/selftests/perf_events/remove_on_exec.c | 260 +++++++++++++++++++++ 3 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/perf_events/remove_on_exec.c (limited to 'tools') diff --git a/tools/testing/selftests/perf_events/.gitignore b/tools/testing/selftests/perf_events/.gitignore index 4dc43e1bd79c..790c47001e77 100644 --- a/tools/testing/selftests/perf_events/.gitignore +++ b/tools/testing/selftests/perf_events/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only sigtrap_threads +remove_on_exec diff --git a/tools/testing/selftests/perf_events/Makefile b/tools/testing/selftests/perf_events/Makefile index 973a2c39ca83..fcafa5f0d34c 100644 --- a/tools/testing/selftests/perf_events/Makefile +++ b/tools/testing/selftests/perf_events/Makefile @@ -2,5 +2,5 @@ CFLAGS += -Wl,-no-as-needed -Wall -I../../../../usr/include LDFLAGS += -lpthread -TEST_GEN_PROGS := sigtrap_threads +TEST_GEN_PROGS := sigtrap_threads remove_on_exec include ../lib.mk diff --git a/tools/testing/selftests/perf_events/remove_on_exec.c b/tools/testing/selftests/perf_events/remove_on_exec.c new file mode 100644 index 000000000000..5814611a1dc7 --- /dev/null +++ b/tools/testing/selftests/perf_events/remove_on_exec.c @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for remove_on_exec. + * + * Copyright (C) 2021, Google LLC. + */ + +#define _GNU_SOURCE + +/* We need the latest siginfo from the kernel repo. */ +#include +#include +#define __have_siginfo_t 1 +#define __have_sigval_t 1 +#define __have_sigevent_t 1 +#define __siginfo_t_defined +#define __sigval_t_defined +#define __sigevent_t_defined +#define _BITS_SIGINFO_CONSTS_H 1 +#define _BITS_SIGEVENT_CONSTS_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +static volatile int signal_count; + +static struct perf_event_attr make_event_attr(void) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_HARDWARE, + .size = sizeof(attr), + .config = PERF_COUNT_HW_INSTRUCTIONS, + .sample_period = 1000, + .exclude_kernel = 1, + .exclude_hv = 1, + .disabled = 1, + .inherit = 1, + /* + * Children normally retain their inherited event on exec; with + * remove_on_exec, we'll remove their event, but the parent and + * any other non-exec'd children will keep their events. + */ + .remove_on_exec = 1, + .sigtrap = 1, + }; + return attr; +} + +static void sigtrap_handler(int signum, siginfo_t *info, void *ucontext) +{ + if (info->si_code != TRAP_PERF) { + fprintf(stderr, "%s: unexpected si_code %d\n", __func__, info->si_code); + return; + } + + signal_count++; +} + +FIXTURE(remove_on_exec) +{ + struct sigaction oldact; + int fd; +}; + +FIXTURE_SETUP(remove_on_exec) +{ + struct perf_event_attr attr = make_event_attr(); + struct sigaction action = {}; + + signal_count = 0; + + /* Initialize sigtrap handler. */ + action.sa_flags = SA_SIGINFO | SA_NODEFER; + action.sa_sigaction = sigtrap_handler; + sigemptyset(&action.sa_mask); + ASSERT_EQ(sigaction(SIGTRAP, &action, &self->oldact), 0); + + /* Initialize perf event. */ + self->fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC); + ASSERT_NE(self->fd, -1); +} + +FIXTURE_TEARDOWN(remove_on_exec) +{ + close(self->fd); + sigaction(SIGTRAP, &self->oldact, NULL); +} + +/* Verify event propagates to fork'd child. */ +TEST_F(remove_on_exec, fork_only) +{ + int status; + pid_t pid = fork(); + + if (pid == 0) { + ASSERT_EQ(signal_count, 0); + ASSERT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + while (!signal_count); + _exit(42); + } + + while (!signal_count); /* Child enables event. */ + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(WEXITSTATUS(status), 42); +} + +/* + * Verify that event does _not_ propagate to fork+exec'd child; event enabled + * after fork+exec. + */ +TEST_F(remove_on_exec, fork_exec_then_enable) +{ + pid_t pid_exec, pid_only_fork; + int pipefd[2]; + int tmp; + + /* + * Non-exec child, to ensure exec does not affect inherited events of + * other children. + */ + pid_only_fork = fork(); + if (pid_only_fork == 0) { + /* Block until parent enables event. */ + while (!signal_count); + _exit(42); + } + + ASSERT_NE(pipe(pipefd), -1); + pid_exec = fork(); + if (pid_exec == 0) { + ASSERT_NE(dup2(pipefd[1], STDOUT_FILENO), -1); + close(pipefd[0]); + execl("/proc/self/exe", "exec_child", NULL); + _exit((perror("exec failed"), 1)); + } + close(pipefd[1]); + + ASSERT_EQ(waitpid(pid_exec, &tmp, WNOHANG), 0); /* Child is running. */ + /* Wait for exec'd child to start spinning. */ + EXPECT_EQ(read(pipefd[0], &tmp, sizeof(int)), sizeof(int)); + EXPECT_EQ(tmp, 42); + close(pipefd[0]); + /* Now we can enable the event, knowing the child is doing work. */ + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + /* If the event propagated to the exec'd child, it will exit normally... */ + usleep(100000); /* ... give time for event to trigger (in case of bug). */ + EXPECT_EQ(waitpid(pid_exec, &tmp, WNOHANG), 0); /* Should still be running. */ + EXPECT_EQ(kill(pid_exec, SIGKILL), 0); + + /* Verify removal from child did not affect this task's event. */ + tmp = signal_count; + while (signal_count == tmp); /* Should not hang! */ + /* Nor should it have affected the first child. */ + EXPECT_EQ(waitpid(pid_only_fork, &tmp, 0), pid_only_fork); + EXPECT_EQ(WEXITSTATUS(tmp), 42); +} + +/* + * Verify that event does _not_ propagate to fork+exec'd child; event enabled + * before fork+exec. + */ +TEST_F(remove_on_exec, enable_then_fork_exec) +{ + pid_t pid_exec; + int tmp; + + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + + pid_exec = fork(); + if (pid_exec == 0) { + execl("/proc/self/exe", "exec_child", NULL); + _exit((perror("exec failed"), 1)); + } + + /* + * The child may exit abnormally at any time if the event propagated and + * a SIGTRAP is sent before the handler was set up. + */ + usleep(100000); /* ... give time for event to trigger (in case of bug). */ + EXPECT_EQ(waitpid(pid_exec, &tmp, WNOHANG), 0); /* Should still be running. */ + EXPECT_EQ(kill(pid_exec, SIGKILL), 0); + + /* Verify removal from child did not affect this task's event. */ + tmp = signal_count; + while (signal_count == tmp); /* Should not hang! */ +} + +TEST_F(remove_on_exec, exec_stress) +{ + pid_t pids[30]; + int i, tmp; + + for (i = 0; i < sizeof(pids) / sizeof(pids[0]); i++) { + pids[i] = fork(); + if (pids[i] == 0) { + execl("/proc/self/exe", "exec_child", NULL); + _exit((perror("exec failed"), 1)); + } + + /* Some forked with event disabled, rest with enabled. */ + if (i > 10) + EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0); + } + + usleep(100000); /* ... give time for event to trigger (in case of bug). */ + + for (i = 0; i < sizeof(pids) / sizeof(pids[0]); i++) { + /* All children should still be running. */ + EXPECT_EQ(waitpid(pids[i], &tmp, WNOHANG), 0); + EXPECT_EQ(kill(pids[i], SIGKILL), 0); + } + + /* Verify event is still alive. */ + tmp = signal_count; + while (signal_count == tmp); +} + +/* For exec'd child. */ +static void exec_child(void) +{ + struct sigaction action = {}; + const int val = 42; + + /* Set up sigtrap handler in case we erroneously receive a trap. */ + action.sa_flags = SA_SIGINFO | SA_NODEFER; + action.sa_sigaction = sigtrap_handler; + sigemptyset(&action.sa_mask); + if (sigaction(SIGTRAP, &action, NULL)) + _exit((perror("sigaction failed"), 1)); + + /* Signal parent that we're starting to spin. */ + if (write(STDOUT_FILENO, &val, sizeof(int)) == -1) + _exit((perror("write failed"), 1)); + + /* Should hang here until killed. */ + while (!signal_count); +} + +#define main test_main +TEST_HARNESS_MAIN +#undef main +int main(int argc, char *argv[]) +{ + if (!strcmp(argv[0], "exec_child")) { + exec_child(); + return 1; + } + + return test_main(argc, argv); +} -- cgit v1.2.3 From dc65fe82fb07e610e03a9b05bd445f46f93175f5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 15 Apr 2021 16:45:02 -0700 Subject: selftests: mptcp: add packet mark test case Extend mptcp_connect tool with SO_MARK support (-M ) and add a test case that checks that the packet mark gets copied to all subflows. This is done by only allowing packets with either skb->mark 1 or 2 via iptables. DROP rule packet counter is checked; if its not zero, print an error message and fail the test case. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/Makefile | 2 +- tools/testing/selftests/net/mptcp/mptcp_connect.c | 23 +- tools/testing/selftests/net/mptcp/mptcp_sockopt.sh | 276 +++++++++++++++++++++ 3 files changed, 299 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/net/mptcp/mptcp_sockopt.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index 00bb158b4a5d..f1464f09b080 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -6,7 +6,7 @@ KSFT_KHDR_INSTALL := 1 CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ - simult_flows.sh + simult_flows.sh mptcp_sockopt.sh TEST_GEN_FILES = mptcp_connect pm_nl_ctl diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 69d89b5d666f..2f207cf33661 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -57,6 +57,7 @@ static bool cfg_join; static bool cfg_remove; static unsigned int cfg_do_w; static int cfg_wait; +static uint32_t cfg_mark; static void die_usage(void) { @@ -69,6 +70,7 @@ static void die_usage(void) fprintf(stderr, "\t-p num -- use port num\n"); fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n"); fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n"); + fprintf(stderr, "\t-M mark -- set socket packet mark\n"); fprintf(stderr, "\t-u -- check mptcp ulp\n"); fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n"); exit(1); @@ -140,6 +142,17 @@ static void set_sndbuf(int fd, unsigned int size) } } +static void set_mark(int fd, uint32_t mark) +{ + int err; + + err = setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)); + if (err) { + perror("set SO_MARK"); + exit(1); + } +} + static int sock_listen_mptcp(const char * const listenaddr, const char * const port) { @@ -248,6 +261,9 @@ static int sock_connect_mptcp(const char * const remoteaddr, continue; } + if (cfg_mark) + set_mark(sock, cfg_mark); + if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) break; /* success */ @@ -830,7 +846,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "6jr:lp:s:hut:m:S:R:w:")) != -1) { + while ((c = getopt(argc, argv, "6jr:lp:s:hut:m:S:R:w:M:")) != -1) { switch (c) { case 'j': cfg_join = true; @@ -880,6 +896,9 @@ static void parse_opts(int argc, char **argv) case 'w': cfg_wait = atoi(optarg)*1000000; break; + case 'M': + cfg_mark = strtol(optarg, NULL, 0); + break; } } @@ -911,6 +930,8 @@ int main(int argc, char *argv[]) set_rcvbuf(fd, cfg_rcvbuf); if (cfg_sndbuf) set_sndbuf(fd, cfg_sndbuf); + if (cfg_mark) + set_mark(fd, cfg_mark); return main_loop_s(fd); } diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh new file mode 100755 index 000000000000..2fa13946ac04 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -0,0 +1,276 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ret=0 +sin="" +sout="" +cin="" +cout="" +ksft_skip=4 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) +mptcp_connect="" +do_all_tests=1 + +add_mark_rules() +{ + local ns=$1 + local m=$2 + + for t in iptables ip6tables; do + # just to debug: check we have multiple subflows connection requests + ip netns exec $ns $t -A OUTPUT -p tcp --syn -m mark --mark $m -j ACCEPT + + # RST packets might be handled by a internal dummy socket + ip netns exec $ns $t -A OUTPUT -p tcp --tcp-flags RST RST -m mark --mark 0 -j ACCEPT + + ip netns exec $ns $t -A OUTPUT -p tcp -m mark --mark $m -j ACCEPT + ip netns exec $ns $t -A OUTPUT -p tcp -m mark --mark 0 -j DROP + done +} + +init() +{ + rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) + + ns1="ns1-$rndh" + ns2="ns2-$rndh" + + for netns in "$ns1" "$ns2";do + ip netns add $netns || exit $ksft_skip + ip -net $netns link set lo up + ip netns exec $netns sysctl -q net.mptcp.enabled=1 + ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0 + ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0 + done + + for i in `seq 1 4`; do + ip link add ns1eth$i netns "$ns1" type veth peer name ns2eth$i netns "$ns2" + ip -net "$ns1" addr add 10.0.$i.1/24 dev ns1eth$i + ip -net "$ns1" addr add dead:beef:$i::1/64 dev ns1eth$i nodad + ip -net "$ns1" link set ns1eth$i up + + ip -net "$ns2" addr add 10.0.$i.2/24 dev ns2eth$i + ip -net "$ns2" addr add dead:beef:$i::2/64 dev ns2eth$i nodad + ip -net "$ns2" link set ns2eth$i up + + # let $ns2 reach any $ns1 address from any interface + ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i + + ip netns exec $ns1 ./pm_nl_ctl add 10.0.$i.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add dead:beef:$i::1 flags signal + + ip netns exec $ns2 ./pm_nl_ctl add 10.0.$i.2 flags signal + ip netns exec $ns2 ./pm_nl_ctl add dead:beef:$i::2 flags signal + done + + ip netns exec $ns1 ./pm_nl_ctl limits 8 8 + ip netns exec $ns2 ./pm_nl_ctl limits 8 8 + + add_mark_rules $ns1 1 + add_mark_rules $ns2 2 +} + +cleanup() +{ + for netns in "$ns1" "$ns2"; do + ip netns del $netns + done + rm -f "$cin" "$cout" + rm -f "$sin" "$sout" +} + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +iptables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run all tests without iptables tool" + exit $ksft_skip +fi + +ip6tables -V > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run all tests without ip6tables tool" + exit $ksft_skip +fi + +check_mark() +{ + local ns=$1 + local af=$2 + + tables=iptables + + if [ $af -eq 6 ];then + tables=ip6tables + fi + + counters=$(ip netns exec $ns $tables -v -L OUTPUT | grep DROP) + values=${counters%DROP*} + + for v in $values; do + if [ $v -ne 0 ]; then + echo "FAIL: got $tables $values in ns $ns , not 0 - not all expected packets marked" 1>&2 + return 1 + fi + done + + return 0 +} + +print_file_err() +{ + ls -l "$1" 1>&2 + echo "Trailing bytes are: " + tail -c 27 "$1" +} + +check_transfer() +{ + in=$1 + out=$2 + what=$3 + + cmp "$in" "$out" > /dev/null 2>&1 + if [ $? -ne 0 ] ;then + echo "[ FAIL ] $what does not match (in, out):" + print_file_err "$in" + print_file_err "$out" + ret=1 + + return 1 + fi + + return 0 +} + +# $1: IP address +is_v6() +{ + [ -z "${1##*:*}" ] +} + +do_transfer() +{ + listener_ns="$1" + connector_ns="$2" + cl_proto="$3" + srv_proto="$4" + connect_addr="$5" + + port=12001 + + :> "$cout" + :> "$sout" + + mptcp_connect="./mptcp_connect -r 20" + + local local_addr + if is_v6 "${connect_addr}"; then + local_addr="::" + else + local_addr="0.0.0.0" + fi + + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + $mptcp_connect -t ${timeout_poll} -l -M 1 -p $port -s ${srv_proto} \ + ${local_addr} < "$sin" > "$sout" & + spid=$! + + sleep 1 + + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + $mptcp_connect -t ${timeout_poll} -M 2 -p $port -s ${cl_proto} \ + $connect_addr < "$cin" > "$cout" & + + cpid=$! + + wait $cpid + retc=$? + wait $spid + rets=$? + + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " client exit code $retc, server $rets" 1>&2 + echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2 + ip netns exec ${listener_ns} ss -Menita 1>&2 -o "sport = :$port" + + echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2 + ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port" + + ret=1 + return 1 + fi + + if [ $local_addr = "::" ];then + check_mark $listener_ns 6 + check_mark $connector_ns 6 + else + check_mark $listener_ns 4 + check_mark $connector_ns 4 + fi + + check_transfer $cin $sout "file received by server" + + rets=$? + + if [ $retc -eq 0 ] && [ $rets -eq 0 ];then + return 0 + fi + + return 1 +} + +make_file() +{ + name=$1 + who=$2 + size=$3 + + dd if=/dev/urandom of="$name" bs=1024 count=$size 2> /dev/null + echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name" + + echo "Created $name (size $size KB) containing data sent by $who" +} + +run_tests() +{ + listener_ns="$1" + connector_ns="$2" + connect_addr="$3" + lret=0 + + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} + + lret=$? + + if [ $lret -ne 0 ]; then + ret=$lret + return + fi +} + +sin=$(mktemp) +sout=$(mktemp) +cin=$(mktemp) +cout=$(mktemp) +init +make_file "$cin" "client" 1 +make_file "$sin" "server" 1 +trap cleanup EXIT + +run_tests $ns1 $ns2 10.0.1.1 +run_tests $ns1 $ns2 dead:beef:1::1 + + +if [ $ret -eq 0 ];then + echo "PASS: all packets had packet mark set" +fi + +exit $ret -- cgit v1.2.3 From b9c36fdedd837021f2e72a36e9a12cd4971bcb35 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 1 Apr 2021 22:25:14 +0800 Subject: KVM: selftests: remove redundant semi-colon Signed-off-by: Yang Yingliang Message-Id: <20210401142514.1688199-1-yangyingliang@huawei.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 804ff5ff022d..1f4a0599683c 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -186,7 +186,7 @@ int main(int argc, char *argv[]) vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st); } - struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);; + struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR); rs->state = 0x5a; for (;;) { -- cgit v1.2.3 From 8826218215de1aae9d89a6ea8d3786f224711334 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Apr 2021 11:20:33 +0300 Subject: selftests: fib_tests: Add test cases for interaction with mangling Test that packets are correctly routed when netfilter mangling rules are present. Without previous patch: # ./fib_tests.sh -t ipv4_mangle IPv4 mangling tests TEST: Connection with correct parameters [ OK ] TEST: Connection with incorrect parameters [ OK ] TEST: Connection with correct parameters - mangling [FAIL] TEST: Connection with correct parameters - no mangling [ OK ] TEST: Connection check - server side [FAIL] Tests passed: 3 Tests failed: 2 # ./fib_tests.sh -t ipv6_mangle IPv6 mangling tests TEST: Connection with correct parameters [ OK ] TEST: Connection with incorrect parameters [ OK ] TEST: Connection with correct parameters - mangling [FAIL] TEST: Connection with correct parameters - no mangling [ OK ] TEST: Connection check - server side [FAIL] Tests passed: 3 Tests failed: 2 With previous patch: # ./fib_tests.sh -t ipv4_mangle IPv4 mangling tests TEST: Connection with correct parameters [ OK ] TEST: Connection with incorrect parameters [ OK ] TEST: Connection with correct parameters - mangling [ OK ] TEST: Connection with correct parameters - no mangling [ OK ] TEST: Connection check - server side [ OK ] Tests passed: 5 Tests failed: 0 # ./fib_tests.sh -t ipv6_mangle IPv6 mangling tests TEST: Connection with correct parameters [ OK ] TEST: Connection with incorrect parameters [ OK ] TEST: Connection with correct parameters - mangling [ OK ] TEST: Connection with correct parameters - no mangling [ OK ] TEST: Connection check - server side [ OK ] Tests passed: 5 Tests failed: 0 Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/net/fib_tests.sh | 152 ++++++++++++++++++++++++++++++- 1 file changed, 151 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 2b5707738609..76d9487fb03c 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -9,7 +9,7 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr" +TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr ipv4_mangle ipv6_mangle" VERBOSE=0 PAUSE_ON_FAIL=no @@ -1653,6 +1653,154 @@ ipv4_route_v6_gw_test() route_cleanup } +socat_check() +{ + if [ ! -x "$(command -v socat)" ]; then + echo "socat command not found. Skipping test" + return 1 + fi + + return 0 +} + +iptables_check() +{ + iptables -t mangle -L OUTPUT &> /dev/null + if [ $? -ne 0 ]; then + echo "iptables configuration not supported. Skipping test" + return 1 + fi + + return 0 +} + +ip6tables_check() +{ + ip6tables -t mangle -L OUTPUT &> /dev/null + if [ $? -ne 0 ]; then + echo "ip6tables configuration not supported. Skipping test" + return 1 + fi + + return 0 +} + +ipv4_mangle_test() +{ + local rc + + echo + echo "IPv4 mangling tests" + + socat_check || return 1 + iptables_check || return 1 + + route_setup + sleep 2 + + local tmp_file=$(mktemp) + ip netns exec ns2 socat UDP4-LISTEN:54321,fork $tmp_file & + + # Add a FIB rule and a route that will direct our connection to the + # listening server. + $IP rule add pref 100 ipproto udp sport 12345 dport 54321 table 123 + $IP route add table 123 172.16.101.0/24 dev veth1 + + # Add an unreachable route to the main table that will block our + # connection in case the FIB rule is not hit. + $IP route add unreachable 172.16.101.2/32 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP4:172.16.101.2:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters" + + run_cmd "echo a | $NS_EXEC socat STDIN UDP4:172.16.101.2:54321,sourceport=11111" + log_test $? 1 " Connection with incorrect parameters" + + # Add a mangling rule and make sure connection is still successful. + $NS_EXEC iptables -t mangle -A OUTPUT -j MARK --set-mark 1 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP4:172.16.101.2:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters - mangling" + + # Delete the mangling rule and make sure connection is still + # successful. + $NS_EXEC iptables -t mangle -D OUTPUT -j MARK --set-mark 1 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP4:172.16.101.2:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters - no mangling" + + # Verify connections were indeed successful on server side. + [[ $(cat $tmp_file | wc -l) -eq 3 ]] + log_test $? 0 " Connection check - server side" + + $IP route del unreachable 172.16.101.2/32 + $IP route del table 123 172.16.101.0/24 dev veth1 + $IP rule del pref 100 + + { kill %% && wait %%; } 2>/dev/null + rm $tmp_file + + route_cleanup +} + +ipv6_mangle_test() +{ + local rc + + echo + echo "IPv6 mangling tests" + + socat_check || return 1 + ip6tables_check || return 1 + + route_setup + sleep 2 + + local tmp_file=$(mktemp) + ip netns exec ns2 socat UDP6-LISTEN:54321,fork $tmp_file & + + # Add a FIB rule and a route that will direct our connection to the + # listening server. + $IP -6 rule add pref 100 ipproto udp sport 12345 dport 54321 table 123 + $IP -6 route add table 123 2001:db8:101::/64 dev veth1 + + # Add an unreachable route to the main table that will block our + # connection in case the FIB rule is not hit. + $IP -6 route add unreachable 2001:db8:101::2/128 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP6:[2001:db8:101::2]:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters" + + run_cmd "echo a | $NS_EXEC socat STDIN UDP6:[2001:db8:101::2]:54321,sourceport=11111" + log_test $? 1 " Connection with incorrect parameters" + + # Add a mangling rule and make sure connection is still successful. + $NS_EXEC ip6tables -t mangle -A OUTPUT -j MARK --set-mark 1 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP6:[2001:db8:101::2]:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters - mangling" + + # Delete the mangling rule and make sure connection is still + # successful. + $NS_EXEC ip6tables -t mangle -D OUTPUT -j MARK --set-mark 1 + + run_cmd "echo a | $NS_EXEC socat STDIN UDP6:[2001:db8:101::2]:54321,sourceport=12345" + log_test $? 0 " Connection with correct parameters - no mangling" + + # Verify connections were indeed successful on server side. + [[ $(cat $tmp_file | wc -l) -eq 3 ]] + log_test $? 0 " Connection check - server side" + + $IP -6 route del unreachable 2001:db8:101::2/128 + $IP -6 route del table 123 2001:db8:101::/64 dev veth1 + $IP -6 rule del pref 100 + + { kill %% && wait %%; } 2>/dev/null + rm $tmp_file + + route_cleanup +} + ################################################################################ # usage @@ -1725,6 +1873,8 @@ do ipv6_route_metrics) ipv6_route_metrics_test;; ipv4_route_metrics) ipv4_route_metrics_test;; ipv4_route_v6_gw) ipv4_route_v6_gw_test;; + ipv4_mangle) ipv4_mangle_test;; + ipv6_mangle) ipv6_mangle_test;; help) echo "Test names: $TESTS"; exit 0;; esac -- cgit v1.2.3 From 99033461e685b48549ec77608b4bda75ddf772ce Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 24 Feb 2021 10:29:14 -0600 Subject: objtool: Support asm jump tables Objtool detection of asm jump tables would normally just work, except for the fact that asm retpolines use alternatives. Objtool thinks the alternative code path (a jump to the retpoline) is a sibling call. Don't treat alternative indirect branches as sibling calls when the original instruction has a jump table. Signed-off-by: Josh Poimboeuf Tested-by: Ard Biesheuvel Acked-by: Ard Biesheuvel Tested-by: Sami Tolvanen Acked-by: Peter Zijlstra (Intel) Acked-by: Herbert Xu Link: https://lore.kernel.org/r/460cf4dc675d64e1124146562cabd2c05aa322e8.1614182415.git.jpoimboe@redhat.com --- tools/objtool/check.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/objtool/check.c b/tools/objtool/check.c index a0f762a15ad5..46621e8a80c1 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -108,6 +108,18 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file, for (insn = next_insn_same_sec(file, insn); insn; \ insn = next_insn_same_sec(file, insn)) +static bool is_jump_table_jump(struct instruction *insn) +{ + struct alt_group *alt_group = insn->alt_group; + + if (insn->jump_table) + return true; + + /* Retpoline alternative for a jump table? */ + return alt_group && alt_group->orig_group && + alt_group->orig_group->first_insn->jump_table; +} + static bool is_sibling_call(struct instruction *insn) { /* @@ -120,7 +132,7 @@ static bool is_sibling_call(struct instruction *insn) /* An indirect jump is either a sibling call or a jump to a table. */ if (insn->type == INSN_JUMP_DYNAMIC) - return list_empty(&insn->alts); + return !is_jump_table_jump(insn); /* add_jump_destinations() sets insn->call_dest for sibling calls. */ return (is_static_jump(insn) && insn->call_dest); -- cgit v1.2.3 From bf5eb67dc80a75e0756269084b087c06f0360b78 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 16 Apr 2021 18:55:35 +0300 Subject: selftests: fib_nexthops: Test large scale nexthop flushing Test that all the nexthops are flushed when a multi-part nexthop dump is required for the flushing. Without previous patch: # ./fib_nexthops.sh TEST: Large scale nexthop flushing [FAIL] With previous patch: # ./fib_nexthops.sh TEST: Large scale nexthop flushing [ OK ] Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_nexthops.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 56dd0c6f2e96..49774a8a7736 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -1933,6 +1933,21 @@ basic() log_test $? 2 "Nexthop group and blackhole" $IP nexthop flush >/dev/null 2>&1 + + # Test to ensure that flushing with a multi-part nexthop dump works as + # expected. + local batch_file=$(mktemp) + + for i in $(seq 1 $((64 * 1024))); do + echo "nexthop add id $i blackhole" >> $batch_file + done + + $IP -b $batch_file + $IP nexthop flush >/dev/null 2>&1 + [[ $($IP nexthop | wc -l) -eq 0 ]] + log_test $? 0 "Large scale nexthop flushing" + + rm $batch_file } check_nexthop_buckets_balance() -- cgit v1.2.3 From 7b15523a989b63927c2bb08e9b5b0bbc10b58bef Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 19 Apr 2021 17:52:40 +0200 Subject: bpf: Add a bpf_snprintf helper The implementation takes inspiration from the existing bpf_trace_printk helper but there are a few differences: To allow for a large number of format-specifiers, parameters are provided in an array, like in bpf_seq_printf. Because the output string takes two arguments and the array of parameters also takes two arguments, the format string needs to fit in one argument. Thankfully, ARG_PTR_TO_CONST_STR is guaranteed to point to a zero-terminated read-only map so we don't need a format string length arg. Because the format-string is known at verification time, we also do a first pass of format string validation in the verifier logic. This makes debugging easier. Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210419155243.1632274-4-revest@chromium.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 28 +++++++++++++++++++++++ kernel/bpf/helpers.c | 50 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 41 ++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 28 +++++++++++++++++++++++ 6 files changed, 150 insertions(+) (limited to 'tools') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c160526fc8bf..f8a45f109e96 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1953,6 +1953,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_snprintf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index df164a44bb41..ec6d85a81744 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4708,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4875,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9ca57eb1fc0d..85b26ca5aacd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -925,6 +925,54 @@ out: return err; } +#define MAX_SNPRINTF_VARARGS 12 + +BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, + const void *, data, u32, data_len) +{ + enum bpf_printf_mod_type mod[MAX_SNPRINTF_VARARGS]; + u64 args[MAX_SNPRINTF_VARARGS]; + int err, num_args; + + if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; + num_args = data_len / 8; + + /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we + * can safely give an unbounded size. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, data, args, mod, num_args); + if (err < 0) + return err; + + /* Maximumly we can have MAX_SNPRINTF_VARARGS parameters, just give + * all of them to snprintf(). + */ + err = snprintf(str, str_size, fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), + BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), + BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), + BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), + BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), + BPF_CAST_FMT_ARG(11, args, mod)); + + bpf_printf_cleanup(); + + return err + 1; +} + +const struct bpf_func_proto bpf_snprintf_proto = { + .func = bpf_snprintf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_CONST_STR, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -1013,6 +1061,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f46dd6f3383..994ef36c5f60 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5918,6 +5918,41 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? -EINVAL : 0; } +static int check_bpf_snprintf_call(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) +{ + struct bpf_reg_state *fmt_reg = ®s[BPF_REG_3]; + struct bpf_reg_state *data_len_reg = ®s[BPF_REG_5]; + struct bpf_map *fmt_map = fmt_reg->map_ptr; + int err, fmt_map_off, num_args; + u64 fmt_addr; + char *fmt; + + /* data must be an array of u64 */ + if (data_len_reg->var_off.value % 8) + return -EINVAL; + num_args = data_len_reg->var_off.value / 8; + + /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const + * and map_direct_value_addr is set. + */ + fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; + err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, + fmt_map_off); + if (err) + return err; + fmt = (char *)(long)fmt_addr + fmt_map_off; + + /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we + * can focus on validating the format specifiers. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, NULL, NULL, NULL, num_args); + if (err < 0) + verbose(env, "Invalid format string\n"); + + return err; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -6032,6 +6067,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (func_id == BPF_FUNC_snprintf) { + err = check_bpf_snprintf_call(env, regs); + if (err < 0) + return err; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a13f8644b357..2a8bcdc927c7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1076,6 +1076,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_delete_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index df164a44bb41..ec6d85a81744 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4708,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4875,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 83cd92b46484aa8f64cdc0bff8ac6940d1f78519 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 19 Apr 2021 17:52:41 +0200 Subject: libbpf: Initialize the bpf_seq_printf parameters array field by field When initializing the __param array with a one liner, if all args are const, the initial array value will be placed in the rodata section but because libbpf does not support relocation in the rodata section, any pointer in this array will stay NULL. Fixes: c09add2fbc5a ("tools/libbpf: Add bpf_iter support") Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210419155243.1632274-5-revest@chromium.org --- tools/lib/bpf/bpf_tracing.h | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index f9ef37707888..1c2e91ee041d 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -413,20 +413,38 @@ typeof(name(0)) name(struct pt_regs *ctx) \ } \ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) +#define ___bpf_fill0(arr, p, x) do {} while (0) +#define ___bpf_fill1(arr, p, x) arr[p] = x +#define ___bpf_fill2(arr, p, x, args...) arr[p] = x; ___bpf_fill1(arr, p + 1, args) +#define ___bpf_fill3(arr, p, x, args...) arr[p] = x; ___bpf_fill2(arr, p + 1, args) +#define ___bpf_fill4(arr, p, x, args...) arr[p] = x; ___bpf_fill3(arr, p + 1, args) +#define ___bpf_fill5(arr, p, x, args...) arr[p] = x; ___bpf_fill4(arr, p + 1, args) +#define ___bpf_fill6(arr, p, x, args...) arr[p] = x; ___bpf_fill5(arr, p + 1, args) +#define ___bpf_fill7(arr, p, x, args...) arr[p] = x; ___bpf_fill6(arr, p + 1, args) +#define ___bpf_fill8(arr, p, x, args...) arr[p] = x; ___bpf_fill7(arr, p + 1, args) +#define ___bpf_fill9(arr, p, x, args...) arr[p] = x; ___bpf_fill8(arr, p + 1, args) +#define ___bpf_fill10(arr, p, x, args...) arr[p] = x; ___bpf_fill9(arr, p + 1, args) +#define ___bpf_fill11(arr, p, x, args...) arr[p] = x; ___bpf_fill10(arr, p + 1, args) +#define ___bpf_fill12(arr, p, x, args...) arr[p] = x; ___bpf_fill11(arr, p + 1, args) +#define ___bpf_fill(arr, args...) \ + ___bpf_apply(___bpf_fill, ___bpf_narg(args))(arr, 0, args) + /* * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values * in a structure. */ -#define BPF_SEQ_PRINTF(seq, fmt, args...) \ - ({ \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - static const char ___fmt[] = fmt; \ - unsigned long long ___param[] = { args }; \ - _Pragma("GCC diagnostic pop") \ - int ___ret = bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ - ___param, sizeof(___param)); \ - ___ret; \ - }) +#define BPF_SEQ_PRINTF(seq, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ + ___param, sizeof(___param)); \ +}) #endif -- cgit v1.2.3 From 58c2b1f5e0121efd698b6ec8e45e47e58ca9caee Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 19 Apr 2021 17:52:42 +0200 Subject: libbpf: Introduce a BPF_SNPRINTF helper macro Similarly to BPF_SEQ_PRINTF, this macro turns variadic arguments into an array of u64, making it more natural to call the bpf_snprintf helper. Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210419155243.1632274-6-revest@chromium.org --- tools/lib/bpf/bpf_tracing.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 1c2e91ee041d..8c954ebc0c7c 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -447,4 +447,22 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) ___param, sizeof(___param)); \ }) +/* + * BPF_SNPRINTF wraps the bpf_snprintf helper with variadic arguments instead of + * an array of u64. + */ +#define BPF_SNPRINTF(out, out_size, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_snprintf(out, out_size, ___fmt, \ + ___param, sizeof(___param)); \ +}) + #endif -- cgit v1.2.3 From c2e39c6bdc7eb48459ec1d34d4f27eb82299f4b7 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 19 Apr 2021 17:52:43 +0200 Subject: selftests/bpf: Add a series of tests for bpf_snprintf The "positive" part tests all format specifiers when things go well. The "negative" part makes sure that incorrect format strings fail at load time. Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210419155243.1632274-7-revest@chromium.org --- tools/testing/selftests/bpf/prog_tests/snprintf.c | 125 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/test_snprintf.c | 73 ++++++++++++ .../selftests/bpf/progs/test_snprintf_single.c | 20 ++++ 3 files changed, 218 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/snprintf.c create mode 100644 tools/testing/selftests/bpf/progs/test_snprintf.c create mode 100644 tools/testing/selftests/bpf/progs/test_snprintf_single.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf.c b/tools/testing/selftests/bpf/prog_tests/snprintf.c new file mode 100644 index 000000000000..a958c22aec75 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/snprintf.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. */ + +#include +#include "test_snprintf.skel.h" +#include "test_snprintf_single.skel.h" + +#define EXP_NUM_OUT "-8 9 96 -424242 1337 DABBAD00" +#define EXP_NUM_RET sizeof(EXP_NUM_OUT) + +#define EXP_IP_OUT "127.000.000.001 0000:0000:0000:0000:0000:0000:0000:0001" +#define EXP_IP_RET sizeof(EXP_IP_OUT) + +/* The third specifier, %pB, depends on compiler inlining so don't check it */ +#define EXP_SYM_OUT "schedule schedule+0x0/" +#define MIN_SYM_RET sizeof(EXP_SYM_OUT) + +/* The third specifier, %p, is a hashed pointer which changes on every reboot */ +#define EXP_ADDR_OUT "0000000000000000 ffff00000add4e55 " +#define EXP_ADDR_RET sizeof(EXP_ADDR_OUT "unknownhashedptr") + +#define EXP_STR_OUT "str1 longstr" +#define EXP_STR_RET sizeof(EXP_STR_OUT) + +#define EXP_OVER_OUT "%over" +#define EXP_OVER_RET 10 + +#define EXP_PAD_OUT " 4 000" +#define EXP_PAD_RET 900007 + +#define EXP_NO_ARG_OUT "simple case" +#define EXP_NO_ARG_RET 12 + +#define EXP_NO_BUF_RET 29 + +void test_snprintf_positive(void) +{ + char exp_addr_out[] = EXP_ADDR_OUT; + char exp_sym_out[] = EXP_SYM_OUT; + struct test_snprintf *skel; + + skel = test_snprintf__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + if (!ASSERT_OK(test_snprintf__attach(skel), "skel_attach")) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + ASSERT_STREQ(skel->bss->num_out, EXP_NUM_OUT, "num_out"); + ASSERT_EQ(skel->bss->num_ret, EXP_NUM_RET, "num_ret"); + + ASSERT_STREQ(skel->bss->ip_out, EXP_IP_OUT, "ip_out"); + ASSERT_EQ(skel->bss->ip_ret, EXP_IP_RET, "ip_ret"); + + ASSERT_OK(memcmp(skel->bss->sym_out, exp_sym_out, + sizeof(exp_sym_out) - 1), "sym_out"); + ASSERT_LT(MIN_SYM_RET, skel->bss->sym_ret, "sym_ret"); + + ASSERT_OK(memcmp(skel->bss->addr_out, exp_addr_out, + sizeof(exp_addr_out) - 1), "addr_out"); + ASSERT_EQ(skel->bss->addr_ret, EXP_ADDR_RET, "addr_ret"); + + ASSERT_STREQ(skel->bss->str_out, EXP_STR_OUT, "str_out"); + ASSERT_EQ(skel->bss->str_ret, EXP_STR_RET, "str_ret"); + + ASSERT_STREQ(skel->bss->over_out, EXP_OVER_OUT, "over_out"); + ASSERT_EQ(skel->bss->over_ret, EXP_OVER_RET, "over_ret"); + + ASSERT_STREQ(skel->bss->pad_out, EXP_PAD_OUT, "pad_out"); + ASSERT_EQ(skel->bss->pad_ret, EXP_PAD_RET, "pad_ret"); + + ASSERT_STREQ(skel->bss->noarg_out, EXP_NO_ARG_OUT, "no_arg_out"); + ASSERT_EQ(skel->bss->noarg_ret, EXP_NO_ARG_RET, "no_arg_ret"); + + ASSERT_EQ(skel->bss->nobuf_ret, EXP_NO_BUF_RET, "no_buf_ret"); + +cleanup: + test_snprintf__destroy(skel); +} + +#define min(a, b) ((a) < (b) ? (a) : (b)) + +/* Loads an eBPF object calling bpf_snprintf with up to 10 characters of fmt */ +static int load_single_snprintf(char *fmt) +{ + struct test_snprintf_single *skel; + int ret; + + skel = test_snprintf_single__open(); + if (!skel) + return -EINVAL; + + memcpy(skel->rodata->fmt, fmt, min(strlen(fmt) + 1, 10)); + + ret = test_snprintf_single__load(skel); + test_snprintf_single__destroy(skel); + + return ret; +} + +void test_snprintf_negative(void) +{ + ASSERT_OK(load_single_snprintf("valid %d"), "valid usage"); + + ASSERT_ERR(load_single_snprintf("0123456789"), "no terminating zero"); + ASSERT_ERR(load_single_snprintf("%d %d"), "too many specifiers"); + ASSERT_ERR(load_single_snprintf("%pi5"), "invalid specifier 1"); + ASSERT_ERR(load_single_snprintf("%a"), "invalid specifier 2"); + ASSERT_ERR(load_single_snprintf("%"), "invalid specifier 3"); + ASSERT_ERR(load_single_snprintf("%12345678"), "invalid specifier 4"); + ASSERT_ERR(load_single_snprintf("%--------"), "invalid specifier 5"); + ASSERT_ERR(load_single_snprintf("\x80"), "non ascii character"); + ASSERT_ERR(load_single_snprintf("\x1"), "non printable character"); +} + +void test_snprintf(void) +{ + if (test__start_subtest("snprintf_positive")) + test_snprintf_positive(); + if (test__start_subtest("snprintf_negative")) + test_snprintf_negative(); +} diff --git a/tools/testing/selftests/bpf/progs/test_snprintf.c b/tools/testing/selftests/bpf/progs/test_snprintf.c new file mode 100644 index 000000000000..951a0301c553 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_snprintf.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. */ + +#include +#include +#include + +char num_out[64] = {}; +long num_ret = 0; + +char ip_out[64] = {}; +long ip_ret = 0; + +char sym_out[64] = {}; +long sym_ret = 0; + +char addr_out[64] = {}; +long addr_ret = 0; + +char str_out[64] = {}; +long str_ret = 0; + +char over_out[6] = {}; +long over_ret = 0; + +char pad_out[10] = {}; +long pad_ret = 0; + +char noarg_out[64] = {}; +long noarg_ret = 0; + +long nobuf_ret = 0; + +extern const void schedule __ksym; + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + /* Convenient values to pretty-print */ + const __u8 ex_ipv4[] = {127, 0, 0, 1}; + const __u8 ex_ipv6[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + static const char str1[] = "str1"; + static const char longstr[] = "longstr"; + + /* Integer types */ + num_ret = BPF_SNPRINTF(num_out, sizeof(num_out), + "%d %u %x %li %llu %lX", + -8, 9, 150, -424242, 1337, 0xDABBAD00); + /* IP addresses */ + ip_ret = BPF_SNPRINTF(ip_out, sizeof(ip_out), "%pi4 %pI6", + &ex_ipv4, &ex_ipv6); + /* Symbol lookup formatting */ + sym_ret = BPF_SNPRINTF(sym_out, sizeof(sym_out), "%ps %pS %pB", + &schedule, &schedule, &schedule); + /* Kernel pointers */ + addr_ret = BPF_SNPRINTF(addr_out, sizeof(addr_out), "%pK %px %p", + 0, 0xFFFF00000ADD4E55, 0xFFFF00000ADD4E55); + /* Strings embedding */ + str_ret = BPF_SNPRINTF(str_out, sizeof(str_out), "%s %+05s", + str1, longstr); + /* Overflow */ + over_ret = BPF_SNPRINTF(over_out, sizeof(over_out), "%%overflow"); + /* Padding of fixed width numbers */ + pad_ret = BPF_SNPRINTF(pad_out, sizeof(pad_out), "%5d %0900000X", 4, 4); + /* No args */ + noarg_ret = BPF_SNPRINTF(noarg_out, sizeof(noarg_out), "simple case"); + /* No buffer */ + nobuf_ret = BPF_SNPRINTF(NULL, 0, "only interested in length %d", 60); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_snprintf_single.c b/tools/testing/selftests/bpf/progs/test_snprintf_single.c new file mode 100644 index 000000000000..402adaf344f9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_snprintf_single.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. */ + +#include +#include + +/* The format string is filled from the userspace such that loading fails */ +static const char fmt[10]; + +SEC("raw_tp/sys_enter") +int handler(const void *ctx) +{ + unsigned long long arg = 42; + + bpf_snprintf(NULL, 0, fmt, &arg, sizeof(arg)); + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From bdc4e369454fcae108e18feb0fcbb6f06815f94b Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 16 Apr 2021 13:47:03 -0700 Subject: bpf/selftests: Add bpf_get_task_stack retval bounds verifier test Add a bpf_iter test which feeds bpf_get_task_stack's return value into seq_write after confirming it's positive. No attempt to bound the value from above is made. Load will fail if verifier does not refine retval range based on buf sz input to bpf_get_task_stack. Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210416204704.2816874-3-davemarchevsky@fb.com --- .../testing/selftests/bpf/verifier/bpf_get_stack.c | 43 ++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c index 69b048cf46d9..3e024c891178 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c +++ b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c @@ -42,3 +42,46 @@ .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, +{ + "bpf_get_task_stack return R0 range is refined", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_6, 0), // ctx->meta->seq + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1, 8), // ctx->task + BPF_LD_MAP_FD(BPF_REG_1, 0), // fixup_map_array_48b + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), // keep buf for seq_write + BPF_MOV64_IMM(BPF_REG_3, 48), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_EMIT_CALL(BPF_FUNC_get_task_stack), + BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_9), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_seq_write), + + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACING, + .expected_attach_type = BPF_TRACE_ITER, + .kfunc = "task", + .runs = -1, // Don't run, just load + .fixup_map_array_48b = { 3 }, +}, -- cgit v1.2.3 From c77cec5c207b68a3cbc2af2f81070ec428f41145 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 16 Apr 2021 13:47:04 -0700 Subject: bpf/selftests: Add bpf_get_task_stack retval bounds test_prog Add a libbpf test prog which feeds bpf_get_task_stack's return value into seq_write after confirming it's positive. No attempt to bound the value from above is made. Load will fail if verifier does not refine retval range based on buf sz input to bpf_get_task_stack. Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210416204704.2816874-4-davemarchevsky@fb.com --- tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 1 + .../selftests/bpf/progs/bpf_iter_task_stack.c | 27 ++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 74c45d557a2b..2d3590cfb5e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -147,6 +147,7 @@ static void test_task_stack(void) return; do_dummy_read(skel->progs.dump_task_stack); + do_dummy_read(skel->progs.get_task_user_stacks); bpf_iter_task_stack__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c index 50e59a2e142e..43c36f5f7649 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c @@ -35,3 +35,30 @@ int dump_task_stack(struct bpf_iter__task *ctx) return 0; } + +SEC("iter/task") +int get_task_user_stacks(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + struct task_struct *task = ctx->task; + uint64_t buf_sz = 0; + int64_t res; + + if (task == (void *)0) + return 0; + + res = bpf_get_task_stack(task, entries, + MAX_STACK_TRACE_DEPTH * SIZE_OF_ULONG, BPF_F_USER_STACK); + if (res <= 0) + return 0; + + buf_sz += res; + + /* If the verifier doesn't refine bpf_get_task_stack res, and instead + * assumes res is entirely unknown, this program will fail to load as + * the verifier will believe that max buf_sz value allows reading + * past the end of entries in bpf_seq_write call + */ + bpf_seq_write(seq, &entries, buf_sz); + return 0; +} -- cgit v1.2.3 From fa76c775be27bdc49cc2d39f8ebdb926a9d53294 Mon Sep 17 00:00:00 2001 From: Yanan Wang Date: Tue, 30 Mar 2021 16:08:47 +0800 Subject: tools/headers: sync headers of asm-generic/hugetlb_encode.h This patch syncs contents of tools/include/asm-generic/hugetlb_encode.h and include/uapi/asm-generic/hugetlb_encode.h. Arch powerpc supports 16KB hugepages and ARM64 supports 32MB/512MB hugepages. The corresponding mmap flags have already been added in include/uapi/asm-generic/hugetlb_encode.h, but not tools/include/asm-generic/hugetlb_encode.h. Cc: Ingo Molnar Cc: Adrian Hunter Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Signed-off-by: Yanan Wang Reviewed-by: Ben Gardon Message-Id: <20210330080856.14940-2-wangyanan55@huawei.com> Signed-off-by: Paolo Bonzini --- tools/include/asm-generic/hugetlb_encode.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/include/asm-generic/hugetlb_encode.h b/tools/include/asm-generic/hugetlb_encode.h index e4732d3c2998..4f3d5aaa11f5 100644 --- a/tools/include/asm-generic/hugetlb_encode.h +++ b/tools/include/asm-generic/hugetlb_encode.h @@ -20,13 +20,16 @@ #define HUGETLB_FLAG_ENCODE_SHIFT 26 #define HUGETLB_FLAG_ENCODE_MASK 0x3f +#define HUGETLB_FLAG_ENCODE_16KB (14 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) -- cgit v1.2.3 From c412d6ac28ac55505c5b079e259caddd9f55d293 Mon Sep 17 00:00:00 2001 From: Yanan Wang Date: Tue, 30 Mar 2021 16:08:50 +0800 Subject: KVM: selftests: Print the errno besides error-string in TEST_ASSERT Print the errno besides error-string in TEST_ASSERT in the format of "errno=%d - %s" will explicitly indicate that the string is an error information. Besides, the errno is easier to be used for debugging than the error-string. Suggested-by: Andrew Jones Signed-off-by: Yanan Wang Reviewed-by: Andrew Jones Message-Id: <20210330080856.14940-5-wangyanan55@huawei.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/assert.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index 5ebbd0d6b472..71ade6100fd3 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -71,9 +71,9 @@ test_assert(bool exp, const char *exp_str, fprintf(stderr, "==== Test Assertion Failure ====\n" " %s:%u: %s\n" - " pid=%d tid=%d - %s\n", + " pid=%d tid=%d errno=%d - %s\n", file, line, exp_str, getpid(), _gettid(), - strerror(errno)); + errno, strerror(errno)); test_dump_stack(); if (fmt) { fputs(" ", stderr); -- cgit v1.2.3 From 6436430e29fb250c140c94a8fcc218a832dcaf74 Mon Sep 17 00:00:00 2001 From: Yanan Wang Date: Tue, 30 Mar 2021 16:08:51 +0800 Subject: KVM: selftests: Make a generic helper to get vm guest mode strings For generality and conciseness, make an API which can be used in all kvm libs and selftests to get vm guest mode strings. And the index i is checked in the API in case of possiable faults. Suggested-by: Sean Christopherson Signed-off-by: Yanan Wang Reviewed-by: Ben Gardon Reviewed-by: Andrew Jones Message-Id: <20210330080856.14940-6-wangyanan55@huawei.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util.h | 4 +--- tools/testing/selftests/kvm/lib/kvm_util.c | 29 ++++++++++++++++---------- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 2d7eb6989e83..f52a7492f47f 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -68,9 +68,6 @@ enum vm_guest_mode { #define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) #define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) -#define vm_guest_mode_string(m) vm_guest_mode_string[m] -extern const char * const vm_guest_mode_string[]; - struct vm_guest_mode_params { unsigned int pa_bits; unsigned int va_bits; @@ -84,6 +81,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, struct kvm_enable_cap *cap); void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); +const char *vm_guest_mode_string(uint32_t i); struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); void kvm_vm_free(struct kvm_vm *vmp); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index e5fbf16f725b..2ea837fe03af 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -143,17 +143,24 @@ static void vm_open(struct kvm_vm *vm, int perm) "rc: %i errno: %i", vm->fd, errno); } -const char * const vm_guest_mode_string[] = { - "PA-bits:52, VA-bits:48, 4K pages", - "PA-bits:52, VA-bits:48, 64K pages", - "PA-bits:48, VA-bits:48, 4K pages", - "PA-bits:48, VA-bits:48, 64K pages", - "PA-bits:40, VA-bits:48, 4K pages", - "PA-bits:40, VA-bits:48, 64K pages", - "PA-bits:ANY, VA-bits:48, 4K pages", -}; -_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES, - "Missing new mode strings?"); +const char *vm_guest_mode_string(uint32_t i) +{ + static const char * const strings[] = { + [VM_MODE_P52V48_4K] = "PA-bits:52, VA-bits:48, 4K pages", + [VM_MODE_P52V48_64K] = "PA-bits:52, VA-bits:48, 64K pages", + [VM_MODE_P48V48_4K] = "PA-bits:48, VA-bits:48, 4K pages", + [VM_MODE_P48V48_64K] = "PA-bits:48, VA-bits:48, 64K pages", + [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", + [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", + [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", + }; + _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, + "Missing new mode strings?"); + + TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i); + + return strings[i]; +} const struct vm_guest_mode_params vm_guest_mode_params[] = { { 52, 48, 0x1000, 12 }, -- cgit v1.2.3 From 3b70c4d128a7781f507fa072d60ce5df3fdfcaa6 Mon Sep 17 00:00:00 2001 From: Yanan Wang Date: Tue, 30 Mar 2021 16:08:52 +0800 Subject: KVM: selftests: Add a helper to get system configured THP page size If we want to have some tests about transparent hugepages, the system configured THP hugepage size should better be known by the tests, which can be used for kinds of alignment or guest memory accessing of vcpus... So it makes sense to add a helper to get the transparent hugepage size. With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(), we now stat /sys/kernel/mm/transparent_hugepage to check whether THP is configured in the host kernel before madvise(). Based on this, we can also read file /sys/kernel/mm/transparent_hugepage/hpage_pmd_size to get THP hugepage size. Signed-off-by: Yanan Wang Reviewed-by: Ben Gardon Reviewed-by: Andrew Jones Message-Id: <20210330080856.14940-7-wangyanan55@huawei.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/test_util.h | 2 ++ tools/testing/selftests/kvm/lib/test_util.c | 29 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index b7f41399f22c..ef24c76ba89a 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -78,6 +78,8 @@ struct vm_mem_backing_src_alias { enum vm_mem_backing_src_type type; }; +bool thp_configured(void); +size_t get_trans_hugepagesz(void); void backing_src_help(void); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 906c955384e2..9830b2bcd068 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "linux/kernel.h" #include "test_util.h" @@ -117,6 +118,34 @@ const struct vm_mem_backing_src_alias backing_src_aliases[] = { {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,}, }; +bool thp_configured(void) +{ + int ret; + struct stat statbuf; + + ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf); + TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT), + "Error in stating /sys/kernel/mm/transparent_hugepage"); + + return ret == 0; +} + +size_t get_trans_hugepagesz(void) +{ + size_t size; + FILE *f; + + TEST_ASSERT(thp_configured(), "THP is not configured in host kernel"); + + f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r"); + TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size"); + + fscanf(f, "%ld", &size); + fclose(f); + + return size; +} + void backing_src_help(void) { int i; -- cgit v1.2.3 From 5579fa682a3411e5ed58bde35af5128f1044a4b9 Mon Sep 17 00:00:00 2001 From: Yanan Wang Date: Tue, 30 Mar 2021 16:08:53 +0800 Subject: KVM: selftests: Add a helper to get system default hugetlb page size If HUGETLB is configured in the host kernel, then we can know the system default hugetlb page size through *cat /proc/meminfo*. Otherwise, we will not see the information of hugetlb pages in file /proc/meminfo if it's not configured. So add a helper to determine whether HUGETLB is configured and then get the default page size by reading /proc/meminfo. This helper can be useful when a program wants to use the default hugetlb pages of the system and doesn't know the default page size. Signed-off-by: Yanan Wang Reviewed-by: Andrew Jones Message-Id: <20210330080856.14940-8-wangyanan55@huawei.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/test_util.h | 1 + tools/testing/selftests/kvm/lib/test_util.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index ef24c76ba89a..e087174eefe5 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -80,6 +80,7 @@ struct vm_mem_backing_src_alias { bool thp_configured(void); size_t get_trans_hugepagesz(void); +size_t get_def_hugetlb_pagesz(void); void backing_src_help(void); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 9830b2bcd068..bd718c9fb92e 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -146,6 +146,31 @@ size_t get_trans_hugepagesz(void) return size; } +size_t get_def_hugetlb_pagesz(void) +{ + char buf[64]; + const char *tag = "Hugepagesize:"; + FILE *f; + + f = fopen("/proc/meminfo", "r"); + TEST_ASSERT(f != NULL, "Error in opening /proc/meminfo"); + + while (fgets(buf, sizeof(buf), f) != NULL) { + if (strstr(buf, tag) == buf) { + fclose(f); + return strtoull(buf + strlen(tag), NULL, 10) << 10; + } + } + + if (feof(f)) + TEST_FAIL("HUGETLB is not configured in host kernel"); + else + TEST_FAIL("Error in reading /proc/meminfo"); + + fclose(f); + return 0; +} + void backing_src_help(void) { int i; -- cgit v1.2.3 From 623653b7d4622cb7d016e9a81e6a9d0d9b1326df Mon Sep 17 00:00:00 2001 From: Yanan Wang Date: Tue, 30 Mar 2021 16:08:54 +0800 Subject: KVM: selftests: List all hugetlb src types specified with page sizes With VM_MEM_SRC_ANONYMOUS_HUGETLB, we currently can only use system default hugetlb pages to back the testing guest memory. In order to add flexibility, now list all the known hugetlb backing src types with different page sizes, so that we can specify use of hugetlb pages of the exact granularity that we want. And as all the known hugetlb page sizes are listed, it's appropriate for all architectures. Besides, the helper get_backing_src_pagesz() is added to get the granularity of different backing src types(anonumous, thp, hugetlb). Suggested-by: Ben Gardon Signed-off-by: Yanan Wang Reviewed-by: Andrew Jones Message-Id: <20210330080856.14940-9-wangyanan55@huawei.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/test_util.h | 18 +++- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- tools/testing/selftests/kvm/lib/test_util.c | 111 +++++++++++++++++++++--- 3 files changed, 118 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index e087174eefe5..fade3130eb01 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -71,16 +71,32 @@ enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS, VM_MEM_SRC_ANONYMOUS_THP, VM_MEM_SRC_ANONYMOUS_HUGETLB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB, + NUM_SRC_TYPES, }; struct vm_mem_backing_src_alias { const char *name; - enum vm_mem_backing_src_type type; + uint32_t flag; }; bool thp_configured(void); size_t get_trans_hugepagesz(void); size_t get_def_hugetlb_pagesz(void); +const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i); +size_t get_backing_src_pagesz(uint32_t i); void backing_src_help(void); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 2ea837fe03af..3506174c2053 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -759,7 +759,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->mmap_start = mmap(NULL, region->mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS - | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0), + | vm_mem_backing_src_alias(src_type)->flag, -1, 0); TEST_ASSERT(region->mmap_start != MAP_FAILED, "test_malloc failed, mmap_start: %p errno: %i", diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index bd718c9fb92e..63d2bc7d757b 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "linux/kernel.h" #include "test_util.h" @@ -112,12 +113,6 @@ void print_skip(const char *fmt, ...) puts(", skipping test"); } -const struct vm_mem_backing_src_alias backing_src_aliases[] = { - {"anonymous", VM_MEM_SRC_ANONYMOUS,}, - {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,}, - {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,}, -}; - bool thp_configured(void) { int ret; @@ -171,22 +166,116 @@ size_t get_def_hugetlb_pagesz(void) return 0; } +const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i) +{ + static const struct vm_mem_backing_src_alias aliases[] = { + [VM_MEM_SRC_ANONYMOUS] = { + .name = "anonymous", + .flag = 0, + }, + [VM_MEM_SRC_ANONYMOUS_THP] = { + .name = "anonymous_thp", + .flag = 0, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB] = { + .name = "anonymous_hugetlb", + .flag = MAP_HUGETLB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = { + .name = "anonymous_hugetlb_16kb", + .flag = MAP_HUGETLB | MAP_HUGE_16KB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = { + .name = "anonymous_hugetlb_64kb", + .flag = MAP_HUGETLB | MAP_HUGE_64KB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = { + .name = "anonymous_hugetlb_512kb", + .flag = MAP_HUGETLB | MAP_HUGE_512KB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = { + .name = "anonymous_hugetlb_1mb", + .flag = MAP_HUGETLB | MAP_HUGE_1MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = { + .name = "anonymous_hugetlb_2mb", + .flag = MAP_HUGETLB | MAP_HUGE_2MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = { + .name = "anonymous_hugetlb_8mb", + .flag = MAP_HUGETLB | MAP_HUGE_8MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = { + .name = "anonymous_hugetlb_16mb", + .flag = MAP_HUGETLB | MAP_HUGE_16MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = { + .name = "anonymous_hugetlb_32mb", + .flag = MAP_HUGETLB | MAP_HUGE_32MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = { + .name = "anonymous_hugetlb_256mb", + .flag = MAP_HUGETLB | MAP_HUGE_256MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = { + .name = "anonymous_hugetlb_512mb", + .flag = MAP_HUGETLB | MAP_HUGE_512MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = { + .name = "anonymous_hugetlb_1gb", + .flag = MAP_HUGETLB | MAP_HUGE_1GB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = { + .name = "anonymous_hugetlb_2gb", + .flag = MAP_HUGETLB | MAP_HUGE_2GB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = { + .name = "anonymous_hugetlb_16gb", + .flag = MAP_HUGETLB | MAP_HUGE_16GB, + }, + }; + _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES, + "Missing new backing src types?"); + + TEST_ASSERT(i < NUM_SRC_TYPES, "Backing src type ID %d too big", i); + + return &aliases[i]; +} + +#define MAP_HUGE_PAGE_SIZE(x) (1ULL << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK)) + +size_t get_backing_src_pagesz(uint32_t i) +{ + uint32_t flag = vm_mem_backing_src_alias(i)->flag; + + switch (i) { + case VM_MEM_SRC_ANONYMOUS: + return getpagesize(); + case VM_MEM_SRC_ANONYMOUS_THP: + return get_trans_hugepagesz(); + case VM_MEM_SRC_ANONYMOUS_HUGETLB: + return get_def_hugetlb_pagesz(); + default: + return MAP_HUGE_PAGE_SIZE(flag); + } +} + void backing_src_help(void) { int i; printf("Available backing src types:\n"); - for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++) - printf("\t%s\n", backing_src_aliases[i].name); + for (i = 0; i < NUM_SRC_TYPES; i++) + printf("\t%s\n", vm_mem_backing_src_alias(i)->name); } enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) { int i; - for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++) - if (!strcmp(type_name, backing_src_aliases[i].name)) - return backing_src_aliases[i].type; + for (i = 0; i < NUM_SRC_TYPES; i++) + if (!strcmp(type_name, vm_mem_backing_src_alias(i)->name)) + return i; backing_src_help(); TEST_FAIL("Unknown backing src type: %s", type_name); -- cgit v1.2.3 From a4b3c8b583bfc8b3a3d04c63a527cbd081eac9f7 Mon Sep 17 00:00:00 2001 From: Yanan Wang Date: Tue, 30 Mar 2021 16:08:55 +0800 Subject: KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(), we have to get the transparent hugepage size for HVA alignment. With the new helpers, we can use get_backing_src_pagesz() to check whether THP is configured and then get the exact configured hugepage size. As different architectures may have different THP page sizes configured, this can get the accurate THP page sizes on any platform. Signed-off-by: Yanan Wang Reviewed-by: Ben Gardon Reviewed-by: Andrew Jones Message-Id: <20210330080856.14940-10-wangyanan55@huawei.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 3506174c2053..c7a2228deaf3 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -18,7 +18,6 @@ #include #include -#define KVM_UTIL_PGS_PER_HUGEPG 512 #define KVM_UTIL_MIN_PFN 2 static int vcpu_mmap_sz(void); @@ -688,7 +687,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, { int ret; struct userspace_mem_region *region; - size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size; + size_t backing_src_pagesz = get_backing_src_pagesz(src_type); size_t alignment; TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, @@ -750,7 +749,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, #endif if (src_type == VM_MEM_SRC_ANONYMOUS_THP) - alignment = max(huge_page_size, alignment); + alignment = max(backing_src_pagesz, alignment); /* Add enough memory to align up if necessary */ if (alignment > 1) @@ -769,22 +768,13 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->host_mem = align(region->mmap_start, alignment); /* As needed perform madvise */ - if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) { - struct stat statbuf; - - ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf); - TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT), - "stat /sys/kernel/mm/transparent_hugepage"); - - TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP, - "VM_MEM_SRC_ANONYMOUS_THP requires THP to be configured in the host kernel"); - - if (ret == 0) { - ret = madvise(region->host_mem, npages * vm->page_size, - src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); - TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %x", - region->host_mem, npages * vm->page_size, src_type); - } + if ((src_type == VM_MEM_SRC_ANONYMOUS || + src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) { + ret = madvise(region->host_mem, npages * vm->page_size, + src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); + TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s", + region->host_mem, npages * vm->page_size, + vm_mem_backing_src_alias(src_type)->name); } region->unused_phy_pages = sparsebit_alloc(); -- cgit v1.2.3 From b9c2bd50eca5dc6ed8eaacbbb2e17df95a56bd1c Mon Sep 17 00:00:00 2001 From: Yanan Wang Date: Tue, 30 Mar 2021 16:08:56 +0800 Subject: KVM: selftests: Add a test for kvm page table code This test serves as a performance tester and a bug reproducer for kvm page table code (GPA->HPA mappings), so it gives guidance for people trying to make some improvement for kvm. The function guest_code() can cover the conditions where a single vcpu or multiple vcpus access guest pages within the same memory region, in three VM stages(before dirty logging, during dirty logging, after dirty logging). Besides, the backing src memory type(ANONYMOUS/THP/HUGETLB) of the tested memory region can be specified by users, which means normal page mappings or block mappings can be chosen by users to be created in the test. If ANONYMOUS memory is specified, kvm will create normal page mappings for the tested memory region before dirty logging, and update attributes of the page mappings from RO to RW during dirty logging. If THP/HUGETLB memory is specified, kvm will create block mappings for the tested memory region before dirty logging, and split the blcok mappings into normal page mappings during dirty logging, and coalesce the page mappings back into block mappings after dirty logging is stopped. So in summary, as a performance tester, this test can present the performance of kvm creating/updating normal page mappings, or the performance of kvm creating/splitting/recovering block mappings, through execution time. When we need to coalesce the page mappings back to block mappings after dirty logging is stopped, we have to firstly invalidate *all* the TLB entries for the page mappings right before installation of the block entry, because a TLB conflict abort error could occur if we can't invalidate the TLB entries fully. We have hit this TLB conflict twice on aarch64 software implementation and fixed it. As this test can imulate process from dirty logging enabled to dirty logging stopped of a VM with block mappings, so it can also reproduce this TLB conflict abort due to inadequate TLB invalidation when coalescing tables. Signed-off-by: Yanan Wang Reviewed-by: Ben Gardon Reviewed-by: Andrew Jones Message-Id: <20210330080856.14940-11-wangyanan55@huawei.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 3 + tools/testing/selftests/kvm/kvm_page_table_test.c | 506 ++++++++++++++++++++++ 3 files changed, 510 insertions(+) create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 32b87cc77c8e..137ab7273be6 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -35,6 +35,7 @@ /dirty_log_perf_test /hardware_disable_test /kvm_create_max_vcpus +/kvm_page_table_test /memslot_modification_stress_test /set_memory_region_test /steal_time diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a6d61f451f88..75dc57db36b4 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -69,6 +69,7 @@ TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test TEST_GEN_PROGS_x86_64 += hardware_disable_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus +TEST_GEN_PROGS_x86_64 += kvm_page_table_test TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time @@ -79,6 +80,7 @@ TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += dirty_log_perf_test TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus +TEST_GEN_PROGS_aarch64 += kvm_page_table_test TEST_GEN_PROGS_aarch64 += set_memory_region_test TEST_GEN_PROGS_aarch64 += steal_time @@ -88,6 +90,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus +TEST_GEN_PROGS_s390x += kvm_page_table_test TEST_GEN_PROGS_s390x += set_memory_region_test TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c new file mode 100644 index 000000000000..1c4753fff19e --- /dev/null +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -0,0 +1,506 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM page table test + * + * Copyright (C) 2021, Huawei, Inc. + * + * Make sure that THP has been enabled or enough HUGETLB pages with specific + * page size have been pre-allocated on your system, if you are planning to + * use hugepages to back the guest memory for testing. + */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "guest_modes.h" + +#define TEST_MEM_SLOT_INDEX 1 + +/* Default size(1GB) of the memory for testing */ +#define DEFAULT_TEST_MEM_SIZE (1 << 30) + +/* Default guest test virtual memory offset */ +#define DEFAULT_GUEST_TEST_MEM 0xc0000000 + +/* Different guest memory accessing stages */ +enum test_stage { + KVM_BEFORE_MAPPINGS, + KVM_CREATE_MAPPINGS, + KVM_UPDATE_MAPPINGS, + KVM_ADJUST_MAPPINGS, + NUM_TEST_STAGES, +}; + +static const char * const test_stage_string[] = { + "KVM_BEFORE_MAPPINGS", + "KVM_CREATE_MAPPINGS", + "KVM_UPDATE_MAPPINGS", + "KVM_ADJUST_MAPPINGS", +}; + +struct vcpu_args { + int vcpu_id; + bool vcpu_write; +}; + +struct test_args { + struct kvm_vm *vm; + uint64_t guest_test_virt_mem; + uint64_t host_page_size; + uint64_t host_num_pages; + uint64_t large_page_size; + uint64_t large_num_pages; + uint64_t host_pages_per_lpage; + enum vm_mem_backing_src_type src_type; + struct vcpu_args vcpu_args[KVM_MAX_VCPUS]; +}; + +/* + * Guest variables. Use addr_gva2hva() if these variables need + * to be changed in host. + */ +static enum test_stage guest_test_stage; + +/* Host variables */ +static uint32_t nr_vcpus = 1; +static struct test_args test_args; +static enum test_stage *current_stage; +static bool host_quit; + +/* Whether the test stage is updated, or completed */ +static sem_t test_stage_updated; +static sem_t test_stage_completed; + +/* + * Guest physical memory offset of the testing memory slot. + * This will be set to the topmost valid physical address minus + * the test memory size. + */ +static uint64_t guest_test_phys_mem; + +/* + * Guest virtual memory offset of the testing memory slot. + * Must not conflict with identity mapped test code. + */ +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; + +static void guest_code(int vcpu_id) +{ + struct test_args *p = &test_args; + struct vcpu_args *vcpu_args = &p->vcpu_args[vcpu_id]; + enum test_stage *current_stage = &guest_test_stage; + uint64_t addr; + int i, j; + + /* Make sure vCPU args data structure is not corrupt */ + GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id); + + while (true) { + addr = p->guest_test_virt_mem; + + switch (READ_ONCE(*current_stage)) { + /* + * All vCPU threads will be started in this stage, + * where guest code of each vCPU will do nothing. + */ + case KVM_BEFORE_MAPPINGS: + break; + + /* + * Before dirty logging, vCPUs concurrently access the first + * 8 bytes of each page (host page/large page) within the same + * memory region with different accessing types (read/write). + * Then KVM will create normal page mappings or huge block + * mappings for them. + */ + case KVM_CREATE_MAPPINGS: + for (i = 0; i < p->large_num_pages; i++) { + if (vcpu_args->vcpu_write) + *(uint64_t *)addr = 0x0123456789ABCDEF; + else + READ_ONCE(*(uint64_t *)addr); + + addr += p->large_page_size; + } + break; + + /* + * During dirty logging, KVM will only update attributes of the + * normal page mappings from RO to RW if memory backing src type + * is anonymous. In other cases, KVM will split the huge block + * mappings into normal page mappings if memory backing src type + * is THP or HUGETLB. + */ + case KVM_UPDATE_MAPPINGS: + if (p->src_type == VM_MEM_SRC_ANONYMOUS) { + for (i = 0; i < p->host_num_pages; i++) { + *(uint64_t *)addr = 0x0123456789ABCDEF; + addr += p->host_page_size; + } + break; + } + + for (i = 0; i < p->large_num_pages; i++) { + /* + * Write to the first host page in each large + * page region, and triger break of large pages. + */ + *(uint64_t *)addr = 0x0123456789ABCDEF; + + /* + * Access the middle host pages in each large + * page region. Since dirty logging is enabled, + * this will create new mappings at the smallest + * granularity. + */ + addr += p->large_page_size / 2; + for (j = 0; j < p->host_pages_per_lpage / 2; j++) { + READ_ONCE(*(uint64_t *)addr); + addr += p->host_page_size; + } + } + break; + + /* + * After dirty logging is stopped, vCPUs concurrently read + * from every single host page. Then KVM will coalesce the + * split page mappings back to block mappings. And a TLB + * conflict abort could occur here if TLB entries of the + * page mappings are not fully invalidated. + */ + case KVM_ADJUST_MAPPINGS: + for (i = 0; i < p->host_num_pages; i++) { + READ_ONCE(*(uint64_t *)addr); + addr += p->host_page_size; + } + break; + + default: + GUEST_ASSERT(0); + } + + GUEST_SYNC(1); + } +} + +static void *vcpu_worker(void *data) +{ + int ret; + struct vcpu_args *vcpu_args = data; + struct kvm_vm *vm = test_args.vm; + int vcpu_id = vcpu_args->vcpu_id; + struct kvm_run *run; + struct timespec start; + struct timespec ts_diff; + enum test_stage stage; + + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + run = vcpu_state(vm, vcpu_id); + + while (!READ_ONCE(host_quit)) { + ret = sem_wait(&test_stage_updated); + TEST_ASSERT(ret == 0, "Error in sem_wait"); + + if (READ_ONCE(host_quit)) + return NULL; + + clock_gettime(CLOCK_MONOTONIC_RAW, &start); + ret = _vcpu_run(vm, vcpu_id); + ts_diff = timespec_elapsed(start); + + TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); + TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC, + "Invalid guest sync status: exit_reason=%s\n", + exit_reason_str(run->exit_reason)); + + pr_debug("Got sync event from vCPU %d\n", vcpu_id); + stage = READ_ONCE(*current_stage); + + /* + * Here we can know the execution time of every + * single vcpu running in different test stages. + */ + pr_debug("vCPU %d has completed stage %s\n" + "execution time is: %ld.%.9lds\n\n", + vcpu_id, test_stage_string[stage], + ts_diff.tv_sec, ts_diff.tv_nsec); + + ret = sem_post(&test_stage_completed); + TEST_ASSERT(ret == 0, "Error in sem_post"); + } + + return NULL; +} + +struct test_params { + uint64_t phys_offset; + uint64_t test_mem_size; + enum vm_mem_backing_src_type src_type; +}; + +static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) +{ + int ret; + struct test_params *p = arg; + struct vcpu_args *vcpu_args; + enum vm_mem_backing_src_type src_type = p->src_type; + uint64_t large_page_size = get_backing_src_pagesz(src_type); + uint64_t guest_page_size = vm_guest_mode_params[mode].page_size; + uint64_t host_page_size = getpagesize(); + uint64_t test_mem_size = p->test_mem_size; + uint64_t guest_num_pages; + uint64_t alignment; + void *host_test_mem; + struct kvm_vm *vm; + int vcpu_id; + + /* Align up the test memory size */ + alignment = max(large_page_size, guest_page_size); + test_mem_size = (test_mem_size + alignment - 1) & ~(alignment - 1); + + /* Create a VM with enough guest pages */ + guest_num_pages = test_mem_size / guest_page_size; + vm = vm_create_with_vcpus(mode, nr_vcpus, + guest_num_pages, 0, guest_code, NULL); + + /* Align down GPA of the testing memslot */ + if (!p->phys_offset) + guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * + guest_page_size; + else + guest_test_phys_mem = p->phys_offset; +#ifdef __s390x__ + alignment = max(0x100000, alignment); +#endif + guest_test_phys_mem &= ~(alignment - 1); + + /* Set up the shared data structure test_args */ + test_args.vm = vm; + test_args.guest_test_virt_mem = guest_test_virt_mem; + test_args.host_page_size = host_page_size; + test_args.host_num_pages = test_mem_size / host_page_size; + test_args.large_page_size = large_page_size; + test_args.large_num_pages = test_mem_size / large_page_size; + test_args.host_pages_per_lpage = large_page_size / host_page_size; + test_args.src_type = src_type; + + for (vcpu_id = 0; vcpu_id < KVM_MAX_VCPUS; vcpu_id++) { + vcpu_args = &test_args.vcpu_args[vcpu_id]; + vcpu_args->vcpu_id = vcpu_id; + vcpu_args->vcpu_write = !(vcpu_id % 2); + } + + /* Add an extra memory slot with specified backing src type */ + vm_userspace_mem_region_add(vm, src_type, guest_test_phys_mem, + TEST_MEM_SLOT_INDEX, guest_num_pages, 0); + + /* Do mapping(GVA->GPA) for the testing memory slot */ + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + + /* Cache the HVA pointer of the region */ + host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); + + /* Export shared structure test_args to guest */ + ucall_init(vm, NULL); + sync_global_to_guest(vm, test_args); + + ret = sem_init(&test_stage_updated, 0, 0); + TEST_ASSERT(ret == 0, "Error in sem_init"); + + ret = sem_init(&test_stage_completed, 0, 0); + TEST_ASSERT(ret == 0, "Error in sem_init"); + + current_stage = addr_gva2hva(vm, (vm_vaddr_t)(&guest_test_stage)); + *current_stage = NUM_TEST_STAGES; + + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + pr_info("Testing memory backing src type: %s\n", + vm_mem_backing_src_alias(src_type)->name); + pr_info("Testing memory backing src granularity: 0x%lx\n", + large_page_size); + pr_info("Testing memory size(aligned): 0x%lx\n", test_mem_size); + pr_info("Guest physical test memory offset: 0x%lx\n", + guest_test_phys_mem); + pr_info("Host virtual test memory offset: 0x%lx\n", + (uint64_t)host_test_mem); + pr_info("Number of testing vCPUs: %d\n", nr_vcpus); + + return vm; +} + +static void vcpus_complete_new_stage(enum test_stage stage) +{ + int ret; + int vcpus; + + /* Wake up all the vcpus to run new test stage */ + for (vcpus = 0; vcpus < nr_vcpus; vcpus++) { + ret = sem_post(&test_stage_updated); + TEST_ASSERT(ret == 0, "Error in sem_post"); + } + pr_debug("All vcpus have been notified to continue\n"); + + /* Wait for all the vcpus to complete new test stage */ + for (vcpus = 0; vcpus < nr_vcpus; vcpus++) { + ret = sem_wait(&test_stage_completed); + TEST_ASSERT(ret == 0, "Error in sem_wait"); + + pr_debug("%d vcpus have completed stage %s\n", + vcpus + 1, test_stage_string[stage]); + } + + pr_debug("All vcpus have completed stage %s\n", + test_stage_string[stage]); +} + +static void run_test(enum vm_guest_mode mode, void *arg) +{ + int ret; + pthread_t *vcpu_threads; + struct kvm_vm *vm; + int vcpu_id; + struct timespec start; + struct timespec ts_diff; + + /* Create VM with vCPUs and make some pre-initialization */ + vm = pre_init_before_test(mode, arg); + + vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); + TEST_ASSERT(vcpu_threads, "Memory allocation failed"); + + host_quit = false; + *current_stage = KVM_BEFORE_MAPPINGS; + + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, + &test_args.vcpu_args[vcpu_id]); + } + + vcpus_complete_new_stage(*current_stage); + pr_info("Started all vCPUs successfully\n"); + + /* Test the stage of KVM creating mappings */ + *current_stage = KVM_CREATE_MAPPINGS; + + clock_gettime(CLOCK_MONOTONIC_RAW, &start); + vcpus_complete_new_stage(*current_stage); + ts_diff = timespec_elapsed(start); + + pr_info("KVM_CREATE_MAPPINGS: total execution time: %ld.%.9lds\n\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + /* Test the stage of KVM updating mappings */ + vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, + KVM_MEM_LOG_DIRTY_PAGES); + + *current_stage = KVM_UPDATE_MAPPINGS; + + clock_gettime(CLOCK_MONOTONIC_RAW, &start); + vcpus_complete_new_stage(*current_stage); + ts_diff = timespec_elapsed(start); + + pr_info("KVM_UPDATE_MAPPINGS: total execution time: %ld.%.9lds\n\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + /* Test the stage of KVM adjusting mappings */ + vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0); + + *current_stage = KVM_ADJUST_MAPPINGS; + + clock_gettime(CLOCK_MONOTONIC_RAW, &start); + vcpus_complete_new_stage(*current_stage); + ts_diff = timespec_elapsed(start); + + pr_info("KVM_ADJUST_MAPPINGS: total execution time: %ld.%.9lds\n\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + /* Tell the vcpu thread to quit */ + host_quit = true; + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + ret = sem_post(&test_stage_updated); + TEST_ASSERT(ret == 0, "Error in sem_post"); + } + + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) + pthread_join(vcpu_threads[vcpu_id], NULL); + + ret = sem_destroy(&test_stage_updated); + TEST_ASSERT(ret == 0, "Error in sem_destroy"); + + ret = sem_destroy(&test_stage_completed); + TEST_ASSERT(ret == 0, "Error in sem_destroy"); + + free(vcpu_threads); + ucall_uninit(vm); + kvm_vm_free(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-p offset] [-m mode] " + "[-b mem-size] [-v vcpus] [-s mem-type]\n", name); + puts(""); + printf(" -p: specify guest physical test memory offset\n" + " Warning: a low offset can conflict with the loaded test code.\n"); + guest_modes_help(); + printf(" -b: specify size of the memory region for testing. e.g. 10M or 3G.\n" + " (default: 1G)\n"); + printf(" -v: specify the number of vCPUs to run\n" + " (default: 1)\n"); + printf(" -s: specify the type of memory that should be used to\n" + " back the guest data region.\n" + " (default: anonymous)\n\n"); + backing_src_help(); + puts(""); +} + +int main(int argc, char *argv[]) +{ + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + struct test_params p = { + .test_mem_size = DEFAULT_TEST_MEM_SIZE, + .src_type = VM_MEM_SRC_ANONYMOUS, + }; + int opt; + + guest_modes_append_default(); + + while ((opt = getopt(argc, argv, "hp:m:b:v:s:")) != -1) { + switch (opt) { + case 'p': + p.phys_offset = strtoull(optarg, NULL, 0); + break; + case 'm': + guest_modes_cmdline(optarg); + break; + case 'b': + p.test_mem_size = parse_size(optarg); + break; + case 'v': + nr_vcpus = atoi(optarg); + TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, + "Invalid number of vcpus, must be between 1 and %d", max_vcpus); + break; + case 's': + p.src_type = parse_backing_src_type(optarg); + break; + case 'h': + default: + help(argv[0]); + exit(0); + } + } + + for_each_guest_mode(run_test, &p); + + return 0; +} -- cgit v1.2.3 From 47d01e7b9999b9591077a59a1ecec11c6ce570de Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 14 Apr 2021 11:07:39 -0500 Subject: libperf: Add support for user space counter access x86 and arm64 can both support direct access of event counters in userspace. The access sequence is less than trivial and currently exists in perf test code (tools/perf/arch/x86/tests/rdpmc.c) with copies in projects such as PAPI and libpfm4. In order to support userspace access, an event must be mmapped first with perf_evsel__mmap(). Then subsequent calls to perf_evsel__read() will use the fast path (assuming the arch supports it). Committer notes: Added a '__maybe_unused' attribute to the read_perf_counter() argument to fix the build on arches other than x86_64 and arm. Committer testing: Building and running the libperf tests in verbose mode (V=1) now shows those "loop = N, count = N" extra lines, testing user space counter access. # make V=1 -C tools/lib/perf tests make: Entering directory '/home/acme/git/perf/tools/lib/perf' make -f /home/acme/git/perf/tools/build/Makefile.build dir=. obj=libperf make -C /home/acme/git/perf/tools/lib/api/ O= libapi.a make -f /home/acme/git/perf/tools/build/Makefile.build dir=./fd obj=libapi make -f /home/acme/git/perf/tools/build/Makefile.build dir=./fs obj=libapi make -C tests gcc -I/home/acme/git/perf/tools/lib/perf/include -I/home/acme/git/perf/tools/include -I/home/acme/git/perf/tools/lib -g -Wall -o test-cpumap-a test-cpumap.c ../libperf.a /home/acme/git/perf/tools/lib/api/libapi.a gcc -I/home/acme/git/perf/tools/lib/perf/include -I/home/acme/git/perf/tools/include -I/home/acme/git/perf/tools/lib -g -Wall -o test-threadmap-a test-threadmap.c ../libperf.a /home/acme/git/perf/tools/lib/api/libapi.a gcc -I/home/acme/git/perf/tools/lib/perf/include -I/home/acme/git/perf/tools/include -I/home/acme/git/perf/tools/lib -g -Wall -o test-evlist-a test-evlist.c ../libperf.a /home/acme/git/perf/tools/lib/api/libapi.a gcc -I/home/acme/git/perf/tools/lib/perf/include -I/home/acme/git/perf/tools/include -I/home/acme/git/perf/tools/lib -g -Wall -o test-evsel-a test-evsel.c ../libperf.a /home/acme/git/perf/tools/lib/api/libapi.a gcc -I/home/acme/git/perf/tools/lib/perf/include -I/home/acme/git/perf/tools/include -I/home/acme/git/perf/tools/lib -g -Wall -L.. -o test-cpumap-so test-cpumap.c /home/acme/git/perf/tools/lib/api/libapi.a -lperf gcc -I/home/acme/git/perf/tools/lib/perf/include -I/home/acme/git/perf/tools/include -I/home/acme/git/perf/tools/lib -g -Wall -L.. -o test-threadmap-so test-threadmap.c /home/acme/git/perf/tools/lib/api/libapi.a -lperf gcc -I/home/acme/git/perf/tools/lib/perf/include -I/home/acme/git/perf/tools/include -I/home/acme/git/perf/tools/lib -g -Wall -L.. -o test-evlist-so test-evlist.c /home/acme/git/perf/tools/lib/api/libapi.a -lperf gcc -I/home/acme/git/perf/tools/lib/perf/include -I/home/acme/git/perf/tools/include -I/home/acme/git/perf/tools/lib -g -Wall -L.. -o test-evsel-so test-evsel.c /home/acme/git/perf/tools/lib/api/libapi.a -lperf make -C tests run running static: - running test-cpumap.c...OK - running test-threadmap.c...OK - running test-evlist.c...OK - running test-evsel.c... loop = 65536, count = 333926 loop = 131072, count = 655781 loop = 262144, count = 1311141 loop = 524288, count = 2630126 loop = 1048576, count = 5256955 loop = 65536, count = 524594 loop = 131072, count = 1058916 loop = 262144, count = 2097458 loop = 524288, count = 4205429 loop = 1048576, count = 8406606 OK running dynamic: - running test-cpumap.c...OK - running test-threadmap.c...OK - running test-evlist.c...OK - running test-evsel.c... loop = 65536, count = 328102 loop = 131072, count = 655782 loop = 262144, count = 1317494 loop = 524288, count = 2627851 loop = 1048576, count = 5255187 loop = 65536, count = 524601 loop = 131072, count = 1048923 loop = 262144, count = 2107917 loop = 524288, count = 4194606 loop = 1048576, count = 8409322 OK make: Leaving directory '/home/acme/git/perf/tools/lib/perf' # Signed-off-by: Rob Herring Acked-by: Jiri Olsa Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Catalin Marinas Cc: Itaru Kitayama Cc: Mark Rutland Cc: Peter Zijlstra Cc: Will Deacon Link: http://lore.kernel.org/lkml/20210414155412.3697605-4-robh@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/evsel.c | 4 ++ tools/lib/perf/include/internal/mmap.h | 3 ++ tools/lib/perf/mmap.c | 88 ++++++++++++++++++++++++++++++++++ tools/lib/perf/tests/test-evsel.c | 66 +++++++++++++++++++++++++ 4 files changed, 161 insertions(+) (limited to 'tools') diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index cd203998e875..bd8c2f19ef74 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -267,6 +267,10 @@ int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread, if (FD(evsel, cpu, thread) < 0) return -EINVAL; + if (MMAP(evsel, cpu, thread) && + !perf_mmap__read_self(MMAP(evsel, cpu, thread), count)) + return 0; + if (readn(FD(evsel, cpu, thread), count->values, size) <= 0) return -errno; diff --git a/tools/lib/perf/include/internal/mmap.h b/tools/lib/perf/include/internal/mmap.h index be7556e0a2b2..5e3422f40ed5 100644 --- a/tools/lib/perf/include/internal/mmap.h +++ b/tools/lib/perf/include/internal/mmap.h @@ -11,6 +11,7 @@ #define PERF_SAMPLE_MAX_SIZE (1 << 16) struct perf_mmap; +struct perf_counts_values; typedef void (*libperf_unmap_cb_t)(struct perf_mmap *map); @@ -52,4 +53,6 @@ void perf_mmap__put(struct perf_mmap *map); u64 perf_mmap__read_head(struct perf_mmap *map); +int perf_mmap__read_self(struct perf_mmap *map, struct perf_counts_values *count); + #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c index 79d5ed6c38cc..c89dfa5f67b3 100644 --- a/tools/lib/perf/mmap.c +++ b/tools/lib/perf/mmap.c @@ -8,9 +8,11 @@ #include #include #include +#include #include #include #include +#include #include "internal.h" void perf_mmap__init(struct perf_mmap *map, struct perf_mmap *prev, @@ -273,3 +275,89 @@ union perf_event *perf_mmap__read_event(struct perf_mmap *map) return event; } + +#if defined(__i386__) || defined(__x86_64__) +static u64 read_perf_counter(unsigned int counter) +{ + unsigned int low, high; + + asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter)); + + return low | ((u64)high) << 32; +} + +static u64 read_timestamp(void) +{ + unsigned int low, high; + + asm volatile("rdtsc" : "=a" (low), "=d" (high)); + + return low | ((u64)high) << 32; +} +#else +static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; } +static u64 read_timestamp(void) { return 0; } +#endif + +int perf_mmap__read_self(struct perf_mmap *map, struct perf_counts_values *count) +{ + struct perf_event_mmap_page *pc = map->base; + u32 seq, idx, time_mult = 0, time_shift = 0; + u64 cnt, cyc = 0, time_offset = 0, time_cycles = 0, time_mask = ~0ULL; + + if (!pc || !pc->cap_user_rdpmc) + return -1; + + do { + seq = READ_ONCE(pc->lock); + barrier(); + + count->ena = READ_ONCE(pc->time_enabled); + count->run = READ_ONCE(pc->time_running); + + if (pc->cap_user_time && count->ena != count->run) { + cyc = read_timestamp(); + time_mult = READ_ONCE(pc->time_mult); + time_shift = READ_ONCE(pc->time_shift); + time_offset = READ_ONCE(pc->time_offset); + + if (pc->cap_user_time_short) { + time_cycles = READ_ONCE(pc->time_cycles); + time_mask = READ_ONCE(pc->time_mask); + } + } + + idx = READ_ONCE(pc->index); + cnt = READ_ONCE(pc->offset); + if (pc->cap_user_rdpmc && idx) { + s64 evcnt = read_perf_counter(idx - 1); + u16 width = READ_ONCE(pc->pmc_width); + + evcnt <<= 64 - width; + evcnt >>= 64 - width; + cnt += evcnt; + } else + return -1; + + barrier(); + } while (READ_ONCE(pc->lock) != seq); + + if (count->ena != count->run) { + u64 delta; + + /* Adjust for cap_usr_time_short, a nop if not */ + cyc = time_cycles + ((cyc - time_cycles) & time_mask); + + delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift); + + count->ena += delta; + if (idx) + count->run += delta; + + cnt = mul_u64_u64_div64(cnt, count->ena, count->run); + } + + count->val = cnt; + + return 0; +} diff --git a/tools/lib/perf/tests/test-evsel.c b/tools/lib/perf/tests/test-evsel.c index 0ad82d7a2a51..288b5feaefe2 100644 --- a/tools/lib/perf/tests/test-evsel.c +++ b/tools/lib/perf/tests/test-evsel.c @@ -120,6 +120,70 @@ static int test_stat_thread_enable(void) return 0; } +static int test_stat_user_read(int event) +{ + struct perf_counts_values counts = { .val = 0 }; + struct perf_thread_map *threads; + struct perf_evsel *evsel; + struct perf_event_mmap_page *pc; + struct perf_event_attr attr = { + .type = PERF_TYPE_HARDWARE, + .config = event, + }; + int err, i; + + threads = perf_thread_map__new_dummy(); + __T("failed to create threads", threads); + + perf_thread_map__set_pid(threads, 0, 0); + + evsel = perf_evsel__new(&attr); + __T("failed to create evsel", evsel); + + err = perf_evsel__open(evsel, NULL, threads); + __T("failed to open evsel", err == 0); + + err = perf_evsel__mmap(evsel, 0); + __T("failed to mmap evsel", err == 0); + + pc = perf_evsel__mmap_base(evsel, 0, 0); + +#if defined(__i386__) || defined(__x86_64__) + __T("userspace counter access not supported", pc->cap_user_rdpmc); + __T("userspace counter access not enabled", pc->index); + __T("userspace counter width not set", pc->pmc_width >= 32); +#endif + + perf_evsel__read(evsel, 0, 0, &counts); + __T("failed to read value for evsel", counts.val != 0); + + for (i = 0; i < 5; i++) { + volatile int count = 0x10000 << i; + __u64 start, end, last = 0; + + __T_VERBOSE("\tloop = %u, ", count); + + perf_evsel__read(evsel, 0, 0, &counts); + start = counts.val; + + while (count--) ; + + perf_evsel__read(evsel, 0, 0, &counts); + end = counts.val; + + __T("invalid counter data", (end - start) > last); + last = end - start; + __T_VERBOSE("count = %llu\n", end - start); + } + + perf_evsel__munmap(evsel); + perf_evsel__close(evsel); + perf_evsel__delete(evsel); + + perf_thread_map__put(threads); + return 0; +} + int main(int argc, char **argv) { __T_START; @@ -129,6 +193,8 @@ int main(int argc, char **argv) test_stat_cpu(); test_stat_thread(); test_stat_thread_enable(); + test_stat_user_read(PERF_COUNT_HW_INSTRUCTIONS); + test_stat_user_read(PERF_COUNT_HW_CPU_CYCLES); __T_END; return tests_failed == 0 ? 0 : -1; -- cgit v1.2.3 From 818869489ba3c4a4ed1360e22b2f66be488ea9f5 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 14 Apr 2021 14:57:58 -0500 Subject: libperf xyarray: Add bounds checks to xyarray__entry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xyarray__entry() is missing any bounds checking yet often the x and y parameters come from external callers. Add bounds checks and an unchecked __xyarray__entry(). Committer notes: Make the 'x' and 'y' arguments to the new xyarray__entry() that does bounds check to be of type 'size_t', so that we cover also the case where 'x' and 'y' could be negative, which is needed anyway as having them as 'int' breaks the build with: /home/acme/git/perf/tools/lib/perf/include/internal/xyarray.h: In function ‘xyarray__entry’: /home/acme/git/perf/tools/lib/perf/include/internal/xyarray.h:28:8: error: comparison of integer expressions of different signedness: ‘int’ and ‘size_t’ {aka ‘long unsigned int’} [-Werror=sign-compare] 28 | if (x >= xy->max_x || y >= xy->max_y) | ^~ /home/acme/git/perf/tools/lib/perf/include/internal/xyarray.h:28:26: error: comparison of integer expressions of different signedness: ‘int’ and ‘size_t’ {aka ‘long unsigned int’} [-Werror=sign-compare] 28 | if (x >= xy->max_x || y >= xy->max_y) | ^~ cc1: all warnings being treated as errors Signed-off-by: Rob Herring Suggested-by: Arnaldo Carvalho de Melo Suggested-by: Namhyung Kim Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210414195758.4078803-1-robh@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/internal/xyarray.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/perf/include/internal/xyarray.h b/tools/lib/perf/include/internal/xyarray.h index 51e35d6c8ec4..f10af3da7b21 100644 --- a/tools/lib/perf/include/internal/xyarray.h +++ b/tools/lib/perf/include/internal/xyarray.h @@ -18,11 +18,18 @@ struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size); void xyarray__delete(struct xyarray *xy); void xyarray__reset(struct xyarray *xy); -static inline void *xyarray__entry(struct xyarray *xy, int x, int y) +static inline void *__xyarray__entry(struct xyarray *xy, int x, int y) { return &xy->contents[x * xy->row_size + y * xy->entry_size]; } +static inline void *xyarray__entry(struct xyarray *xy, size_t x, size_t y) +{ + if (x >= xy->max_x || y >= xy->max_y) + return NULL; + return __xyarray__entry(xy, x, y); +} + static inline int xyarray__max_y(struct xyarray *xy) { return xy->max_y; -- cgit v1.2.3 From 32daa5d7899e03433429bedf9e20d7963179703a Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Mon, 19 Apr 2021 16:50:01 +0530 Subject: perf vendor events: Initial JSON/events list for power10 platform Patch adds initial JSON/events for POWER10. Signed-off-by: Kajol Jain Reviewed-by: Paul Clarke Tested-by: Paul Clarke Acked-by: Michael Ellerman Cc: Athira Jajeev Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: http://lore.kernel.org/lkml/20210419112001.71466-1-kjain@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/powerpc/mapfile.csv | 1 + .../pmu-events/arch/powerpc/power10/cache.json | 47 ++++ .../arch/powerpc/power10/floating_point.json | 7 + .../pmu-events/arch/powerpc/power10/frontend.json | 217 +++++++++++++++ .../pmu-events/arch/powerpc/power10/locks.json | 12 + .../pmu-events/arch/powerpc/power10/marked.json | 147 ++++++++++ .../pmu-events/arch/powerpc/power10/memory.json | 192 +++++++++++++ .../pmu-events/arch/powerpc/power10/others.json | 297 +++++++++++++++++++++ .../pmu-events/arch/powerpc/power10/pipeline.json | 297 +++++++++++++++++++++ .../perf/pmu-events/arch/powerpc/power10/pmc.json | 22 ++ .../arch/powerpc/power10/translation.json | 57 ++++ 11 files changed, 1296 insertions(+) create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/cache.json create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/floating_point.json create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/frontend.json create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/locks.json create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/marked.json create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/memory.json create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/others.json create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/pipeline.json create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/pmc.json create mode 100644 tools/perf/pmu-events/arch/powerpc/power10/translation.json (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/mapfile.csv b/tools/perf/pmu-events/arch/powerpc/mapfile.csv index 229150e7ab7d..4abdfc3f9692 100644 --- a/tools/perf/pmu-events/arch/powerpc/mapfile.csv +++ b/tools/perf/pmu-events/arch/powerpc/mapfile.csv @@ -15,3 +15,4 @@ # Power8 entries 004[bcd][[:xdigit:]]{4},1,power8,core 004e[[:xdigit:]]{4},1,power9,core +0080[[:xdigit:]]{4},1,power10,core diff --git a/tools/perf/pmu-events/arch/powerpc/power10/cache.json b/tools/perf/pmu-events/arch/powerpc/power10/cache.json new file mode 100644 index 000000000000..616f29098c71 --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power10/cache.json @@ -0,0 +1,47 @@ +[ + { + "EventCode": "1003C", + "EventName": "PM_EXEC_STALL_DMISS_L2L3", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from either the local L2 or local L3." + }, + { + "EventCode": "34056", + "EventName": "PM_EXEC_STALL_LOAD_FINISH", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was finishing a load after its data was reloaded from a data source beyond the local L1; cycles in which the LSU was processing an L1-hit; cycles in which the NTF instruction merged with another load in the LMQ." + }, + { + "EventCode": "3006C", + "EventName": "PM_RUN_CYC_SMT2_MODE", + "BriefDescription": "Cycles when this thread's run latch is set and the core is in SMT2 mode." + }, + { + "EventCode": "300F4", + "EventName": "PM_RUN_INST_CMPL_CONC", + "BriefDescription": "PowerPC instructions completed by this thread when all threads in the core had the run-latch set." + }, + { + "EventCode": "4C016", + "EventName": "PM_EXEC_STALL_DMISS_L2L3_CONFLICT", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, with a dispatch conflict." + }, + { + "EventCode": "4D014", + "EventName": "PM_EXEC_STALL_LOAD", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a load instruction executing in the Load Store Unit." + }, + { + "EventCode": "4D016", + "EventName": "PM_EXEC_STALL_PTESYNC", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a PTESYNC instruction executing in the Load Store Unit." + }, + { + "EventCode": "401EA", + "EventName": "PM_THRESH_EXC_128", + "BriefDescription": "Threshold counter exceeded a value of 128." + }, + { + "EventCode": "400F6", + "EventName": "PM_BR_MPRED_CMPL", + "BriefDescription": "A mispredicted branch completed. Includes direction and target." + } +] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json b/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json new file mode 100644 index 000000000000..703cd431ae5b --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json @@ -0,0 +1,7 @@ +[ + { + "EventCode": "4016E", + "EventName": "PM_THRESH_NOT_MET", + "BriefDescription": "Threshold counter did not meet threshold." + } +] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json new file mode 100644 index 000000000000..eac8609dcc90 --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json @@ -0,0 +1,217 @@ +[ + { + "EventCode": "10004", + "EventName": "PM_EXEC_STALL_TRANSLATION", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered a TLB miss or ERAT miss and waited for it to resolve." + }, + { + "EventCode": "10010", + "EventName": "PM_PMC4_OVERFLOW", + "BriefDescription": "The event selected for PMC4 caused the event counter to overflow." + }, + { + "EventCode": "10020", + "EventName": "PM_PMC4_REWIND", + "BriefDescription": "The speculative event selected for PMC4 rewinds and the counter for PMC4 is not charged." + }, + { + "EventCode": "10038", + "EventName": "PM_DISP_STALL_TRANSLATION", + "BriefDescription": "Cycles when dispatch was stalled for this thread because the MMU was handling a translation miss." + }, + { + "EventCode": "1003A", + "EventName": "PM_DISP_STALL_BR_MPRED_IC_L2", + "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2 after suffering a branch mispredict." + }, + { + "EventCode": "1E050", + "EventName": "PM_DISP_STALL_HELD_STF_MAPPER_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the STF mapper/SRB was full. Includes GPR (count, link, tar), VSR, VMR, FPR." + }, + { + "EventCode": "1F054", + "EventName": "PM_DTLB_HIT", + "BriefDescription": "The PTE required by the instruction was resident in the TLB (data TLB access). When MMCR1[16]=0 this event counts only demand hits. When MMCR1[16]=1 this event includes demand and prefetch. Applies to both HPT and RPT." + }, + { + "EventCode": "101E8", + "EventName": "PM_THRESH_EXC_256", + "BriefDescription": "Threshold counter exceeded a count of 256." + }, + { + "EventCode": "101EC", + "EventName": "PM_THRESH_MET", + "BriefDescription": "Threshold exceeded." + }, + { + "EventCode": "100F2", + "EventName": "PM_1PLUS_PPC_CMPL", + "BriefDescription": "Cycles in which at least one instruction is completed by this thread." + }, + { + "EventCode": "100F6", + "EventName": "PM_IERAT_MISS", + "BriefDescription": "IERAT Reloaded to satisfy an IERAT miss. All page sizes are counted by this event." + }, + { + "EventCode": "100F8", + "EventName": "PM_DISP_STALL_CYC", + "BriefDescription": "Cycles the ICT has no itags assigned to this thread (no instructions were dispatched during these cycles)." + }, + { + "EventCode": "20114", + "EventName": "PM_MRK_L2_RC_DISP", + "BriefDescription": "Marked instruction RC dispatched in L2." + }, + { + "EventCode": "2C010", + "EventName": "PM_EXEC_STALL_LSU", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the Load Store Unit. This does not include simple fixed point instructions." + }, + { + "EventCode": "2C016", + "EventName": "PM_DISP_STALL_IERAT_ONLY_MISS", + "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction ERAT miss." + }, + { + "EventCode": "2C01E", + "EventName": "PM_DISP_STALL_BR_MPRED_IC_L3", + "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3 after suffering a branch mispredict." + }, + { + "EventCode": "2D01A", + "EventName": "PM_DISP_STALL_IC_MISS", + "BriefDescription": "Cycles when dispatch was stalled for this thread due to an Icache Miss." + }, + { + "EventCode": "2D01C", + "EventName": "PM_CMPL_STALL_STCX", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a stcx waiting for resolution from the nest before completing." + }, + { + "EventCode": "2E018", + "EventName": "PM_DISP_STALL_FETCH", + "BriefDescription": "Cycles when dispatch was stalled for this thread because Fetch was being held." + }, + { + "EventCode": "2E01A", + "EventName": "PM_DISP_STALL_HELD_XVFC_MAPPER_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the XVFC mapper/SRB was full." + }, + { + "EventCode": "2C142", + "EventName": "PM_MRK_XFER_FROM_SRC_PMC2", + "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[15:27]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." + }, + { + "EventCode": "24050", + "EventName": "PM_IOPS_DISP", + "BriefDescription": "Internal Operations dispatched. PM_IOPS_DISP / PM_INST_DISP will show the average number of internal operations per PowerPC instruction." + }, + { + "EventCode": "2405E", + "EventName": "PM_ISSUE_CANCEL", + "BriefDescription": "An instruction issued and the issue was later cancelled. Only one cancel per PowerPC instruction." + }, + { + "EventCode": "200FA", + "EventName": "PM_BR_TAKEN_CMPL", + "BriefDescription": "Branch Taken instruction completed." + }, + { + "EventCode": "30012", + "EventName": "PM_FLUSH_COMPLETION", + "BriefDescription": "The instruction that was next to complete (oldest in the pipeline) did not complete because it suffered a flush." + }, + { + "EventCode": "30014", + "EventName": "PM_EXEC_STALL_STORE", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a store instruction executing in the Load Store Unit." + }, + { + "EventCode": "30018", + "EventName": "PM_DISP_STALL_HELD_SCOREBOARD_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch while waiting on the Scoreboard. This event combines VSCR and FPSCR together." + }, + { + "EventCode": "30026", + "EventName": "PM_EXEC_STALL_STORE_MISS", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a store whose cache line was not resident in the L1 and was waiting for allocation of the missing line into the L1." + }, + { + "EventCode": "3012A", + "EventName": "PM_MRK_L2_RC_DONE", + "BriefDescription": "L2 RC machine completed the transaction for the marked instruction." + }, + { + "EventCode": "3F046", + "EventName": "PM_ITLB_HIT_1G", + "BriefDescription": "Instruction TLB hit (IERAT reload) page size 1G, which implies Radix Page Table translation is in use. When MMCR1[17]=0 this event counts only for demand misses. When MMCR1[17]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "34058", + "EventName": "PM_DISP_STALL_BR_MPRED_ICMISS", + "BriefDescription": "Cycles when dispatch was stalled after a mispredicted branch resulted in an instruction cache miss." + }, + { + "EventCode": "3D05C", + "EventName": "PM_DISP_STALL_HELD_RENAME_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the mapper/SRB was full. Includes GPR (count, link, tar), VSR, VMR, FPR and XVFC." + }, + { + "EventCode": "3E052", + "EventName": "PM_DISP_STALL_IC_L3", + "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3." + }, + { + "EventCode": "3E054", + "EventName": "PM_LD_MISS_L1", + "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load." + }, + { + "EventCode": "301EA", + "EventName": "PM_THRESH_EXC_1024", + "BriefDescription": "Threshold counter exceeded a value of 1024." + }, + { + "EventCode": "300FA", + "EventName": "PM_INST_FROM_L3MISS", + "BriefDescription": "The processor's instruction cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss." + }, + { + "EventCode": "40006", + "EventName": "PM_ISSUE_KILL", + "BriefDescription": "Cycles in which an instruction or group of instructions were cancelled after being issued. This event increments once per occurrence, regardless of how many instructions are included in the issue group." + }, + { + "EventCode": "40116", + "EventName": "PM_MRK_LARX_FIN", + "BriefDescription": "Marked load and reserve instruction (LARX) finished. LARX and STCX are instructions used to acquire a lock." + }, + { + "EventCode": "4C010", + "EventName": "PM_DISP_STALL_BR_MPRED_IC_L3MISS", + "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from sources beyond the local L3 after suffering a mispredicted branch." + }, + { + "EventCode": "4D01E", + "EventName": "PM_DISP_STALL_BR_MPRED", + "BriefDescription": "Cycles when dispatch was stalled for this thread due to a mispredicted branch." + }, + { + "EventCode": "4E010", + "EventName": "PM_DISP_STALL_IC_L3MISS", + "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from any source beyond the local L3." + }, + { + "EventCode": "4E01A", + "EventName": "PM_DISP_STALL_HELD_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any reason." + }, + { + "EventCode": "44056", + "EventName": "PM_VECTOR_ST_CMPL", + "BriefDescription": "Vector store instructions completed." + } +] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/locks.json b/tools/perf/pmu-events/arch/powerpc/power10/locks.json new file mode 100644 index 000000000000..016d8de0e14a --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power10/locks.json @@ -0,0 +1,12 @@ +[ + { + "EventCode": "1E058", + "EventName": "PM_STCX_FAIL_FIN", + "BriefDescription": "Conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock." + }, + { + "EventCode": "4E050", + "EventName": "PM_STCX_PASS_FIN", + "BriefDescription": "Conditional store instruction (STCX) passed. LARX and STCX are instructions used to acquire a lock." + } +] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/marked.json b/tools/perf/pmu-events/arch/powerpc/power10/marked.json new file mode 100644 index 000000000000..93a5a5910648 --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power10/marked.json @@ -0,0 +1,147 @@ +[ + { + "EventCode": "1002C", + "EventName": "PM_LD_PREFETCH_CACHE_LINE_MISS", + "BriefDescription": "The L1 cache was reloaded with a line that fulfills a prefetch request." + }, + { + "EventCode": "10132", + "EventName": "PM_MRK_INST_ISSUED", + "BriefDescription": "Marked instruction issued. Note that stores always get issued twice, the address gets issued to the LSU and the data gets issued to the VSU. Also, issues can sometimes get killed/cancelled and cause multiple sequential issues for the same instruction." + }, + { + "EventCode": "101E0", + "EventName": "PM_MRK_INST_DISP", + "BriefDescription": "The thread has dispatched a randomly sampled marked instruction." + }, + { + "EventCode": "101E2", + "EventName": "PM_MRK_BR_TAKEN_CMPL", + "BriefDescription": "Marked Branch Taken instruction completed." + }, + { + "EventCode": "20112", + "EventName": "PM_MRK_NTF_FIN", + "BriefDescription": "The marked instruction became the oldest in the pipeline before it finished. It excludes instructions that finish at dispatch." + }, + { + "EventCode": "2C01C", + "EventName": "PM_EXEC_STALL_DMISS_OFF_CHIP", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a remote chip." + }, + { + "EventCode": "20138", + "EventName": "PM_MRK_ST_NEST", + "BriefDescription": "A store has been sampled/marked and is at the point of execution where it has completed in the core and can no longer be flushed. At this point the store is sent to the L2." + }, + { + "EventCode": "2013A", + "EventName": "PM_MRK_BRU_FIN", + "BriefDescription": "Marked Branch instruction finished." + }, + { + "EventCode": "2C144", + "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC2", + "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[15:27]." + }, + { + "EventCode": "24156", + "EventName": "PM_MRK_STCX_FIN", + "BriefDescription": "Marked conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock." + }, + { + "EventCode": "24158", + "EventName": "PM_MRK_INST", + "BriefDescription": "An instruction was marked. Includes both Random Instruction Sampling (RIS) at decode time and Random Event Sampling (RES) at the time the configured event happens." + }, + { + "EventCode": "2415C", + "EventName": "PM_MRK_BR_CMPL", + "BriefDescription": "A marked branch completed. All branches are included." + }, + { + "EventCode": "200FD", + "EventName": "PM_L1_ICACHE_MISS", + "BriefDescription": "Demand iCache Miss." + }, + { + "EventCode": "30130", + "EventName": "PM_MRK_INST_FIN", + "BriefDescription": "marked instruction finished. Excludes instructions that finish at dispatch. Note that stores always finish twice since the address gets issued to the LSU and the data gets issued to the VSU." + }, + { + "EventCode": "34146", + "EventName": "PM_MRK_LD_CMPL", + "BriefDescription": "Marked loads completed." + }, + { + "EventCode": "3E158", + "EventName": "PM_MRK_STCX_FAIL", + "BriefDescription": "Marked conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock." + }, + { + "EventCode": "3E15A", + "EventName": "PM_MRK_ST_FIN", + "BriefDescription": "The marked instruction was a store of any kind." + }, + { + "EventCode": "30068", + "EventName": "PM_L1_ICACHE_RELOADED_PREF", + "BriefDescription": "Counts all Icache prefetch reloads ( includes demand turned into prefetch)." + }, + { + "EventCode": "301E4", + "EventName": "PM_MRK_BR_MPRED_CMPL", + "BriefDescription": "Marked Branch Mispredicted. Includes direction and target." + }, + { + "EventCode": "300F6", + "EventName": "PM_LD_DEMAND_MISS_L1", + "BriefDescription": "The L1 cache was reloaded with a line that fulfills a demand miss request. Counted at reload time, before finish." + }, + { + "EventCode": "300FE", + "EventName": "PM_DATA_FROM_L3MISS", + "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss." + }, + { + "EventCode": "40012", + "EventName": "PM_L1_ICACHE_RELOADED_ALL", + "BriefDescription": "Counts all Icache reloads includes demand, prefetch, prefetch turned into demand and demand turned into prefetch." + }, + { + "EventCode": "40134", + "EventName": "PM_MRK_INST_TIMEO", + "BriefDescription": "Marked instruction finish timeout (instruction was lost)." + }, + { + "EventCode": "4003C", + "EventName": "PM_DISP_STALL_HELD_SYNC_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of a synchronizing instruction that requires the ICT to be empty before dispatch." + }, + { + "EventCode": "4505A", + "EventName": "PM_SP_FLOP_CMPL", + "BriefDescription": "Single Precision floating point instructions completed." + }, + { + "EventCode": "4D058", + "EventName": "PM_VECTOR_FLOP_CMPL", + "BriefDescription": "Vector floating point instructions completed." + }, + { + "EventCode": "4D05A", + "EventName": "PM_NON_MATH_FLOP_CMPL", + "BriefDescription": "Non Math instructions completed." + }, + { + "EventCode": "401E0", + "EventName": "PM_MRK_INST_CMPL", + "BriefDescription": "marked instruction completed." + }, + { + "EventCode": "400FE", + "EventName": "PM_DATA_FROM_MEMORY", + "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss." + } +] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/memory.json b/tools/perf/pmu-events/arch/powerpc/power10/memory.json new file mode 100644 index 000000000000..b01141eeebee --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power10/memory.json @@ -0,0 +1,192 @@ +[ + { + "EventCode": "1000A", + "EventName": "PM_PMC3_REWIND", + "BriefDescription": "The speculative event selected for PMC3 rewinds and the counter for PMC3 is not charged." + }, + { + "EventCode": "1C040", + "EventName": "PM_XFER_FROM_SRC_PMC1", + "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[0:12]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." + }, + { + "EventCode": "1C142", + "EventName": "PM_MRK_XFER_FROM_SRC_PMC1", + "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[0:12]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." + }, + { + "EventCode": "1C144", + "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC1", + "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[0:12]." + }, + { + "EventCode": "1C056", + "EventName": "PM_DERAT_MISS_4K", + "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 4K. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "1C058", + "EventName": "PM_DTLB_MISS_16G", + "BriefDescription": "Data TLB reload (after a miss) page size 16G. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "1C05C", + "EventName": "PM_DTLB_MISS_2M", + "BriefDescription": "Data TLB reload (after a miss) page size 2M. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "1E056", + "EventName": "PM_EXEC_STALL_STORE_PIPE", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the store unit. This does not include cycles spent handling store misses, PTESYNC instructions or TLBIE instructions." + }, + { + "EventCode": "1F150", + "EventName": "PM_MRK_ST_L2_CYC", + "BriefDescription": "Cycles from L2 RC dispatch to L2 RC completion." + }, + { + "EventCode": "10062", + "EventName": "PM_LD_L3MISS_PEND_CYC", + "BriefDescription": "Cycles L3 miss was pending for this thread." + }, + { + "EventCode": "20010", + "EventName": "PM_PMC1_OVERFLOW", + "BriefDescription": "The event selected for PMC1 caused the event counter to overflow." + }, + { + "EventCode": "2001A", + "EventName": "PM_ITLB_HIT", + "BriefDescription": "The PTE required to translate the instruction address was resident in the TLB (instruction TLB access/IERAT reload). Applies to both HPT and RPT. When MMCR1[17]=0 this event counts only for demand misses. When MMCR1[17]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "2003E", + "EventName": "PM_PTESYNC_FIN", + "BriefDescription": "Ptesync instruction finished in the store unit. Only one ptesync can finish at a time." + }, + { + "EventCode": "2C040", + "EventName": "PM_XFER_FROM_SRC_PMC2", + "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[15:27]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." + }, + { + "EventCode": "2C054", + "EventName": "PM_DERAT_MISS_64K", + "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 64K. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "2C056", + "EventName": "PM_DTLB_MISS_4K", + "BriefDescription": "Data TLB reload (after a miss) page size 4K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "2D154", + "EventName": "PM_MRK_DERAT_MISS_64K", + "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 64K for a marked instruction. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "200F6", + "EventName": "PM_DERAT_MISS", + "BriefDescription": "DERAT Reloaded to satisfy a DERAT miss. All page sizes are counted by this event. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "3000A", + "EventName": "PM_DISP_STALL_ITLB_MISS", + "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction TLB miss." + }, + { + "EventCode": "30016", + "EventName": "PM_EXEC_STALL_DERAT_DTLB_MISS", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered a TLB miss and waited for it resolve." + }, + { + "EventCode": "3C040", + "EventName": "PM_XFER_FROM_SRC_PMC3", + "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." + }, + { + "EventCode": "3C142", + "EventName": "PM_MRK_XFER_FROM_SRC_PMC3", + "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." + }, + { + "EventCode": "3C144", + "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC3", + "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[30:42]." + }, + { + "EventCode": "3C054", + "EventName": "PM_DERAT_MISS_16M", + "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 16M. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "3C056", + "EventName": "PM_DTLB_MISS_64K", + "BriefDescription": "Data TLB reload (after a miss) page size 64K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "3C058", + "EventName": "PM_LARX_FIN", + "BriefDescription": "Load and reserve instruction (LARX) finished. LARX and STCX are instructions used to acquire a lock." + }, + { + "EventCode": "301E2", + "EventName": "PM_MRK_ST_CMPL", + "BriefDescription": "Marked store completed and sent to nest. Note that this count excludes cache-inhibited stores." + }, + { + "EventCode": "300FC", + "EventName": "PM_DTLB_MISS", + "BriefDescription": "The DPTEG required for the load/store instruction in execution was missing from the TLB. It includes pages of all sizes for demand and prefetch activity." + }, + { + "EventCode": "4D02C", + "EventName": "PM_PMC1_REWIND", + "BriefDescription": "The speculative event selected for PMC1 rewinds and the counter for PMC1 is not charged." + }, + { + "EventCode": "4003E", + "EventName": "PM_LD_CMPL", + "BriefDescription": "Loads completed." + }, + { + "EventCode": "4C040", + "EventName": "PM_XFER_FROM_SRC_PMC4", + "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[45:57]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." + }, + { + "EventCode": "4C142", + "EventName": "PM_MRK_XFER_FROM_SRC_PMC4", + "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[45:57]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." + }, + { + "EventCode": "4C144", + "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC4", + "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[45:57]." + }, + { + "EventCode": "4C056", + "EventName": "PM_DTLB_MISS_16M", + "BriefDescription": "Data TLB reload (after a miss) page size 16M. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "4C05A", + "EventName": "PM_DTLB_MISS_1G", + "BriefDescription": "Data TLB reload (after a miss) page size 1G. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "4C15E", + "EventName": "PM_MRK_DTLB_MISS_64K", + "BriefDescription": "Marked Data TLB reload (after a miss) page size 64K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "4D056", + "EventName": "PM_NON_FMA_FLOP_CMPL", + "BriefDescription": "Non FMA instruction completed." + }, + { + "EventCode": "40164", + "EventName": "PM_MRK_DERAT_MISS_2M", + "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 2M for a marked instruction. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + } +] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/others.json b/tools/perf/pmu-events/arch/powerpc/power10/others.json new file mode 100644 index 000000000000..a119e56cbf1c --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power10/others.json @@ -0,0 +1,297 @@ +[ + { + "EventCode": "10016", + "EventName": "PM_VSU0_ISSUE", + "BriefDescription": "VSU instructions issued to VSU pipe 0." + }, + { + "EventCode": "1001C", + "EventName": "PM_ULTRAVISOR_INST_CMPL", + "BriefDescription": "PowerPC instructions that completed while the thread was in ultravisor state." + }, + { + "EventCode": "100F0", + "EventName": "PM_CYC", + "BriefDescription": "Processor cycles." + }, + { + "EventCode": "10134", + "EventName": "PM_MRK_ST_DONE_L2", + "BriefDescription": "Marked stores completed in L2 (RC machine done)." + }, + { + "EventCode": "1505E", + "EventName": "PM_LD_HIT_L1", + "BriefDescription": "Loads that finished without experiencing an L1 miss." + }, + { + "EventCode": "1D05E", + "EventName": "PM_DISP_STALL_HELD_HALT_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of power management." + }, + { + "EventCode": "1E054", + "EventName": "PM_EXEC_STALL_DMISS_L21_L31", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from another core's L2 or L3 on the same chip." + }, + { + "EventCode": "1E05A", + "EventName": "PM_CMPL_STALL_LWSYNC", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a lwsync waiting to complete." + }, + { + "EventCode": "1F056", + "EventName": "PM_DISP_SS0_2_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 0 dispatches either 1 or 2 instructions." + }, + { + "EventCode": "1F15C", + "EventName": "PM_MRK_STCX_L2_CYC", + "BriefDescription": "Cycles spent in the nest portion of a marked Stcx instruction. It starts counting when the operation starts to drain to the L2 and it stops counting when the instruction retires from the Instruction Completion Table (ICT) in the Instruction Sequencing Unit (ISU)." + }, + { + "EventCode": "10066", + "EventName": "PM_ADJUNCT_CYC", + "BriefDescription": "Cycles in which the thread is in Adjunct state. MSR[S HV PR] bits = 011." + }, + { + "EventCode": "101E4", + "EventName": "PM_MRK_L1_ICACHE_MISS", + "BriefDescription": "Marked Instruction suffered an icache Miss." + }, + { + "EventCode": "101EA", + "EventName": "PM_MRK_L1_RELOAD_VALID", + "BriefDescription": "Marked demand reload." + }, + { + "EventCode": "100F4", + "EventName": "PM_FLOP_CMPL", + "BriefDescription": "Floating Point Operations Completed. Includes any type. It counts once for each 1, 2, 4 or 8 flop instruction. Use PM_1|2|4|8_FLOP_CMPL events to count flops." + }, + { + "EventCode": "100FA", + "EventName": "PM_RUN_LATCH_ANY_THREAD_CYC", + "BriefDescription": "Cycles when at least one thread has the run latch set." + }, + { + "EventCode": "100FC", + "EventName": "PM_LD_REF_L1", + "BriefDescription": "All L1 D cache load references counted at finish, gated by reject. In P9 and earlier this event counted only cacheable loads but in P10 both cacheable and non-cacheable loads are included." + }, + { + "EventCode": "20006", + "EventName": "PM_DISP_STALL_HELD_ISSQ_FULL_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch due to Issue queue full. Includes issue queue and branch queue." + }, + { + "EventCode": "2000C", + "EventName": "PM_RUN_LATCH_ALL_THREADS_CYC", + "BriefDescription": "Cycles when the run latch is set for all threads." + }, + { + "EventCode": "2E010", + "EventName": "PM_ADJUNCT_INST_CMPL", + "BriefDescription": "PowerPC instructions that completed while the thread is in Adjunct state." + }, + { + "EventCode": "2E014", + "EventName": "PM_STCX_FIN", + "BriefDescription": "Conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock." + }, + { + "EventCode": "20130", + "EventName": "PM_MRK_INST_DECODED", + "BriefDescription": "An instruction was marked at decode time. Random Instruction Sampling (RIS) only." + }, + { + "EventCode": "20132", + "EventName": "PM_MRK_DFU_ISSUE", + "BriefDescription": "The marked instruction was a decimal floating point operation issued to the VSU. Measured at issue time." + }, + { + "EventCode": "20134", + "EventName": "PM_MRK_FXU_ISSUE", + "BriefDescription": "The marked instruction was a fixed point operation issued to the VSU. Measured at issue time." + }, + { + "EventCode": "2505C", + "EventName": "PM_VSU_ISSUE", + "BriefDescription": "At least one VSU instruction was issued to one of the VSU pipes. Up to 4 per cycle. Includes fixed point operations." + }, + { + "EventCode": "2F054", + "EventName": "PM_DISP_SS1_2_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 1 dispatches either 1 or 2 instructions." + }, + { + "EventCode": "2F056", + "EventName": "PM_DISP_SS1_4_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 1 dispatches either 3 or 4 instructions." + }, + { + "EventCode": "2006C", + "EventName": "PM_RUN_CYC_SMT4_MODE", + "BriefDescription": "Cycles when this thread's run latch is set and the core is in SMT4 mode." + }, + { + "EventCode": "201E0", + "EventName": "PM_MRK_DATA_FROM_MEMORY", + "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss for a marked load." + }, + { + "EventCode": "201E4", + "EventName": "PM_MRK_DATA_FROM_L3MISS", + "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss for a marked load." + }, + { + "EventCode": "201E8", + "EventName": "PM_THRESH_EXC_512", + "BriefDescription": "Threshold counter exceeded a value of 512." + }, + { + "EventCode": "200F2", + "EventName": "PM_INST_DISP", + "BriefDescription": "PowerPC instructions dispatched." + }, + { + "EventCode": "30132", + "EventName": "PM_MRK_VSU_FIN", + "BriefDescription": "VSU marked instructions finished. Excludes simple FX instructions issued to the Store Unit." + }, + { + "EventCode": "30038", + "EventName": "PM_EXEC_STALL_DMISS_LMEM", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local memory, local OpenCapp cache, or local OpenCapp memory." + }, + { + "EventCode": "3F04A", + "EventName": "PM_LSU_ST5_FIN", + "BriefDescription": "LSU Finished an internal operation in ST2 port." + }, + { + "EventCode": "34054", + "EventName": "PM_EXEC_STALL_DMISS_L2L3_NOCONFLICT", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, without a dispatch conflict." + }, + { + "EventCode": "3405A", + "EventName": "PM_PRIVILEGED_INST_CMPL", + "BriefDescription": "PowerPC Instructions that completed while the thread is in Privileged state." + }, + { + "EventCode": "3F150", + "EventName": "PM_MRK_ST_DRAIN_CYC", + "BriefDescription": "cycles to drain st from core to L2." + }, + { + "EventCode": "3F054", + "EventName": "PM_DISP_SS0_4_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 0 dispatches either 3 or 4 instructions." + }, + { + "EventCode": "3F056", + "EventName": "PM_DISP_SS0_8_INSTR_CYC", + "BriefDescription": "Cycles in which Superslice 0 dispatches either 5, 6, 7 or 8 instructions." + }, + { + "EventCode": "30162", + "EventName": "PM_MRK_ISSUE_DEPENDENT_LOAD", + "BriefDescription": "The marked instruction was dependent on a load. It is eligible for issue kill." + }, + { + "EventCode": "40114", + "EventName": "PM_MRK_START_PROBE_NOP_DISP", + "BriefDescription": "Marked Start probe nop dispatched. Instruction AND R0,R0,R0." + }, + { + "EventCode": "4001C", + "EventName": "PM_VSU_FIN", + "BriefDescription": "VSU instructions finished." + }, + { + "EventCode": "4C01A", + "EventName": "PM_EXEC_STALL_DMISS_OFF_NODE", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a distant chip." + }, + { + "EventCode": "4D012", + "EventName": "PM_PMC3_SAVED", + "BriefDescription": "The conditions for the speculative event selected for PMC3 are met and PMC3 is charged." + }, + { + "EventCode": "4D022", + "EventName": "PM_HYPERVISOR_INST_CMPL", + "BriefDescription": "PowerPC instructions that completed while the thread is in hypervisor state." + }, + { + "EventCode": "4D026", + "EventName": "PM_ULTRAVISOR_CYC", + "BriefDescription": "Cycles when the thread is in Ultravisor state. MSR[S HV PR]=110." + }, + { + "EventCode": "4D028", + "EventName": "PM_PRIVILEGED_CYC", + "BriefDescription": "Cycles when the thread is in Privileged state. MSR[S HV PR]=x00." + }, + { + "EventCode": "40030", + "EventName": "PM_INST_FIN", + "BriefDescription": "Instructions finished." + }, + { + "EventCode": "44146", + "EventName": "PM_MRK_STCX_CORE_CYC", + "BriefDescription": "Cycles spent in the core portion of a marked Stcx instruction. It starts counting when the instruction is decoded and stops counting when it drains into the L2." + }, + { + "EventCode": "44054", + "EventName": "PM_VECTOR_LD_CMPL", + "BriefDescription": "Vector load instructions completed." + }, + { + "EventCode": "45054", + "EventName": "PM_FMA_CMPL", + "BriefDescription": "Two floating point instructions completed (FMA class of instructions: fmadd, fnmadd, fmsub, fnmsub). Scalar instructions only." + }, + { + "EventCode": "45056", + "EventName": "PM_SCALAR_FLOP_CMPL", + "BriefDescription": "Scalar floating point instructions completed." + }, + { + "EventCode": "4505C", + "EventName": "PM_MATH_FLOP_CMPL", + "BriefDescription": "Math floating point instructions completed." + }, + { + "EventCode": "4D05E", + "EventName": "PM_BR_CMPL", + "BriefDescription": "A branch completed. All branches are included." + }, + { + "EventCode": "4E15E", + "EventName": "PM_MRK_INST_FLUSHED", + "BriefDescription": "The marked instruction was flushed." + }, + { + "EventCode": "401E6", + "EventName": "PM_MRK_INST_FROM_L3MISS", + "BriefDescription": "The processor's instruction cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss for a marked instruction." + }, + { + "EventCode": "401E8", + "EventName": "PM_MRK_DATA_FROM_L2MISS", + "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1 or L2 due to a demand miss for a marked load." + }, + { + "EventCode": "400F0", + "EventName": "PM_LD_DEMAND_MISS_L1_FIN", + "BriefDescription": "Load Missed L1, counted at finish time." + }, + { + "EventCode": "400FA", + "EventName": "PM_RUN_INST_CMPL", + "BriefDescription": "Completed PowerPC instructions gated by the run latch." + } +] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json new file mode 100644 index 000000000000..b61b5cc157ee --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json @@ -0,0 +1,297 @@ +[ + { + "EventCode": "100FE", + "EventName": "PM_INST_CMPL", + "BriefDescription": "PowerPC instructions completed." + }, + { + "EventCode": "10006", + "EventName": "PM_DISP_STALL_HELD_OTHER_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any other reason." + }, + { + "EventCode": "1000C", + "EventName": "PM_LSU_LD0_FIN", + "BriefDescription": "LSU Finished an internal operation in LD0 port." + }, + { + "EventCode": "1000E", + "EventName": "PM_MMA_ISSUED", + "BriefDescription": "MMA instructions issued." + }, + { + "EventCode": "10012", + "EventName": "PM_LSU_ST0_FIN", + "BriefDescription": "LSU Finished an internal operation in ST0 port." + }, + { + "EventCode": "10014", + "EventName": "PM_LSU_ST4_FIN", + "BriefDescription": "LSU Finished an internal operation in ST4 port." + }, + { + "EventCode": "10018", + "EventName": "PM_IC_DEMAND_CYC", + "BriefDescription": "Cycles in which an instruction reload is pending to satisfy a demand miss." + }, + { + "EventCode": "10022", + "EventName": "PM_PMC2_SAVED", + "BriefDescription": "The conditions for the speculative event selected for PMC2 are met and PMC2 is charged." + }, + { + "EventCode": "10024", + "EventName": "PM_PMC5_OVERFLOW", + "BriefDescription": "The event selected for PMC5 caused the event counter to overflow." + }, + { + "EventCode": "10058", + "EventName": "PM_EXEC_STALL_FIN_AT_DISP", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline finished at dispatch and did not require execution in the LSU, BRU or VSU." + }, + { + "EventCode": "1005A", + "EventName": "PM_FLUSH_MPRED", + "BriefDescription": "A flush occurred due to a mispredicted branch. Includes target and direction." + }, + { + "EventCode": "1C05A", + "EventName": "PM_DERAT_MISS_2M", + "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 2M. Implies radix translation. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." + }, + { + "EventCode": "10064", + "EventName": "PM_DISP_STALL_IC_L2", + "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2." + }, + { + "EventCode": "10068", + "EventName": "PM_BR_FIN", + "BriefDescription": "A branch instruction finished. Includes predicted/mispredicted/unconditional." + }, + { + "EventCode": "1006A", + "EventName": "PM_FX_LSU_FIN", + "BriefDescription": "Simple fixed point instruction issued to the store unit. Measured at finish time." + }, + { + "EventCode": "1006C", + "EventName": "PM_RUN_CYC_ST_MODE", + "BriefDescription": "Cycles when the run latch is set and the core is in ST mode." + }, + { + "EventCode": "20004", + "EventName": "PM_ISSUE_STALL", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was dispatched but not issued yet." + }, + { + "EventCode": "2000A", + "EventName": "PM_HYPERVISOR_CYC", + "BriefDescription": "Cycles when the thread is in Hypervisor state. MSR[S HV PR]=010." + }, + { + "EventCode": "2000E", + "EventName": "PM_LSU_LD1_FIN", + "BriefDescription": "LSU Finished an internal operation in LD1 port." + }, + { + "EventCode": "2C014", + "EventName": "PM_CMPL_STALL_SPECIAL", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline required special handling before completing." + }, + { + "EventCode": "2C018", + "EventName": "PM_EXEC_STALL_DMISS_L3MISS", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a source beyond the local L2 or local L3." + }, + { + "EventCode": "2D010", + "EventName": "PM_LSU_ST1_FIN", + "BriefDescription": "LSU Finished an internal operation in ST1 port." + }, + { + "EventCode": "2D012", + "EventName": "PM_VSU1_ISSUE", + "BriefDescription": "VSU instructions issued to VSU pipe 1." + }, + { + "EventCode": "2D018", + "EventName": "PM_EXEC_STALL_VSU", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the VSU (includes FXU, VSU, CRU)." + }, + { + "EventCode": "2E01E", + "EventName": "PM_EXEC_STALL_NTC_FLUSH", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in any unit before it was flushed. Note that if the flush of the oldest instruction happens after finish, the cycles from dispatch to issue will be included in PM_DISP_STALL and the cycles from issue to finish will be included in PM_EXEC_STALL and its corresponding children." + }, + { + "EventCode": "2013C", + "EventName": "PM_MRK_FX_LSU_FIN", + "BriefDescription": "The marked instruction was simple fixed point that was issued to the store unit. Measured at finish time." + }, + { + "EventCode": "2405A", + "EventName": "PM_NTC_FIN", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline (NTC) finishes. Note that instructions can finish out of order, therefore not all the instructions that finish have a Next-to-complete status." + }, + { + "EventCode": "201E2", + "EventName": "PM_MRK_LD_MISS_L1", + "BriefDescription": "Marked DL1 Demand Miss counted at finish time." + }, + { + "EventCode": "200F4", + "EventName": "PM_RUN_CYC", + "BriefDescription": "Processor cycles gated by the run latch." + }, + { + "EventCode": "30004", + "EventName": "PM_DISP_STALL_FLUSH", + "BriefDescription": "Cycles when dispatch was stalled because of a flush that happened to an instruction(s) that was not yet NTC. PM_EXEC_STALL_NTC_FLUSH only includes instructions that were flushed after becoming NTC." + }, + { + "EventCode": "30008", + "EventName": "PM_EXEC_STALL", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting to finish in one of the execution units (BRU, LSU, VSU). Only cycles between issue and finish are counted in this category." + }, + { + "EventCode": "3001A", + "EventName": "PM_LSU_ST2_FIN", + "BriefDescription": "LSU Finished an internal operation in ST2 port." + }, + { + "EventCode": "30020", + "EventName": "PM_PMC2_REWIND", + "BriefDescription": "The speculative event selected for PMC2 rewinds and the counter for PMC2 is not charged." + }, + { + "EventCode": "30022", + "EventName": "PM_PMC4_SAVED", + "BriefDescription": "The conditions for the speculative event selected for PMC4 are met and PMC4 is charged." + }, + { + "EventCode": "30024", + "EventName": "PM_PMC6_OVERFLOW", + "BriefDescription": "The event selected for PMC6 caused the event counter to overflow." + }, + { + "EventCode": "30028", + "EventName": "PM_CMPL_STALL_MEM_ECC", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for the non-speculative finish of either a stcx waiting for its result or a load waiting for non-critical sectors of data and ECC." + }, + { + "EventCode": "30036", + "EventName": "PM_EXEC_STALL_SIMPLE_FX", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a simple fixed point instruction executing in the Load Store Unit." + }, + { + "EventCode": "3003A", + "EventName": "PM_CMPL_STALL_EXCEPTION", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was not allowed to complete because it was interrupted by ANY exception, which has to be serviced before the instruction can complete." + }, + { + "EventCode": "3F044", + "EventName": "PM_VSU2_ISSUE", + "BriefDescription": "VSU instructions issued to VSU pipe 2." + }, + { + "EventCode": "30058", + "EventName": "PM_TLBIE_FIN", + "BriefDescription": "TLBIE instructions finished in the LSU. Two TLBIEs can finish each cycle. All will be counted." + }, + { + "EventCode": "3D058", + "EventName": "PM_SCALAR_FSQRT_FDIV_ISSUE", + "BriefDescription": "Scalar versions of four floating point operations: fdiv,fsqrt (xvdivdp, xvdivsp, xvsqrtdp, xvsqrtsp)." + }, + { + "EventCode": "30066", + "EventName": "PM_LSU_FIN", + "BriefDescription": "LSU Finished an internal operation (up to 4 per cycle)." + }, + { + "EventCode": "40004", + "EventName": "PM_FXU_ISSUE", + "BriefDescription": "A fixed point instruction was issued to the VSU." + }, + { + "EventCode": "40008", + "EventName": "PM_NTC_ALL_FIN", + "BriefDescription": "Cycles in which both instructions in the ICT entry pair show as finished. These are the cycles between finish and completion for the oldest pair of instructions in the pipeline." + }, + { + "EventCode": "40010", + "EventName": "PM_PMC3_OVERFLOW", + "BriefDescription": "The event selected for PMC3 caused the event counter to overflow." + }, + { + "EventCode": "4C012", + "EventName": "PM_EXEC_STALL_DERAT_ONLY_MISS", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered an ERAT miss and waited for it resolve." + }, + { + "EventCode": "4C018", + "EventName": "PM_CMPL_STALL", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline cannot complete because the thread was blocked for any reason." + }, + { + "EventCode": "4C01E", + "EventName": "PM_LSU_ST3_FIN", + "BriefDescription": "LSU Finished an internal operation in ST3 port." + }, + { + "EventCode": "4D018", + "EventName": "PM_EXEC_STALL_BRU", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the Branch unit." + }, + { + "EventCode": "4D01A", + "EventName": "PM_CMPL_STALL_HWSYNC", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a hwsync waiting for response from L2 before completing." + }, + { + "EventCode": "4D01C", + "EventName": "PM_EXEC_STALL_TLBIEL", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a TLBIEL instruction executing in the Load Store Unit. TLBIEL instructions have lower overhead than TLBIE instructions because they don't get set to the nest." + }, + { + "EventCode": "4E012", + "EventName": "PM_EXEC_STALL_UNKNOWN", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline completed without an ntf_type pulse. The ntf_pulse was missed by the ISU because the NTF finishes and completions came too close together." + }, + { + "EventCode": "4D020", + "EventName": "PM_VSU3_ISSUE", + "BriefDescription": "VSU instruction was issued to VSU pipe 3." + }, + { + "EventCode": "40132", + "EventName": "PM_MRK_LSU_FIN", + "BriefDescription": "LSU marked instruction finish." + }, + { + "EventCode": "45058", + "EventName": "PM_IC_MISS_CMPL", + "BriefDescription": "Non-speculative icache miss, counted at completion." + }, + { + "EventCode": "4D050", + "EventName": "PM_VSU_NON_FLOP_CMPL", + "BriefDescription": "Non-floating point VSU instructions completed." + }, + { + "EventCode": "4D052", + "EventName": "PM_2FLOP_CMPL", + "BriefDescription": "Double Precision vector version of fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg completed." + }, + { + "EventCode": "400F2", + "EventName": "PM_1PLUS_PPC_DISP", + "BriefDescription": "Cycles at least one Instr Dispatched." + }, + { + "EventCode": "400F8", + "EventName": "PM_FLUSH", + "BriefDescription": "Flush (any type)." + } +] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json new file mode 100644 index 000000000000..ea122a91ceb0 --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json @@ -0,0 +1,22 @@ +[ + { + "EventCode": "301E8", + "EventName": "PM_THRESH_EXC_64", + "BriefDescription": "Threshold counter exceeded a value of 64." + }, + { + "EventCode": "45050", + "EventName": "PM_1FLOP_CMPL", + "BriefDescription": "One floating point instruction completed (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg)." + }, + { + "EventCode": "45052", + "EventName": "PM_4FLOP_CMPL", + "BriefDescription": "Four floating point instructions completed (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg)." + }, + { + "EventCode": "4D054", + "EventName": "PM_8FLOP_CMPL", + "BriefDescription": "Four Double Precision vector instructions completed." + } +] diff --git a/tools/perf/pmu-events/arch/powerpc/power10/translation.json b/tools/perf/pmu-events/arch/powerpc/power10/translation.json new file mode 100644 index 000000000000..5a714e3dd71a --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power10/translation.json @@ -0,0 +1,57 @@ +[ + { + "EventCode": "1F15E", + "EventName": "PM_MRK_START_PROBE_NOP_CMPL", + "BriefDescription": "Marked Start probe nop (AND R0,R0,R0) completed." + }, + { + "EventCode": "20016", + "EventName": "PM_ST_FIN", + "BriefDescription": "Store finish count. Includes speculative activity." + }, + { + "EventCode": "20018", + "EventName": "PM_ST_FWD", + "BriefDescription": "Store forwards that finished." + }, + { + "EventCode": "2011C", + "EventName": "PM_MRK_NTF_CYC", + "BriefDescription": "Cycles during which the marked instruction is the oldest in the pipeline (NTF or NTC)." + }, + { + "EventCode": "2E01C", + "EventName": "PM_EXEC_STALL_TLBIE", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a TLBIE instruction executing in the Load Store Unit." + }, + { + "EventCode": "201E6", + "EventName": "PM_THRESH_EXC_32", + "BriefDescription": "Threshold counter exceeded a value of 32." + }, + { + "EventCode": "200F0", + "EventName": "PM_ST_CMPL", + "BriefDescription": "Stores completed from S2Q (2nd-level store queue). This event includes regular stores, stcx and cache inhibited stores. The following operations are excluded (pteupdate, snoop tlbie complete, store atomics, miso, load atomic payloads, tlbie, tlbsync, slbieg, isync, msgsnd, slbiag, cpabort, copy, tcheck, tend, stsync, dcbst, icbi, dcbf, hwsync, lwsync, ptesync, eieio, msgsync)." + }, + { + "EventCode": "200FE", + "EventName": "PM_DATA_FROM_L2MISS", + "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1 or L2 due to a demand miss." + }, + { + "EventCode": "30010", + "EventName": "PM_PMC2_OVERFLOW", + "BriefDescription": "The event selected for PMC2 caused the event counter to overflow." + }, + { + "EventCode": "4D010", + "EventName": "PM_PMC1_SAVED", + "BriefDescription": "The conditions for the speculative event selected for PMC1 are met and PMC1 is charged." + }, + { + "EventCode": "4D05C", + "EventName": "PM_DPP_FLOP_CMPL", + "BriefDescription": "Double-Precision or Quad-Precision instructions completed." + } +] -- cgit v1.2.3 From f07952b17969777196512368a216baae1ad45ea6 Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 19 Apr 2021 12:41:44 +0300 Subject: perf stat: Basic support for iostat in perf Add basic flow for a new iostat mode in perf. Mode is intended to provide four I/O performance metrics per each PCIe root port: Inbound Read, Inbound Write, Outbound Read, Outbound Write. The actual code to compute the metrics and attribute it to root port is in follow-on patches. Signed-off-by: Alexander Antonov Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Alexey V Bayduraev Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210419094147.15909-2-alexander.antonov@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 21 ++++++++++++++++- tools/perf/util/Build | 1 + tools/perf/util/iostat.c | 53 ++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/iostat.h | 47 +++++++++++++++++++++++++++++++++++++ tools/perf/util/stat-display.c | 40 +++++++++++++++++++++++-------- tools/perf/util/stat-shadow.c | 5 +++- tools/perf/util/stat.h | 1 + 7 files changed, 156 insertions(+), 12 deletions(-) create mode 100644 tools/perf/util/iostat.c create mode 100644 tools/perf/util/iostat.h (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 2a2c15cac80a..ba5b31aab86b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -68,6 +68,7 @@ #include "util/affinity.h" #include "util/pfm.h" #include "util/bpf_counter.h" +#include "util/iostat.h" #include "asm/bug.h" #include @@ -212,7 +213,8 @@ static struct perf_stat_config stat_config = { .walltime_nsecs_stats = &walltime_nsecs_stats, .big_num = true, .ctl_fd = -1, - .ctl_fd_ack = -1 + .ctl_fd_ack = -1, + .iostat_run = false, }; static bool cpus_map_matched(struct evsel *a, struct evsel *b) @@ -1268,6 +1270,9 @@ static struct option stat_options[] = { "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n" "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.", parse_control_option), + OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "default", + "measure I/O performance metrics provided by arch/platform", + iostat_parse), OPT_END() }; @@ -2341,6 +2346,17 @@ int cmd_stat(int argc, const char **argv) goto out; } + if (stat_config.iostat_run) { + status = iostat_prepare(evsel_list, &stat_config); + if (status) + goto out; + if (iostat_mode == IOSTAT_LIST) { + iostat_list(evsel_list, &stat_config); + goto out; + } else if (verbose) + iostat_list(evsel_list, &stat_config); + } + if (add_default_attributes()) goto out; @@ -2516,6 +2532,9 @@ int cmd_stat(int argc, const char **argv) perf_stat__exit_aggr_mode(); evlist__free_stats(evsel_list); out: + if (stat_config.iostat_run) + iostat_release(evsel_list); + zfree(&stat_config.walltime_run); if (smi_cost && smi_reset) diff --git a/tools/perf/util/Build b/tools/perf/util/Build index e3e12f9d4733..7dd815712d60 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -102,6 +102,7 @@ perf-y += rwsem.o perf-y += thread-stack.o perf-y += spark.o perf-y += topdown.o +perf-y += iostat.o perf-y += stream.o perf-$(CONFIG_AUXTRACE) += auxtrace.o perf-$(CONFIG_AUXTRACE) += intel-pt-decoder/ diff --git a/tools/perf/util/iostat.c b/tools/perf/util/iostat.c new file mode 100644 index 000000000000..57dd49da28fe --- /dev/null +++ b/tools/perf/util/iostat.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "util/iostat.h" +#include "util/debug.h" + +enum iostat_mode_t iostat_mode = IOSTAT_NONE; + +__weak int iostat_prepare(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ + return -1; +} + +__weak int iostat_parse(const struct option *opt __maybe_unused, + const char *str __maybe_unused, + int unset __maybe_unused) +{ + pr_err("iostat mode is not supported on current platform\n"); + return -1; +} + +__weak void iostat_list(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused) +{ +} + +__weak void iostat_release(struct evlist *evlist __maybe_unused) +{ +} + +__weak void iostat_print_header_prefix(struct perf_stat_config *config __maybe_unused) +{ +} + +__weak void iostat_print_metric(struct perf_stat_config *config __maybe_unused, + struct evsel *evsel __maybe_unused, + struct perf_stat_output_ctx *out __maybe_unused) +{ +} + +__weak void iostat_prefix(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused, + char *prefix __maybe_unused, + struct timespec *ts __maybe_unused) +{ +} + +__weak void iostat_print_counters(struct evlist *evlist __maybe_unused, + struct perf_stat_config *config __maybe_unused, + struct timespec *ts __maybe_unused, + char *prefix __maybe_unused, + iostat_print_counter_t print_cnt_cb __maybe_unused) +{ +} diff --git a/tools/perf/util/iostat.h b/tools/perf/util/iostat.h new file mode 100644 index 000000000000..23c1c46a331a --- /dev/null +++ b/tools/perf/util/iostat.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * perf iostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#ifndef _IOSTAT_H +#define _IOSTAT_H + +#include +#include "util/stat.h" +#include "util/parse-events.h" +#include "util/evlist.h" + +struct option; +struct perf_stat_config; +struct evlist; +struct timespec; + +enum iostat_mode_t { + IOSTAT_NONE = -1, + IOSTAT_RUN = 0, + IOSTAT_LIST = 1 +}; + +extern enum iostat_mode_t iostat_mode; + +typedef void (*iostat_print_counter_t)(struct perf_stat_config *, struct evsel *, char *); + +int iostat_prepare(struct evlist *evlist, struct perf_stat_config *config); +int iostat_parse(const struct option *opt, const char *str, + int unset __maybe_unused); +void iostat_list(struct evlist *evlist, struct perf_stat_config *config); +void iostat_release(struct evlist *evlist); +void iostat_prefix(struct evlist *evlist, struct perf_stat_config *config, + char *prefix, struct timespec *ts); +void iostat_print_header_prefix(struct perf_stat_config *config); +void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel, + struct perf_stat_output_ctx *out); +void iostat_print_counters(struct evlist *evlist, + struct perf_stat_config *config, struct timespec *ts, + char *prefix, iostat_print_counter_t print_cnt_cb); + +#endif /* _IOSTAT_H */ diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index d3137bc17065..98664caef6af 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -17,6 +17,7 @@ #include "cgroup.h" #include #include "util.h" +#include "iostat.h" #define CNTR_NOT_SUPPORTED "" #define CNTR_NOT_COUNTED "" @@ -310,6 +311,11 @@ static void print_metric_header(struct perf_stat_config *config, struct outstate *os = ctx; char tbuf[1024]; + /* In case of iostat, print metric header for first root port only */ + if (config->iostat_run && + os->evsel->priv != os->evsel->evlist->selected->priv) + return; + if (!valid_only_metric(unit)) return; unit = fixunit(tbuf, os->evsel, unit); @@ -958,8 +964,11 @@ static void print_metric_headers(struct perf_stat_config *config, if (config->csv_output) { if (config->interval) fputs("time,", config->output); - fputs(aggr_header_csv[config->aggr_mode], config->output); + if (!config->iostat_run) + fputs(aggr_header_csv[config->aggr_mode], config->output); } + if (config->iostat_run) + iostat_print_header_prefix(config); /* Print metrics headers only */ evlist__for_each_entry(evlist, counter) { @@ -989,7 +998,8 @@ static void print_interval(struct perf_stat_config *config, if (config->interval_clear) puts(CONSOLE_CLEAR); - sprintf(prefix, "%6lu.%09lu%s", (unsigned long) ts->tv_sec, ts->tv_nsec, config->csv_sep); + if (!config->iostat_run) + sprintf(prefix, "%6lu.%09lu%s", (unsigned long) ts->tv_sec, ts->tv_nsec, config->csv_sep); if ((num_print_interval == 0 && !config->csv_output) || config->interval_clear) { switch (config->aggr_mode) { @@ -1025,9 +1035,11 @@ static void print_interval(struct perf_stat_config *config, break; case AGGR_GLOBAL: default: - fprintf(output, "# time"); - if (!metric_only) - fprintf(output, " counts %*s events\n", unit_width, "unit"); + if (!config->iostat_run) { + fprintf(output, "# time"); + if (!metric_only) + fprintf(output, " counts %*s events\n", unit_width, "unit"); + } case AGGR_UNSET: break; } @@ -1220,6 +1232,9 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf struct evsel *counter; char buf[64], *prefix = NULL; + if (config->iostat_run) + evlist->selected = evlist__first(evlist); + if (interval) print_interval(config, evlist, prefix = buf, ts); else @@ -1232,7 +1247,7 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf print_metric_headers(config, evlist, prefix, false); if (num_print_iv++ == 25) num_print_iv = 0; - if (config->aggr_mode == AGGR_GLOBAL && prefix) + if (config->aggr_mode == AGGR_GLOBAL && prefix && !config->iostat_run) fprintf(config->output, "%s", prefix); } @@ -1249,11 +1264,16 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf } break; case AGGR_GLOBAL: - evlist__for_each_entry(evlist, counter) { - print_counter_aggr(config, counter, prefix); + if (config->iostat_run) + iostat_print_counters(evlist, config, ts, prefix = buf, + print_counter_aggr); + else { + evlist__for_each_entry(evlist, counter) { + print_counter_aggr(config, counter, prefix); + } + if (metric_only) + fputc('\n', config->output); } - if (metric_only) - fputc('\n', config->output); break; case AGGR_NONE: if (metric_only) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 3f800e71126f..39967a45f55b 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -11,6 +11,7 @@ #include "cgroup.h" #include "units.h" #include +#include "iostat.h" /* * AGGR_GLOBAL: Use CPU 0 @@ -962,7 +963,9 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, struct metric_event *me; int num = 1; - if (evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { + if (config->iostat_run) { + iostat_print_metric(config, evsel, out); + } else if (evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd); if (total) { diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 48e6a06233fa..32c8527de347 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -133,6 +133,7 @@ struct perf_stat_config { bool metric_no_merge; bool stop_read_counter; bool quiet; + bool iostat_run; FILE *output; unsigned int interval; unsigned int timeout; -- cgit v1.2.3 From 19776d3cede733dc9be79d880339acb9b2f456d6 Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 19 Apr 2021 12:41:45 +0300 Subject: perf stat: Helper functions for PCIe root ports list in iostat mode Introduce helper functions to control PCIe root ports list. These helpers will be used in the follow-up patch. Signed-off-by: Alexander Antonov Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Alexey V Bayduraev Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210419094147.15909-3-alexander.antonov@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/iostat.c | 110 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 tools/perf/arch/x86/util/iostat.c (limited to 'tools') diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c new file mode 100644 index 000000000000..c4471f8efa5e --- /dev/null +++ b/tools/perf/arch/x86/util/iostat.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * perf iostat + * + * Copyright (C) 2020, Intel Corporation + * + * Authors: Alexander Antonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util/cpumap.h" +#include "util/debug.h" +#include "util/iostat.h" +#include "util/counts.h" +#include "path.h" + +struct iio_root_port { + u32 domain; + u8 bus; + u8 die; + u8 pmu_idx; + int idx; +}; + +struct iio_root_ports_list { + struct iio_root_port **rps; + int nr_entries; +}; + +static void iio_root_port_show(FILE *output, + const struct iio_root_port * const rp) +{ + if (output && rp) + fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n", + rp->die, rp->pmu_idx, rp->domain, rp->bus); +} + +static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus, + u8 die, u8 pmu_idx) +{ + struct iio_root_port *p = calloc(1, sizeof(*p)); + + if (p) { + p->domain = domain; + p->bus = bus; + p->die = die; + p->pmu_idx = pmu_idx; + } + return p; +} + +static void iio_root_ports_list_free(struct iio_root_ports_list *list) +{ + int idx; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) + free(list->rps[idx]); + free(list->rps); + free(list); + } +} + +static struct iio_root_port *iio_root_port_find_by_notation( + const struct iio_root_ports_list * const list, u32 domain, u8 bus) +{ + int idx; + struct iio_root_port *rp; + + if (list) { + for (idx = 0; idx < list->nr_entries; idx++) { + rp = list->rps[idx]; + if (rp && rp->domain == domain && rp->bus == bus) + return rp; + } + } + return NULL; +} + +static int iio_root_ports_list_insert(struct iio_root_ports_list *list, + struct iio_root_port * const rp) +{ + struct iio_root_port **tmp_buf; + + if (list && rp) { + rp->idx = list->nr_entries++; + tmp_buf = realloc(list->rps, + list->nr_entries * sizeof(*list->rps)); + if (!tmp_buf) { + pr_err("Failed to realloc memory\n"); + return -ENOMEM; + } + tmp_buf[rp->idx] = rp; + list->rps = tmp_buf; + } + return 0; +} -- cgit v1.2.3 From f9ed693e8bc0e7de9eb766a3c7178590e8bb6cd5 Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 19 Apr 2021 12:41:46 +0300 Subject: perf stat: Enable iostat mode for x86 platforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This functionality is based on recently introduced sysfs attributes for Intel® Xeon® Scalable processor family (code name Skylake-SP): Commit bb42b3d39781d7fc ("perf/x86/intel/uncore: Expose an Uncore unit to IIO PMON mapping") Mode is intended to provide four I/O performance metrics in MB per each PCIe root port: - Inbound Read: I/O devices below root port read from the host memory - Inbound Write: I/O devices below root port write to the host memory - Outbound Read: CPU reads from I/O devices below root port - Outbound Write: CPU writes to I/O devices below root port Each metric requiries only one uncore event which increments at every 4B transfer in corresponding direction. The formulas to compute metrics are generic: #EventCount * 4B / (1024 * 1024) Acked-by: Namhyung Kim Signed-off-by: Alexander Antonov Cc: Alexander Shishkin Cc: Alexey V Bayduraev Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210419094147.15909-4-alexander.antonov@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-iostat.txt | 88 ++++++++ tools/perf/Makefile.perf | 5 +- tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/iostat.c | 360 +++++++++++++++++++++++++++++++ tools/perf/command-list.txt | 1 + tools/perf/perf-iostat.sh | 12 ++ 6 files changed, 466 insertions(+), 1 deletion(-) create mode 100644 tools/perf/Documentation/perf-iostat.txt create mode 100644 tools/perf/perf-iostat.sh (limited to 'tools') diff --git a/tools/perf/Documentation/perf-iostat.txt b/tools/perf/Documentation/perf-iostat.txt new file mode 100644 index 000000000000..165176944031 --- /dev/null +++ b/tools/perf/Documentation/perf-iostat.txt @@ -0,0 +1,88 @@ +perf-iostat(1) +=============== + +NAME +---- +perf-iostat - Show I/O performance metrics + +SYNOPSIS +-------- +[verse] +'perf iostat' list +'perf iostat' -- [] + +DESCRIPTION +----------- +Mode is intended to provide four I/O performance metrics per each PCIe root port: + +- Inbound Read - I/O devices below root port read from the host memory, in MB + +- Inbound Write - I/O devices below root port write to the host memory, in MB + +- Outbound Read - CPU reads from I/O devices below root port, in MB + +- Outbound Write - CPU writes to I/O devices below root port, in MB + +OPTIONS +------- +...:: + Any command you can specify in a shell. + +list:: + List all PCIe root ports. + +:: + Select the root ports for monitoring. Comma-separated list is supported. + +EXAMPLES +-------- + +1. List all PCIe root ports (example for 2-S platform): + + $ perf iostat list + S0-uncore_iio_0<0000:00> + S1-uncore_iio_0<0000:80> + S0-uncore_iio_1<0000:17> + S1-uncore_iio_1<0000:85> + S0-uncore_iio_2<0000:3a> + S1-uncore_iio_2<0000:ae> + S0-uncore_iio_3<0000:5d> + S1-uncore_iio_3<0000:d7> + +2. Collect metrics for all PCIe root ports: + + $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s + + Performance counter stats for 'system wide': + + port Inbound Read(MB) Inbound Write(MB) Outbound Read(MB) Outbound Write(MB) + 0000:00 1 0 2 3 + 0000:80 0 0 0 0 + 0000:17 352552 43 0 21 + 0000:85 0 0 0 0 + 0000:3a 3 0 0 0 + 0000:ae 0 0 0 0 + 0000:5d 0 0 0 0 + 0000:d7 0 0 0 0 + +3. Collect metrics for comma-separated list of PCIe root ports: + + $ perf iostat 0000:17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct + 357708+0 records in + 357707+0 records out + 375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s + + Performance counter stats for 'system wide': + + port Inbound Read(MB) Inbound Write(MB) Outbound Read(MB) Outbound Write(MB) + 0000:17 358559 44 0 22 + 0000:3a 3 2 0 0 + + 197.081983474 seconds time elapsed + +SEE ALSO +-------- +linkperf:perf-stat[1] \ No newline at end of file diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 090fb9d62665..6240fbb1646e 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -283,6 +283,7 @@ SCRIPT_SH = SCRIPT_SH += perf-archive.sh SCRIPT_SH += perf-with-kcore.sh +SCRIPT_SH += perf-iostat.sh grep-libs = $(filter -l%,$(1)) strip-libs = $(filter-out -l%,$(1)) @@ -948,6 +949,8 @@ endif $(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' $(call QUIET_INSTALL, perf-with-kcore) \ $(INSTALL) $(OUTPUT)perf-with-kcore -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' + $(call QUIET_INSTALL, perf-iostat) \ + $(INSTALL) $(OUTPUT)perf-iostat -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' ifndef NO_LIBAUDIT $(call QUIET_INSTALL, strace/groups) \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(STRACE_GROUPS_INSTDIR_SQ)'; \ @@ -1042,7 +1045,7 @@ bpf-skel-clean: $(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS) clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean - $(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) + $(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(OUTPUT)perf-iostat $(LANG_BINDINGS) $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)pmu-events/jevents $(OUTPUT)$(LIBJVMTI).so diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index 0c72d418932e..dbeb04cb336e 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -9,6 +9,7 @@ perf-y += event.o perf-y += evlist.o perf-y += mem-events.o perf-y += evsel.o +perf-y += iostat.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c index c4471f8efa5e..d63acb782b63 100644 --- a/tools/perf/arch/x86/util/iostat.c +++ b/tools/perf/arch/x86/util/iostat.c @@ -27,6 +27,36 @@ #include "util/counts.h" #include "path.h" +#ifndef MAX_PATH +#define MAX_PATH 1024 +#endif + +#define UNCORE_IIO_PMU_PATH "devices/uncore_iio_%d" +#define SYSFS_UNCORE_PMU_PATH "%s/"UNCORE_IIO_PMU_PATH +#define PLATFORM_MAPPING_PATH UNCORE_IIO_PMU_PATH"/die%d" + +/* + * Each metric requiries one IIO event which increments at every 4B transfer + * in corresponding direction. The formulas to compute metrics are generic: + * #EventCount * 4B / (1024 * 1024) + */ +static const char * const iostat_metrics[] = { + "Inbound Read(MB)", + "Inbound Write(MB)", + "Outbound Read(MB)", + "Outbound Write(MB)", +}; + +static inline int iostat_metrics_count(void) +{ + return sizeof(iostat_metrics) / sizeof(char *); +} + +static const char *iostat_metric_by_idx(int idx) +{ + return *(iostat_metrics + idx % iostat_metrics_count()); +} + struct iio_root_port { u32 domain; u8 bus; @@ -40,6 +70,8 @@ struct iio_root_ports_list { int nr_entries; }; +static struct iio_root_ports_list *root_ports; + static void iio_root_port_show(FILE *output, const struct iio_root_port * const rp) { @@ -108,3 +140,331 @@ static int iio_root_ports_list_insert(struct iio_root_ports_list *list, } return 0; } + +static int iio_mapping(u8 pmu_idx, struct iio_root_ports_list * const list) +{ + char *buf; + char path[MAX_PATH]; + u32 domain; + u8 bus; + struct iio_root_port *rp; + size_t size; + int ret; + + for (int die = 0; die < cpu__max_node(); die++) { + scnprintf(path, MAX_PATH, PLATFORM_MAPPING_PATH, pmu_idx, die); + if (sysfs__read_str(path, &buf, &size) < 0) { + if (pmu_idx) + goto out; + pr_err("Mode iostat is not supported\n"); + return -1; + } + ret = sscanf(buf, "%04x:%02hhx", &domain, &bus); + free(buf); + if (ret != 2) { + pr_err("Invalid mapping data: iio_%d; die%d\n", + pmu_idx, die); + return -1; + } + rp = iio_root_port_new(domain, bus, die, pmu_idx); + if (!rp || iio_root_ports_list_insert(list, rp)) { + free(rp); + return -ENOMEM; + } + } +out: + return 0; +} + +static u8 iio_pmu_count(void) +{ + u8 pmu_idx = 0; + char path[MAX_PATH]; + const char *sysfs = sysfs__mountpoint(); + + if (sysfs) { + for (;; pmu_idx++) { + snprintf(path, sizeof(path), SYSFS_UNCORE_PMU_PATH, + sysfs, pmu_idx); + if (access(path, F_OK) != 0) + break; + } + } + return pmu_idx; +} + +static int iio_root_ports_scan(struct iio_root_ports_list **list) +{ + int ret = -ENOMEM; + struct iio_root_ports_list *tmp_list; + u8 pmu_count = iio_pmu_count(); + + if (!pmu_count) { + pr_err("Unsupported uncore pmu configuration\n"); + return -1; + } + + tmp_list = calloc(1, sizeof(*tmp_list)); + if (!tmp_list) + goto err; + + for (u8 pmu_idx = 0; pmu_idx < pmu_count; pmu_idx++) { + ret = iio_mapping(pmu_idx, tmp_list); + if (ret) + break; + } +err: + if (!ret) + *list = tmp_list; + else + iio_root_ports_list_free(tmp_list); + + return ret; +} + +static int iio_root_port_parse_str(u32 *domain, u8 *bus, char *str) +{ + int ret; + regex_t regex; + /* + * Expected format domain:bus: + * Valid domain range [0:ffff] + * Valid bus range [0:ff] + * Example: 0000:af, 0:3d, 01:7 + */ + regcomp(®ex, "^([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})", REG_EXTENDED); + ret = regexec(®ex, str, 0, NULL, 0); + if (ret || sscanf(str, "%08x:%02hhx", domain, bus) != 2) + pr_warning("Unrecognized root port format: %s\n" + "Please use the following format:\n" + "\t [domain]:[bus]\n" + "\t for example: 0000:3d\n", str); + + regfree(®ex); + return ret; +} + +static int iio_root_ports_list_filter(struct iio_root_ports_list **list, + const char *filter) +{ + char *tok, *tmp, *filter_copy = NULL; + struct iio_root_port *rp; + u32 domain; + u8 bus; + int ret = -ENOMEM; + struct iio_root_ports_list *tmp_list = calloc(1, sizeof(*tmp_list)); + + if (!tmp_list) + goto err; + + filter_copy = strdup(filter); + if (!filter_copy) + goto err; + + for (tok = strtok_r(filter_copy, ",", &tmp); tok; + tok = strtok_r(NULL, ",", &tmp)) { + if (!iio_root_port_parse_str(&domain, &bus, tok)) { + rp = iio_root_port_find_by_notation(*list, domain, bus); + if (rp) { + (*list)->rps[rp->idx] = NULL; + ret = iio_root_ports_list_insert(tmp_list, rp); + if (ret) { + free(rp); + goto err; + } + } else if (!iio_root_port_find_by_notation(tmp_list, + domain, bus)) + pr_warning("Root port %04x:%02x were not found\n", + domain, bus); + } + } + + if (tmp_list->nr_entries == 0) { + pr_err("Requested root ports were not found\n"); + ret = -EINVAL; + } +err: + iio_root_ports_list_free(*list); + if (ret) + iio_root_ports_list_free(tmp_list); + else + *list = tmp_list; + + free(filter_copy); + return ret; +} + +static int iostat_event_group(struct evlist *evl, + struct iio_root_ports_list *list) +{ + int ret; + int idx; + const char *iostat_cmd_template = + "{uncore_iio_%x/event=0x83,umask=0x04,ch_mask=0xF,fc_mask=0x07/,\ + uncore_iio_%x/event=0x83,umask=0x01,ch_mask=0xF,fc_mask=0x07/,\ + uncore_iio_%x/event=0xc0,umask=0x04,ch_mask=0xF,fc_mask=0x07/,\ + uncore_iio_%x/event=0xc0,umask=0x01,ch_mask=0xF,fc_mask=0x07/}"; + const int len_template = strlen(iostat_cmd_template) + 1; + struct evsel *evsel = NULL; + int metrics_count = iostat_metrics_count(); + char *iostat_cmd = calloc(len_template, 1); + + if (!iostat_cmd) + return -ENOMEM; + + for (idx = 0; idx < list->nr_entries; idx++) { + sprintf(iostat_cmd, iostat_cmd_template, + list->rps[idx]->pmu_idx, list->rps[idx]->pmu_idx, + list->rps[idx]->pmu_idx, list->rps[idx]->pmu_idx); + ret = parse_events(evl, iostat_cmd, NULL); + if (ret) + goto err; + } + + evlist__for_each_entry(evl, evsel) { + evsel->priv = list->rps[evsel->idx / metrics_count]; + } + list->nr_entries = 0; +err: + iio_root_ports_list_free(list); + free(iostat_cmd); + return ret; +} + +int iostat_prepare(struct evlist *evlist, struct perf_stat_config *config) +{ + if (evlist->core.nr_entries > 0) { + pr_warning("The -e and -M options are not supported." + "All chosen events/metrics will be dropped\n"); + evlist__delete(evlist); + evlist = evlist__new(); + if (!evlist) + return -ENOMEM; + } + + config->metric_only = true; + config->aggr_mode = AGGR_GLOBAL; + + return iostat_event_group(evlist, root_ports); +} + +int iostat_parse(const struct option *opt, const char *str, + int unset __maybe_unused) +{ + int ret; + struct perf_stat_config *config = (struct perf_stat_config *)opt->data; + + ret = iio_root_ports_scan(&root_ports); + if (!ret) { + config->iostat_run = true; + if (!str) + iostat_mode = IOSTAT_RUN; + else if (!strcmp(str, "list")) + iostat_mode = IOSTAT_LIST; + else { + iostat_mode = IOSTAT_RUN; + ret = iio_root_ports_list_filter(&root_ports, str); + } + } + return ret; +} + +void iostat_list(struct evlist *evlist, struct perf_stat_config *config) +{ + struct evsel *evsel; + struct iio_root_port *rp = NULL; + + evlist__for_each_entry(evlist, evsel) { + if (rp != evsel->priv) { + rp = evsel->priv; + iio_root_port_show(config->output, rp); + } + } +} + +void iostat_release(struct evlist *evlist) +{ + struct evsel *evsel; + struct iio_root_port *rp = NULL; + + evlist__for_each_entry(evlist, evsel) { + if (rp != evsel->priv) { + rp = evsel->priv; + free(evsel->priv); + } + } +} + +void iostat_prefix(struct evlist *evlist, + struct perf_stat_config *config, + char *prefix, struct timespec *ts) +{ + struct iio_root_port *rp = evlist->selected->priv; + + if (rp) { + if (ts) + sprintf(prefix, "%6lu.%09lu%s%04x:%02x%s", + ts->tv_sec, ts->tv_nsec, + config->csv_sep, rp->domain, rp->bus, + config->csv_sep); + else + sprintf(prefix, "%04x:%02x%s", rp->domain, rp->bus, + config->csv_sep); + } +} + +void iostat_print_header_prefix(struct perf_stat_config *config) +{ + if (config->csv_output) + fputs("port,", config->output); + else if (config->interval) + fprintf(config->output, "# time port "); + else + fprintf(config->output, " port "); +} + +void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel, + struct perf_stat_output_ctx *out) +{ + double iostat_value = 0; + u64 prev_count_val = 0; + const char *iostat_metric = iostat_metric_by_idx(evsel->idx); + u8 die = ((struct iio_root_port *)evsel->priv)->die; + struct perf_counts_values *count = perf_counts(evsel->counts, die, 0); + + if (count->run && count->ena) { + if (evsel->prev_raw_counts && !out->force_header) { + struct perf_counts_values *prev_count = + perf_counts(evsel->prev_raw_counts, die, 0); + + prev_count_val = prev_count->val; + prev_count->val = count->val; + } + iostat_value = (count->val - prev_count_val) / + ((double) count->run / count->ena); + } + out->print_metric(config, out->ctx, NULL, "%8.0f", iostat_metric, + iostat_value / (256 * 1024)); +} + +void iostat_print_counters(struct evlist *evlist, + struct perf_stat_config *config, struct timespec *ts, + char *prefix, iostat_print_counter_t print_cnt_cb) +{ + void *perf_device = NULL; + struct evsel *counter = evlist__first(evlist); + + evlist__set_selected(evlist, counter); + iostat_prefix(evlist, config, prefix, ts); + fprintf(config->output, "%s", prefix); + evlist__for_each_entry(evlist, counter) { + perf_device = evlist->selected->priv; + if (perf_device && perf_device != counter->priv) { + evlist__set_selected(evlist, counter); + iostat_prefix(evlist, config, prefix, ts); + fprintf(config->output, "\n%s", prefix); + } + print_cnt_cb(config, counter, prefix); + } + fputc('\n', config->output); +} diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt index 825a12e8d694..4aa034aefa33 100644 --- a/tools/perf/command-list.txt +++ b/tools/perf/command-list.txt @@ -14,6 +14,7 @@ perf-config mainporcelain common perf-evlist mainporcelain common perf-ftrace mainporcelain common perf-inject mainporcelain common +perf-iostat mainporcelain common perf-kallsyms mainporcelain common perf-kmem mainporcelain common perf-kvm mainporcelain common diff --git a/tools/perf/perf-iostat.sh b/tools/perf/perf-iostat.sh new file mode 100644 index 000000000000..e562f252d56f --- /dev/null +++ b/tools/perf/perf-iostat.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# perf iostat +# Alexander Antonov + +if [[ "$1" == "list" ]] || [[ "$1" =~ ([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})(,)? ]]; then + DELIMITER="=" +else + DELIMITER=" " +fi + +perf stat --iostat$DELIMITER$* -- cgit v1.2.3 From 537f1e38f31a2d2b9941f16d6e2a9ab24cdab086 Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 19 Apr 2021 12:41:47 +0300 Subject: perf: Update .gitignore file After a "make -C tools/perf", git reports the following untracked file: perf-iostat Add this generated file to perf's .gitignore file. Signed-off-by: Alexander Antonov Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Alexey V Bayduraev Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210419094147.15909-5-alexander.antonov@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index f3f84781fd74..e555e9729758 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -20,6 +20,7 @@ perf.data.old output.svg perf-archive perf-with-kcore +perf-iostat tags TAGS cscope* -- cgit v1.2.3 From f89a82a82b20261e5778132f5237971991bad8e6 Mon Sep 17 00:00:00 2001 From: Martin Liška Date: Sun, 21 Feb 2021 13:46:36 +0100 Subject: perf annotate: Add line number like in TUI and source location at EOL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch changes the output format in 2 ways: - line number is displayed for all source lines (matching TUI mode) - source locations for the hottest lines are printed at the line end in order to preserve layout Before: 0.00 : 405ef1: inc %r15 : tmpsd * (TD + tmpsd * TDD))); 0.01 : 405ef4: vfmadd213sd 0x2b9b3(%rip),%xmm0,%xmm3 # 4318b0 <_IO_stdin_used+0x8b0> : tmpsd * (TC + eff.c:1811 0.67 : 405efd: vfmadd213sd 0x2b9b2(%rip),%xmm0,%xmm3 # 4318b8 <_IO_stdin_used+0x8b8> : TA + tmpsd * (TB + 0.35 : 405f06: vfmadd213sd 0x2b9b1(%rip),%xmm0,%xmm3 # 4318c0 <_IO_stdin_used+0x8c0> : dumbo = eff.c:1809 1.41 : 405f0f: vfmadd213sd 0x2b9b0(%rip),%xmm0,%xmm3 # 4318c8 <_IO_stdin_used+0x8c8> : sumi -= sj * tmpsd * dij2i * dumbo; eff.c:1813 2.58 : 405f18: vmulsd %xmm3,%xmm0,%xmm0 2.81 : 405f1c: vfnmadd213sd 0x30(%rsp),%xmm1,%xmm0 3.78 : 405f23: vmovsd %xmm0,0x30(%rsp) : for (k = 0; k < lpears[i] + upears[i]; k++) { eff.c:1761 0.90 : 405f29: cmp %r15d,%r12d After: 0.00 : 405ef1: inc %r15 : 1812 tmpsd * (TD + tmpsd * TDD))); 0.01 : 405ef4: vfmadd213sd 0x2b9b3(%rip),%xmm0,%xmm3 # 4318b0 <_IO_stdin_used+0x8b0> : 1811 tmpsd * (TC + 0.67 : 405efd: vfmadd213sd 0x2b9b2(%rip),%xmm0,%xmm3 # 4318b8 <_IO_stdin_used+0x8b8> // eff.c:1811 : 1810 TA + tmpsd * (TB + 0.35 : 405f06: vfmadd213sd 0x2b9b1(%rip),%xmm0,%xmm3 # 4318c0 <_IO_stdin_used+0x8c0> : 1809 dumbo = 1.41 : 405f0f: vfmadd213sd 0x2b9b0(%rip),%xmm0,%xmm3 # 4318c8 <_IO_stdin_used+0x8c8> // eff.c:1809 : 1813 sumi -= sj * tmpsd * dij2i * dumbo; 2.58 : 405f18: vmulsd %xmm3,%xmm0,%xmm0 // eff.c:1813 2.81 : 405f1c: vfnmadd213sd 0x30(%rsp),%xmm1,%xmm0 3.78 : 405f23: vmovsd %xmm0,0x30(%rsp) : 1761 for (k = 0; k < lpears[i] + upears[i]; k++) { Where e.g. '// eff.c:1811' shares the same color as the percentantage at the line beginning. Signed-off-by: Martin Liška Link: http://lore.kernel.org/lkml/a0d53f31-f633-5013-c386-a4452391b081@suse.cz Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 18eee25b4976..abe1499a9164 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1368,7 +1368,6 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start { struct disasm_line *dl = container_of(al, struct disasm_line, al); static const char *prev_line; - static const char *prev_color; if (al->offset != -1) { double max_percent = 0.0; @@ -1407,20 +1406,6 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start color = get_percent_color(max_percent); - /* - * Also color the filename and line if needed, with - * the same color than the percentage. Don't print it - * twice for close colored addr with the same filename:line - */ - if (al->path) { - if (!prev_line || strcmp(prev_line, al->path) - || color != prev_color) { - color_fprintf(stdout, color, " %s", al->path); - prev_line = al->path; - prev_color = color; - } - } - for (i = 0; i < nr_percent; i++) { struct annotation_data *data = &al->data[i]; double percent; @@ -1441,6 +1426,19 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start printf(" : "); disasm_line__print(dl, start, addr_fmt_width); + + /* + * Also color the filename and line if needed, with + * the same color than the percentage. Don't print it + * twice for close colored addr with the same filename:line + */ + if (al->path) { + if (!prev_line || strcmp(prev_line, al->path)) { + color_fprintf(stdout, color, " // %s", al->path); + prev_line = al->path; + } + } + printf("\n"); } else if (max_lines && printed >= max_lines) return 1; @@ -1456,7 +1454,7 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start if (!*al->line) printf(" %*s:\n", width, " "); else - printf(" %*s: %*s %s\n", width, " ", addr_fmt_width, " ", al->line); + printf(" %*s: %-*d %s\n", width, " ", addr_fmt_width, al->line_nr, al->line); } return 0; -- cgit v1.2.3 From b96da02bd6b8d7d81b345c94d3d76d8733f5ef60 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 16 Apr 2021 14:41:13 -0700 Subject: perf arm64: Fix off-by-one directory paths. Relative path include works in the regular build due to -I paths but may break in other situations. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Stephane Eranian Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: http://lore.kernel.org/lkml/20210416214113.552252-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/kvm-stat.c | 4 ++-- tools/perf/arch/arm64/util/pmu.c | 4 ++-- tools/perf/arch/arm64/util/unwind-libunwind.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c index 50376b9062c1..2303256b7d05 100644 --- a/tools/perf/arch/arm64/util/kvm-stat.c +++ b/tools/perf/arch/arm64/util/kvm-stat.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include "../../util/evsel.h" -#include "../../util/kvm-stat.h" +#include "../../../util/evsel.h" +#include "../../../util/kvm-stat.h" #include "arm64_exception_types.h" #include "debug.h" diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c index d3259d61ca75..2234fbd0a912 100644 --- a/tools/perf/arch/arm64/util/pmu.c +++ b/tools/perf/arch/arm64/util/pmu.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include "../../util/cpumap.h" -#include "../../util/pmu.h" +#include "../../../util/cpumap.h" +#include "../../../util/pmu.h" struct pmu_events_map *pmu_events_map__find(void) { diff --git a/tools/perf/arch/arm64/util/unwind-libunwind.c b/tools/perf/arch/arm64/util/unwind-libunwind.c index 1495a9523a23..5aecf88e3de6 100644 --- a/tools/perf/arch/arm64/util/unwind-libunwind.c +++ b/tools/perf/arch/arm64/util/unwind-libunwind.c @@ -4,9 +4,9 @@ #ifndef REMOTE_UNWIND_LIBUNWIND #include #include "perf_regs.h" -#include "../../util/unwind.h" +#include "../../../util/unwind.h" #endif -#include "../../util/debug.h" +#include "../../../util/debug.h" int LIBUNWIND__ARCH_REG_ID(int regnum) { -- cgit v1.2.3 From 59a1a843b028c88b2ed33a459ff2767c737d8d69 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 15 Apr 2021 16:34:16 +0800 Subject: perf data: Fix error return code in perf_data__create_dir() Although 'ret' has been initialized to -1, but it will be reassigned by the "ret = open(...)" statement in the for loop. So that, the value of 'ret' is unknown when asprintf() failed. Reported-by: Hulk Robot Signed-off-by: Zhen Lei Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210415083417.3740-1-thunder.leizhen@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index f29af4fc3d09..8fca4779ae6a 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -35,7 +35,7 @@ void perf_data__close_dir(struct perf_data *data) int perf_data__create_dir(struct perf_data *data, int nr) { struct perf_data_file *files = NULL; - int i, ret = -1; + int i, ret; if (WARN_ON(!data->is_dir)) return -EINVAL; @@ -51,7 +51,8 @@ int perf_data__create_dir(struct perf_data *data, int nr) for (i = 0; i < nr; i++) { struct perf_data_file *file = &files[i]; - if (asprintf(&file->path, "%s/data.%d", data->path, i) < 0) + ret = asprintf(&file->path, "%s/data.%d", data->path, i); + if (ret < 0) goto out_err; ret = open(file->path, O_RDWR|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR); -- cgit v1.2.3 From bb7db8699b6cd877c766ce69f3b44ab0830d85a4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 15 Apr 2021 21:15:34 -0300 Subject: perf tools: Add a build-test variant to use in builds from a tarball To use in automated tests inside containers from a tarball generated by 'make perf-tar-src-pkg*', where testing building from a tarball is obviously not needed, so add a 'build-test-tarball' for that case. And don't build with gtk2 as this complicates things for cross builds where we don't always have all the libraries a full perf build requires available for the target arch, ditto for static builds. Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 5 ++++- tools/perf/tests/make | 22 ++++++++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/Makefile b/tools/perf/Makefile index b8fc7d972be9..f3fe360a35c6 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -100,7 +100,10 @@ clean: # make -C tools/perf -f tests/make # build-test: - @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory tarpkg out + @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory tarpkg make_static make_with_gtk2 out + +build-test-tarball: + @$(MAKE) -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory out # # All other targets get passed through: diff --git a/tools/perf/tests/make b/tools/perf/tests/make index a90fa043c066..94bd5d215d94 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -155,7 +155,6 @@ run += make_no_syscall_tbl run += make_with_babeltrace run += make_with_clangllvm run += make_with_libpfm4 -run += make_with_gtk2 run += make_help run += make_doc run += make_perf_o @@ -172,7 +171,6 @@ run += make_install_prefix_slash # run += make_install_info # run += make_install_pdf run += make_minimal -run += make_static ifneq ($(call has,ctags),) run += make_tags @@ -307,6 +305,26 @@ $(run): $(call test,$@) && \ rm -rf $@ $$TMP_DEST || (cat $@ ; false) +make_with_gtk2: + $(call clean) + @TMP_DEST=$$(mktemp -d); \ + cmd="cd $(PERF) && $(MAKE_F) $($@) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST"; \ + printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \ + ( eval $$cmd ) >> $@ 2>&1; \ + echo " test: $(call test,$@)" >> $@ 2>&1; \ + $(call test,$@) && \ + rm -rf $@ $$TMP_DEST || (cat $@ ; false) + +make_static: + $(call clean) + @TMP_DEST=$$(mktemp -d); \ + cmd="cd $(PERF) && $(MAKE_F) $($@) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST"; \ + printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \ + ( eval $$cmd ) >> $@ 2>&1; \ + echo " test: $(call test,$@)" >> $@ 2>&1; \ + $(call test,$@) && \ + rm -rf $@ $$TMP_DEST || (cat $@ ; false) + $(run_O): $(call clean) @TMP_O=$$(mktemp -d); \ -- cgit v1.2.3 From d044d9fc1380b66917dcb418ef4ec7e59dd6e597 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 20 Apr 2021 15:24:28 +0200 Subject: selftests/bpf: Add docs target as all dependency Currently docs target is make dependency for TEST_GEN_FILES, which makes tests to be rebuilt every time you run make. Adding docs as all target dependency, so when running make on top of built selftests it will show just: $ make make[1]: Nothing to be done for 'docs'. After cleaning docs, only docs is rebuilt: $ make docs-clean CLEAN eBPF_helpers-manpage CLEAN eBPF_syscall-manpage $ make GEN ...selftests/bpf/bpf-helpers.rst GEN ...selftests/bpf/bpf-helpers.7 GEN ...selftests/bpf/bpf-syscall.rst GEN ...selftests/bpf/bpf-syscall.2 $ make make[1]: Nothing to be done for 'docs'. Fixes: a01d935b2e09 ("tools/bpf: Remove bpf-helpers from bpftool docs") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210420132428.15710-1-jolsa@kernel.org --- tools/testing/selftests/bpf/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c45ae13b88a0..c5bcdb3d4b12 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -187,7 +187,6 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) cp $(SCRATCH_DIR)/runqslower $@ $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) -$(TEST_GEN_FILES): docs $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c @@ -210,6 +209,8 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install +all: docs + docs: $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ -f Makefile.docs \ -- cgit v1.2.3 From 0a4d0cb1a326cf0070a625036e19871f544f2d25 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 20 Apr 2021 16:53:48 +0200 Subject: selftests: mlxsw: sch_red_ets: Test proper counter cleaning in ETS There was a bug introduced during the rework which cause non-zero backlog being stuck at ETS. Introduce a selftest that would have caught the issue earlier. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh index 3f007c5f8361..f3ef3274f9b3 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh @@ -67,6 +67,13 @@ red_test() { install_qdisc + # Make sure that we get the non-zero value if there is any. + local cur=$(busywait 1100 until_counter_is "> 0" \ + qdisc_stats_get $swp3 10: .backlog) + (( cur == 0 )) + check_err $? "backlog of $cur observed on non-busy qdisc" + log_test "$QDISC backlog properly cleaned" + do_red_test 10 $BACKLOG1 do_red_test 11 $BACKLOG2 -- cgit v1.2.3 From f56607e85ee38f2a5bb7096e24e2d40f35d714f9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 31 Mar 2021 13:59:17 +0000 Subject: selftests/timens: Fix gettime_perf to work on powerpc On powerpc: - VDSO library is named linux-vdso32.so.1 or linux-vdso64.so.1 - clock_gettime is named __kernel_clock_gettime() Ensure gettime_perf tries these names before giving up. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/469f37ab91984309eb68c0fb47e8438cdf5b6463.1617198956.git.christophe.leroy@csgroup.eu --- tools/testing/selftests/timens/gettime_perf.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/timens/gettime_perf.c b/tools/testing/selftests/timens/gettime_perf.c index 7bf841a3967b..6b13dc277724 100644 --- a/tools/testing/selftests/timens/gettime_perf.c +++ b/tools/testing/selftests/timens/gettime_perf.c @@ -25,12 +25,20 @@ static void fill_function_pointers(void) if (!vdso) vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-vdso32.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-vdso64.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); if (!vdso) { pr_err("[WARN]\tfailed to find vDSO\n"); return; } vdso_clock_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); + if (!vdso_clock_gettime) + vdso_clock_gettime = (vgettime_t)dlsym(vdso, "__kernel_clock_gettime"); if (!vdso_clock_gettime) pr_err("Warning: failed to find clock_gettime in vDSO\n"); -- cgit v1.2.3 From 016ff1a442d9a8f36dcb3beca0bcdfc35e281e18 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Sat, 17 Apr 2021 10:36:01 -0400 Subject: KVM: selftests: Sync data verify of dirty logging with guest sync This fixes a bug that can trigger with e.g. "taskset -c 0 ./dirty_log_test" or when the testing host is very busy. A similar previous attempt is done [1] but that is not enough, the reason is stated in the reply [2]. As a summary (partly quotting from [2]): The problem is I think one guest memory write operation (of this specific test) contains a few micro-steps when page is during kvm dirty tracking (here I'm only considering write-protect rather than pml but pml should be similar at least when the log buffer is full): (1) Guest read 'iteration' number into register, prepare to write, page fault (2) Set dirty bit in either dirty bitmap or dirty ring (3) Return to guest, data written When we verify the data, we assumed that all these steps are "atomic", say, when (1) happened for this page, we assume (2) & (3) must have happened. We had some trick to workaround "un-atomicity" of above three steps, as previous version of this patch wanted to fix atomicity of step (2)+(3) by explicitly letting the main thread wait for at least one vmenter of vcpu thread, which should work. However what I overlooked is probably that we still have race when (1) and (2) can be interrupted. One example calltrace when it could happen that we read an old interation, got interrupted before even setting the dirty bit and flushing data: __schedule+1742 __cond_resched+52 __get_user_pages+530 get_user_pages_unlocked+197 hva_to_pfn+206 try_async_pf+132 direct_page_fault+320 kvm_mmu_page_fault+103 vmx_handle_exit+288 vcpu_enter_guest+2460 kvm_arch_vcpu_ioctl_run+325 kvm_vcpu_ioctl+526 __x64_sys_ioctl+131 do_syscall_64+51 entry_SYSCALL_64_after_hwframe+68 It means iteration number cached in vcpu register can be very old when dirty bit set and data flushed. So far I don't see an easy way to guarantee all steps 1-3 atomicity but to sync at the GUEST_SYNC() point of guest code when we do verification of the dirty bits as what this patch does. [1] https://lore.kernel.org/lkml/20210413213641.23742-1-peterx@redhat.com/ [2] https://lore.kernel.org/lkml/20210417140956.GV4440@xz-x1/ Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Andrew Jones Cc: stable@vger.kernel.org Signed-off-by: Peter Xu Message-Id: <20210417143602.215059-2-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 60 +++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index bb2752d78fe3..ffa4e2791926 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "kvm_util.h" #include "test_util.h" @@ -137,12 +138,20 @@ static uint64_t host_clear_count; static uint64_t host_track_next_count; /* Whether dirty ring reset is requested, or finished */ -static sem_t dirty_ring_vcpu_stop; -static sem_t dirty_ring_vcpu_cont; +static sem_t sem_vcpu_stop; +static sem_t sem_vcpu_cont; +/* + * This is only set by main thread, and only cleared by vcpu thread. It is + * used to request vcpu thread to stop at the next GUEST_SYNC, since GUEST_SYNC + * is the only place that we'll guarantee both "dirty bit" and "dirty data" + * will match. E.g., SIG_IPI won't guarantee that if the vcpu is interrupted + * after setting dirty bit but before the data is written. + */ +static atomic_t vcpu_sync_stop_requested; /* * This is updated by the vcpu thread to tell the host whether it's a * ring-full event. It should only be read until a sem_wait() of - * dirty_ring_vcpu_stop and before vcpu continues to run. + * sem_vcpu_stop and before vcpu continues to run. */ static bool dirty_ring_vcpu_ring_full; /* @@ -234,6 +243,17 @@ static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot, kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages); } +/* Should only be called after a GUEST_SYNC */ +static void vcpu_handle_sync_stop(void) +{ + if (atomic_read(&vcpu_sync_stop_requested)) { + /* It means main thread is sleeping waiting */ + atomic_set(&vcpu_sync_stop_requested, false); + sem_post(&sem_vcpu_stop); + sem_wait_until(&sem_vcpu_cont); + } +} + static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err) { struct kvm_run *run = vcpu_state(vm, VCPU_ID); @@ -244,6 +264,8 @@ static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err) TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC, "Invalid guest sync status: exit_reason=%s\n", exit_reason_str(run->exit_reason)); + + vcpu_handle_sync_stop(); } static bool dirty_ring_supported(void) @@ -301,13 +323,13 @@ static void dirty_ring_wait_vcpu(void) { /* This makes sure that hardware PML cache flushed */ vcpu_kick(); - sem_wait_until(&dirty_ring_vcpu_stop); + sem_wait_until(&sem_vcpu_stop); } static void dirty_ring_continue_vcpu(void) { pr_info("Notifying vcpu to continue\n"); - sem_post(&dirty_ring_vcpu_cont); + sem_post(&sem_vcpu_cont); } static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot, @@ -361,11 +383,11 @@ static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err) /* Update the flag first before pause */ WRITE_ONCE(dirty_ring_vcpu_ring_full, run->exit_reason == KVM_EXIT_DIRTY_RING_FULL); - sem_post(&dirty_ring_vcpu_stop); + sem_post(&sem_vcpu_stop); pr_info("vcpu stops because %s...\n", dirty_ring_vcpu_ring_full ? "dirty ring is full" : "vcpu is kicked out"); - sem_wait_until(&dirty_ring_vcpu_cont); + sem_wait_until(&sem_vcpu_cont); pr_info("vcpu continues now.\n"); } else { TEST_ASSERT(false, "Invalid guest sync status: " @@ -377,7 +399,7 @@ static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err) static void dirty_ring_before_vcpu_join(void) { /* Kick another round of vcpu just to make sure it will quit */ - sem_post(&dirty_ring_vcpu_cont); + sem_post(&sem_vcpu_cont); } struct log_mode { @@ -768,7 +790,25 @@ static void run_test(enum vm_guest_mode mode, void *arg) usleep(p->interval * 1000); log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX, bmap, host_num_pages); + + /* + * See vcpu_sync_stop_requested definition for details on why + * we need to stop vcpu when verify data. + */ + atomic_set(&vcpu_sync_stop_requested, true); + sem_wait_until(&sem_vcpu_stop); + /* + * NOTE: for dirty ring, it's possible that we didn't stop at + * GUEST_SYNC but instead we stopped because ring is full; + * that's okay too because ring full means we're only missing + * the flush of the last page, and since we handle the last + * page specially verification will succeed anyway. + */ + assert(host_log_mode == LOG_MODE_DIRTY_RING || + atomic_read(&vcpu_sync_stop_requested) == false); vm_dirty_log_verify(mode, bmap); + sem_post(&sem_vcpu_cont); + iteration++; sync_global_to_guest(vm, iteration); } @@ -819,8 +859,8 @@ int main(int argc, char *argv[]) }; int opt, i; - sem_init(&dirty_ring_vcpu_stop, 0, 0); - sem_init(&dirty_ring_vcpu_cont, 0, 0); + sem_init(&sem_vcpu_stop, 0, 0); + sem_init(&sem_vcpu_cont, 0, 0); guest_modes_append_default(); -- cgit v1.2.3 From bf1e15a82e3b74ee86bb119d6038b41e1ed2b319 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 20 Apr 2021 04:13:03 -0400 Subject: KVM: selftests: Always run vCPU thread with blocked SIG_IPI The main thread could start to send SIG_IPI at any time, even before signal blocked on vcpu thread. Therefore, start the vcpu thread with the signal blocked. Without this patch, on very busy cores the dirty_log_test could fail directly on receiving a SIGUSR1 without a handler (when vcpu runs far slower than main). Reported-by: Peter Xu Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index ffa4e2791926..81edbd23d371 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -527,9 +527,8 @@ static void *vcpu_worker(void *data) */ sigmask->len = 8; pthread_sigmask(0, NULL, sigset); + sigdelset(sigset, SIG_IPI); vcpu_ioctl(vm, VCPU_ID, KVM_SET_SIGNAL_MASK, sigmask); - sigaddset(sigset, SIG_IPI); - pthread_sigmask(SIG_BLOCK, sigset, NULL); sigemptyset(sigset); sigaddset(sigset, SIG_IPI); @@ -858,6 +857,7 @@ int main(int argc, char *argv[]) .interval = TEST_HOST_LOOP_INTERVAL, }; int opt, i; + sigset_t sigset; sem_init(&sem_vcpu_stop, 0, 0); sem_init(&sem_vcpu_cont, 0, 0); @@ -916,6 +916,11 @@ int main(int argc, char *argv[]) srandom(time(0)); + /* Ensure that vCPU threads start with SIG_IPI blocked. */ + sigemptyset(&sigset); + sigaddset(&sigset, SIG_IPI); + pthread_sigmask(SIG_BLOCK, &sigset, NULL); + if (host_log_mode_option == LOG_MODE_ALL) { /* Run each log mode */ for (i = 0; i < LOG_MODE_NUM; i++) { -- cgit v1.2.3 From bc2e9578baed90f36abe6bb922b9598a327b0555 Mon Sep 17 00:00:00 2001 From: Quanyang Wang Date: Thu, 22 Apr 2021 18:26:04 +0800 Subject: spi: tools: make a symbolic link to the header file spi.h The header file spi.h in include/uapi/linux/spi is needed for spidev.h, so we also need make a symbolic link to it to eliminate the error message as below: In file included from spidev_test.c:24: include/linux/spi/spidev.h:28:10: fatal error: linux/spi/spi.h: No such file or directory 28 | #include | ^~~~~~~~~~~~~~~~~ compilation terminated. Fixes: f7005142dace ("spi: uapi: unify SPI modes into a single spi.h") Signed-off-by: Quanyang Wang Link: https://lore.kernel.org/r/20210422102604.3034217-1-quanyang.wang@windriver.com Signed-off-by: Mark Brown --- tools/spi/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/spi/Makefile b/tools/spi/Makefile index ada881afb489..0aa6dbd31fb8 100644 --- a/tools/spi/Makefile +++ b/tools/spi/Makefile @@ -25,11 +25,12 @@ include $(srctree)/tools/build/Makefile.include # # We need the following to be outside of kernel tree # -$(OUTPUT)include/linux/spi/spidev.h: ../../include/uapi/linux/spi/spidev.h +$(OUTPUT)include/linux/spi: ../../include/uapi/linux/spi mkdir -p $(OUTPUT)include/linux/spi 2>&1 || true ln -sf $(CURDIR)/../../include/uapi/linux/spi/spidev.h $@ + ln -sf $(CURDIR)/../../include/uapi/linux/spi/spi.h $@ -prepare: $(OUTPUT)include/linux/spi/spidev.h +prepare: $(OUTPUT)include/linux/spi # # spidev_test -- cgit v1.2.3 From da650ada100956b0f00aa4fe9ce33103378ce9ca Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 25 Feb 2021 17:19:49 +1100 Subject: selftests/powerpc: Add uaccess flush test Also based on the RFI and entry flush tests, it counts the L1D misses by doing a syscall that does user access: uname, in this case. Signed-off-by: Thadeu Lima de Souza Cascardo [dja: forward port, rename function] Signed-off-by: Daniel Axtens Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210225061949.1213404-1-dja@axtens.net --- tools/testing/selftests/powerpc/security/Makefile | 3 +- .../selftests/powerpc/security/flush_utils.c | 13 ++ .../selftests/powerpc/security/flush_utils.h | 3 + .../selftests/powerpc/security/uaccess_flush.c | 158 +++++++++++++++++++++ 4 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/security/uaccess_flush.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile index f25e854fe370..844d18cd5f93 100644 --- a/tools/testing/selftests/powerpc/security/Makefile +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0+ -TEST_GEN_PROGS := rfi_flush entry_flush spectre_v2 +TEST_GEN_PROGS := rfi_flush entry_flush uaccess_flush spectre_v2 top_srcdir = ../../../../.. CFLAGS += -I../../../../../usr/include @@ -13,3 +13,4 @@ $(OUTPUT)/spectre_v2: CFLAGS += -m64 $(OUTPUT)/spectre_v2: ../pmu/event.c branch_loops.S $(OUTPUT)/rfi_flush: flush_utils.c $(OUTPUT)/entry_flush: flush_utils.c +$(OUTPUT)/uaccess_flush: flush_utils.c diff --git a/tools/testing/selftests/powerpc/security/flush_utils.c b/tools/testing/selftests/powerpc/security/flush_utils.c index 0c3c4c40c7fb..4d95965cb751 100644 --- a/tools/testing/selftests/powerpc/security/flush_utils.c +++ b/tools/testing/selftests/powerpc/security/flush_utils.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "utils.h" #include "flush_utils.h" @@ -35,6 +36,18 @@ void syscall_loop(char *p, unsigned long iterations, } } +void syscall_loop_uaccess(char *p, unsigned long iterations, + unsigned long zero_size) +{ + struct utsname utsname; + + for (unsigned long i = 0; i < iterations; i++) { + for (unsigned long j = 0; j < zero_size; j += CACHELINE_SIZE) + load(p + j); + uname(&utsname); + } +} + static void sigill_handler(int signr, siginfo_t *info, void *unused) { static int warned; diff --git a/tools/testing/selftests/powerpc/security/flush_utils.h b/tools/testing/selftests/powerpc/security/flush_utils.h index 7a3d60292916..e1e68281f7ac 100644 --- a/tools/testing/selftests/powerpc/security/flush_utils.h +++ b/tools/testing/selftests/powerpc/security/flush_utils.h @@ -16,6 +16,9 @@ void syscall_loop(char *p, unsigned long iterations, unsigned long zero_size); +void syscall_loop_uaccess(char *p, unsigned long iterations, + unsigned long zero_size); + void set_dscr(unsigned long val); #endif /* _SELFTESTS_POWERPC_SECURITY_FLUSH_UTILS_H */ diff --git a/tools/testing/selftests/powerpc/security/uaccess_flush.c b/tools/testing/selftests/powerpc/security/uaccess_flush.c new file mode 100644 index 000000000000..cf80f960e38a --- /dev/null +++ b/tools/testing/selftests/powerpc/security/uaccess_flush.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0+ + +/* + * Copyright 2018 IBM Corporation. + * Copyright 2020 Canonical Ltd. + */ + +#define __SANE_USERSPACE_TYPES__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "flush_utils.h" + +int uaccess_flush_test(void) +{ + char *p; + int repetitions = 10; + int fd, passes = 0, iter, rc = 0; + struct perf_event_read v; + __u64 l1d_misses_total = 0; + unsigned long iterations = 100000, zero_size = 24 * 1024; + unsigned long l1d_misses_expected; + int rfi_flush_orig; + int entry_flush_orig; + int uaccess_flush, uaccess_flush_orig; + + SKIP_IF(geteuid() != 0); + + // The PMU event we use only works on Power7 or later + SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06)); + + if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_orig) < 0) { + perror("Unable to read powerpc/rfi_flush debugfs file"); + SKIP_IF(1); + } + + if (read_debugfs_file("powerpc/entry_flush", &entry_flush_orig) < 0) { + perror("Unable to read powerpc/entry_flush debugfs file"); + SKIP_IF(1); + } + + if (read_debugfs_file("powerpc/uaccess_flush", &uaccess_flush_orig) < 0) { + perror("Unable to read powerpc/entry_flush debugfs file"); + SKIP_IF(1); + } + + if (rfi_flush_orig != 0) { + if (write_debugfs_file("powerpc/rfi_flush", 0) < 0) { + perror("error writing to powerpc/rfi_flush debugfs file"); + FAIL_IF(1); + } + } + + if (entry_flush_orig != 0) { + if (write_debugfs_file("powerpc/entry_flush", 0) < 0) { + perror("error writing to powerpc/entry_flush debugfs file"); + FAIL_IF(1); + } + } + + uaccess_flush = uaccess_flush_orig; + + fd = perf_event_open_counter(PERF_TYPE_HW_CACHE, PERF_L1D_READ_MISS_CONFIG, -1); + FAIL_IF(fd < 0); + + p = (char *)memalign(zero_size, CACHELINE_SIZE); + + FAIL_IF(perf_event_enable(fd)); + + // disable L1 prefetching + set_dscr(1); + + iter = repetitions; + + /* + * We expect to see l1d miss for each cacheline access when entry_flush + * is set. Allow a small variation on this. + */ + l1d_misses_expected = iterations * (zero_size / CACHELINE_SIZE - 2); + +again: + FAIL_IF(perf_event_reset(fd)); + + syscall_loop_uaccess(p, iterations, zero_size); + + FAIL_IF(read(fd, &v, sizeof(v)) != sizeof(v)); + + if (uaccess_flush && v.l1d_misses >= l1d_misses_expected) + passes++; + else if (!uaccess_flush && v.l1d_misses < (l1d_misses_expected / 2)) + passes++; + + l1d_misses_total += v.l1d_misses; + + while (--iter) + goto again; + + if (passes < repetitions) { + printf("FAIL (L1D misses with uaccess_flush=%d: %llu %c %lu) [%d/%d failures]\n", + uaccess_flush, l1d_misses_total, uaccess_flush ? '<' : '>', + uaccess_flush ? repetitions * l1d_misses_expected : + repetitions * l1d_misses_expected / 2, + repetitions - passes, repetitions); + rc = 1; + } else { + printf("PASS (L1D misses with uaccess_flush=%d: %llu %c %lu) [%d/%d pass]\n", + uaccess_flush, l1d_misses_total, uaccess_flush ? '>' : '<', + uaccess_flush ? repetitions * l1d_misses_expected : + repetitions * l1d_misses_expected / 2, + passes, repetitions); + } + + if (uaccess_flush == uaccess_flush_orig) { + uaccess_flush = !uaccess_flush_orig; + if (write_debugfs_file("powerpc/uaccess_flush", uaccess_flush) < 0) { + perror("error writing to powerpc/uaccess_flush debugfs file"); + return 1; + } + iter = repetitions; + l1d_misses_total = 0; + passes = 0; + goto again; + } + + perf_event_disable(fd); + close(fd); + + set_dscr(0); + + if (write_debugfs_file("powerpc/rfi_flush", rfi_flush_orig) < 0) { + perror("unable to restore original value of powerpc/rfi_flush debugfs file"); + return 1; + } + + if (write_debugfs_file("powerpc/entry_flush", entry_flush_orig) < 0) { + perror("unable to restore original value of powerpc/entry_flush debugfs file"); + return 1; + } + + if (write_debugfs_file("powerpc/uaccess_flush", uaccess_flush_orig) < 0) { + perror("unable to restore original value of powerpc/uaccess_flush debugfs file"); + return 1; + } + + return rc; +} + +int main(int argc, char *argv[]) +{ + return test_harness(uaccess_flush_test, "uaccess_flush_test"); +} -- cgit v1.2.3 From dae4ff8031b49af4721101d6298fc14cb9c16a4c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 12 Apr 2021 16:52:15 +0530 Subject: powerpc/selftests/ptrace-hwbreak: Add testcases for 2nd DAWR Message-ID: <20210412112218.128183-2-ravi.bangoria@linux.ibm.com> (raw) Add selftests to test multiple active DAWRs with ptrace interface. Sample o/p: $ ./ptrace-hwbreak ... PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO, len: 6: Ok PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO, len: 6: Ok PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO, len: 6: Ok PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO, len: 6: Ok Signed-off-by: Ravi Bangoria Reviewed-by: Daniel Axtens [mpe: Fix build on older distros] Signed-off-by: Michael Ellerman --- .../selftests/powerpc/ptrace/perf-hwbreak.c | 4 ++ .../selftests/powerpc/ptrace/ptrace-hwbreak.c | 79 ++++++++++++++++++++++ 2 files changed, 83 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c index c1f324afdbf3..9ccf252d4b13 100644 --- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c @@ -30,6 +30,10 @@ #include #include "utils.h" +#ifndef PPC_DEBUG_FEATURE_DATA_BP_ARCH_31 +#define PPC_DEBUG_FEATURE_DATA_BP_ARCH_31 0x20 +#endif + #define MAX_LOOPS 10000 #define DAWR_LENGTH_MAX ((0x3f + 1) * 8) diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c index 2e0d86e0687e..a0635a3819aa 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c @@ -194,6 +194,18 @@ static void test_workload(void) big_var[rand() % DAWR_MAX_LEN] = 'a'; else cvar = big_var[rand() % DAWR_MAX_LEN]; + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO test */ + gstruct.a[rand() % A_LEN] = 'a'; + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO test */ + cvar = gstruct.b[rand() % B_LEN]; + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO test */ + gstruct.a[rand() % A_LEN] = 'a'; + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO test */ + cvar = gstruct.a[rand() % A_LEN]; } static void check_success(pid_t child_pid, const char *name, const char *type, @@ -417,6 +429,69 @@ static void test_sethwdebug_range_aligned(pid_t child_pid) ptrace_delhwdebug(child_pid, wh); } +static void test_multi_sethwdebug_range(pid_t child_pid) +{ + struct ppc_hw_breakpoint info1, info2; + unsigned long wp_addr1, wp_addr2; + char *name1 = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED"; + char *name2 = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED"; + int len1, len2; + int wh1, wh2; + + wp_addr1 = (unsigned long)&gstruct.a; + wp_addr2 = (unsigned long)&gstruct.b; + len1 = A_LEN; + len2 = B_LEN; + get_ppc_hw_breakpoint(&info1, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr1, len1); + get_ppc_hw_breakpoint(&info2, PPC_BREAKPOINT_TRIGGER_READ, wp_addr2, len2); + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO test */ + wh1 = ptrace_sethwdebug(child_pid, &info1); + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO test */ + wh2 = ptrace_sethwdebug(child_pid, &info2); + + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success(child_pid, name1, "WO", wp_addr1, len1); + + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success(child_pid, name2, "RO", wp_addr2, len2); + + ptrace_delhwdebug(child_pid, wh1); + ptrace_delhwdebug(child_pid, wh2); +} + +static void test_multi_sethwdebug_range_dawr_overlap(pid_t child_pid) +{ + struct ppc_hw_breakpoint info1, info2; + unsigned long wp_addr1, wp_addr2; + char *name = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap"; + int len1, len2; + int wh1, wh2; + + wp_addr1 = (unsigned long)&gstruct.a; + wp_addr2 = (unsigned long)&gstruct.a; + len1 = A_LEN; + len2 = A_LEN; + get_ppc_hw_breakpoint(&info1, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr1, len1); + get_ppc_hw_breakpoint(&info2, PPC_BREAKPOINT_TRIGGER_READ, wp_addr2, len2); + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO test */ + wh1 = ptrace_sethwdebug(child_pid, &info1); + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO test */ + wh2 = ptrace_sethwdebug(child_pid, &info2); + + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success(child_pid, name, "WO", wp_addr1, len1); + + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success(child_pid, name, "RO", wp_addr2, len2); + + ptrace_delhwdebug(child_pid, wh1); + ptrace_delhwdebug(child_pid, wh2); +} + static void test_sethwdebug_range_unaligned(pid_t child_pid) { struct ppc_hw_breakpoint info; @@ -504,6 +579,10 @@ run_tests(pid_t child_pid, struct ppc_debug_info *dbginfo, bool dawr) test_sethwdebug_range_unaligned(child_pid); test_sethwdebug_range_unaligned_dar(child_pid); test_sethwdebug_dawr_max_range(child_pid); + if (dbginfo->num_data_bps > 1) { + test_multi_sethwdebug_range(child_pid); + test_multi_sethwdebug_range_dawr_overlap(child_pid); + } } } } -- cgit v1.2.3 From c9cb0afb4eaa03801322f48dad4093979ff45e88 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 12 Apr 2021 16:52:16 +0530 Subject: powerpc/selftests/perf-hwbreak: Coalesce event creation code perf-hwbreak selftest opens hw-breakpoint event at multiple places for which it has same code repeated. Coalesce that code into a function. Signed-off-by: Ravi Bangoria Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210412112218.128183-3-ravi.bangoria@linux.ibm.com --- .../selftests/powerpc/ptrace/perf-hwbreak.c | 79 +++++++++++----------- 1 file changed, 39 insertions(+), 40 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c index 9ccf252d4b13..8c54adaa40b1 100644 --- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c @@ -38,28 +38,46 @@ #define DAWR_LENGTH_MAX ((0x3f + 1) * 8) -static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid, - int cpu, int group_fd, - unsigned long flags) +static void perf_event_attr_set(struct perf_event_attr *attr, + __u32 type, __u64 addr, __u64 len, + bool exclude_user) { - attr->size = sizeof(*attr); - return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); + memset(attr, 0, sizeof(struct perf_event_attr)); + attr->type = PERF_TYPE_BREAKPOINT; + attr->size = sizeof(struct perf_event_attr); + attr->bp_type = type; + attr->bp_addr = addr; + attr->bp_len = len; + attr->exclude_kernel = 1; + attr->exclude_hv = 1; + attr->exclude_guest = 1; + attr->exclude_user = exclude_user; + attr->disabled = 1; } -static inline bool breakpoint_test(int len) +static int +perf_process_event_open_exclude_user(__u32 type, __u64 addr, __u64 len, bool exclude_user) { struct perf_event_attr attr; + + perf_event_attr_set(&attr, type, addr, len, exclude_user); + return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0); +} + +static int perf_process_event_open(__u32 type, __u64 addr, __u64 len) +{ + struct perf_event_attr attr; + + perf_event_attr_set(&attr, type, addr, len, 0); + return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0); +} + +static inline bool breakpoint_test(int len) +{ int fd; - /* setup counters */ - memset(&attr, 0, sizeof(attr)); - attr.disabled = 1; - attr.type = PERF_TYPE_BREAKPOINT; - attr.bp_type = HW_BREAKPOINT_R; /* bp_addr can point anywhere but needs to be aligned */ - attr.bp_addr = (__u64)(&attr) & 0xfffffffffffff800; - attr.bp_len = len; - fd = sys_perf_event_open(&attr, 0, -1, -1, 0); + fd = perf_process_event_open(HW_BREAKPOINT_R, (__u64)(&fd) & 0xfffffffffffff800, len); if (fd < 0) return false; close(fd); @@ -79,7 +97,6 @@ static inline bool dawr_supported(void) static int runtestsingle(int readwriteflag, int exclude_user, int arraytest) { int i,j; - struct perf_event_attr attr; size_t res; unsigned long long breaks, needed; int readint; @@ -89,6 +106,7 @@ static int runtestsingle(int readwriteflag, int exclude_user, int arraytest) int break_fd; int loop_num = MAX_LOOPS - (rand() % 100); /* provide some variability */ volatile int *k; + __u64 len; /* align to 0x400 boundary as required by DAWR */ readintalign = (int *)(((unsigned long)readintarraybig + 0x7ff) & @@ -98,19 +116,11 @@ static int runtestsingle(int readwriteflag, int exclude_user, int arraytest) if (arraytest) ptr = &readintalign[0]; - /* setup counters */ - memset(&attr, 0, sizeof(attr)); - attr.disabled = 1; - attr.type = PERF_TYPE_BREAKPOINT; - attr.bp_type = readwriteflag; - attr.bp_addr = (__u64)ptr; - attr.bp_len = sizeof(int); - if (arraytest) - attr.bp_len = DAWR_LENGTH_MAX; - attr.exclude_user = exclude_user; - break_fd = sys_perf_event_open(&attr, 0, -1, -1, 0); + len = arraytest ? DAWR_LENGTH_MAX : sizeof(int); + break_fd = perf_process_event_open_exclude_user(readwriteflag, (__u64)ptr, + len, exclude_user); if (break_fd < 0) { - perror("sys_perf_event_open"); + perror("perf_process_event_open_exclude_user"); exit(1); } @@ -157,7 +167,6 @@ static int runtest_dar_outside(void) void *target; volatile __u16 temp16; volatile __u64 temp64; - struct perf_event_attr attr; int break_fd; unsigned long long breaks; int fail = 0; @@ -169,21 +178,11 @@ static int runtest_dar_outside(void) exit(EXIT_FAILURE); } - /* setup counters */ - memset(&attr, 0, sizeof(attr)); - attr.disabled = 1; - attr.type = PERF_TYPE_BREAKPOINT; - attr.exclude_kernel = 1; - attr.exclude_hv = 1; - attr.exclude_guest = 1; - attr.bp_type = HW_BREAKPOINT_RW; /* watch middle half of target array */ - attr.bp_addr = (__u64)(target + 2); - attr.bp_len = 4; - break_fd = sys_perf_event_open(&attr, 0, -1, -1, 0); + break_fd = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)(target + 2), 4); if (break_fd < 0) { free(target); - perror("sys_perf_event_open"); + perror("perf_process_event_open"); exit(EXIT_FAILURE); } -- cgit v1.2.3 From c65c64cc7bbd273121edf96a7a5a0269038ab454 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 12 Apr 2021 16:52:17 +0530 Subject: powerpc/selftests/perf-hwbreak: Add testcases for 2nd DAWR Extend perf-hwbreak.c selftest to test multiple DAWRs. Also add testcase for testing 512 byte boundary removal. Sample o/p: # ./perf-hwbreak ... TESTED: Process specific, Two events, diff addr TESTED: Process specific, Two events, same addr TESTED: Process specific, Two events, diff addr, one is RO, other is WO TESTED: Process specific, Two events, same addr, one is RO, other is WO TESTED: Systemwide, Two events, diff addr TESTED: Systemwide, Two events, same addr TESTED: Systemwide, Two events, diff addr, one is RO, other is WO TESTED: Systemwide, Two events, same addr, one is RO, other is WO TESTED: Process specific, 512 bytes, unaligned success: perf_hwbreak Signed-off-by: Ravi Bangoria Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210412112218.128183-4-ravi.bangoria@linux.ibm.com --- .../selftests/powerpc/ptrace/perf-hwbreak.c | 552 ++++++++++++++++++++- 1 file changed, 551 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c index 8c54adaa40b1..ecde2c199f3b 100644 --- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c @@ -21,8 +21,13 @@ #include #include #include +#include #include #include +#include +#include +#include +#include #include #include #include @@ -38,6 +43,12 @@ #define DAWR_LENGTH_MAX ((0x3f + 1) * 8) +int nprocs; + +static volatile int a = 10; +static volatile int b = 10; +static volatile char c[512 + 8] __attribute__((aligned(512))); + static void perf_event_attr_set(struct perf_event_attr *attr, __u32 type, __u64 addr, __u64 len, bool exclude_user) @@ -72,6 +83,76 @@ static int perf_process_event_open(__u32 type, __u64 addr, __u64 len) return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0); } +static int perf_cpu_event_open(long cpu, __u32 type, __u64 addr, __u64 len) +{ + struct perf_event_attr attr; + + perf_event_attr_set(&attr, type, addr, len, 0); + return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0); +} + +static void close_fds(int *fd, int n) +{ + int i; + + for (i = 0; i < n; i++) + close(fd[i]); +} + +static unsigned long read_fds(int *fd, int n) +{ + int i; + unsigned long c = 0; + unsigned long count = 0; + size_t res; + + for (i = 0; i < n; i++) { + res = read(fd[i], &c, sizeof(c)); + assert(res == sizeof(unsigned long long)); + count += c; + } + return count; +} + +static void reset_fds(int *fd, int n) +{ + int i; + + for (i = 0; i < n; i++) + ioctl(fd[i], PERF_EVENT_IOC_RESET); +} + +static void enable_fds(int *fd, int n) +{ + int i; + + for (i = 0; i < n; i++) + ioctl(fd[i], PERF_EVENT_IOC_ENABLE); +} + +static void disable_fds(int *fd, int n) +{ + int i; + + for (i = 0; i < n; i++) + ioctl(fd[i], PERF_EVENT_IOC_DISABLE); +} + +static int perf_systemwide_event_open(int *fd, __u32 type, __u64 addr, __u64 len) +{ + int i = 0; + + /* Assume online processors are 0 to nprocs for simplisity */ + for (i = 0; i < nprocs; i++) { + fd[i] = perf_cpu_event_open(i, type, addr, len); + if (fd[i] < 0) { + close_fds(fd, i); + return fd[i]; + } + } + return 0; +} + static inline bool breakpoint_test(int len) { int fd; @@ -266,11 +347,467 @@ static int runtest_dar_outside(void) return fail; } +static void multi_dawr_workload(void) +{ + a += 10; + b += 10; + c[512 + 1] += 'a'; +} + +static int test_process_multi_diff_addr(void) +{ + unsigned long long breaks1 = 0, breaks2 = 0; + int fd1, fd2; + char *desc = "Process specific, Two events, diff addr"; + size_t res; + + fd1 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a)); + if (fd1 < 0) { + perror("perf_process_event_open"); + exit(EXIT_FAILURE); + } + + fd2 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&b, (__u64)sizeof(b)); + if (fd2 < 0) { + close(fd1); + perror("perf_process_event_open"); + exit(EXIT_FAILURE); + } + + ioctl(fd1, PERF_EVENT_IOC_RESET); + ioctl(fd2, PERF_EVENT_IOC_RESET); + ioctl(fd1, PERF_EVENT_IOC_ENABLE); + ioctl(fd2, PERF_EVENT_IOC_ENABLE); + multi_dawr_workload(); + ioctl(fd1, PERF_EVENT_IOC_DISABLE); + ioctl(fd2, PERF_EVENT_IOC_DISABLE); + + res = read(fd1, &breaks1, sizeof(breaks1)); + assert(res == sizeof(unsigned long long)); + res = read(fd2, &breaks2, sizeof(breaks2)); + assert(res == sizeof(unsigned long long)); + + close(fd1); + close(fd2); + + if (breaks1 != 2 || breaks2 != 2) { + printf("FAILED: %s: %lld != 2 || %lld != 2\n", desc, breaks1, breaks2); + return 1; + } + + printf("TESTED: %s\n", desc); + return 0; +} + +static int test_process_multi_same_addr(void) +{ + unsigned long long breaks1 = 0, breaks2 = 0; + int fd1, fd2; + char *desc = "Process specific, Two events, same addr"; + size_t res; + + fd1 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a)); + if (fd1 < 0) { + perror("perf_process_event_open"); + exit(EXIT_FAILURE); + } + + fd2 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a)); + if (fd2 < 0) { + close(fd1); + perror("perf_process_event_open"); + exit(EXIT_FAILURE); + } + + ioctl(fd1, PERF_EVENT_IOC_RESET); + ioctl(fd2, PERF_EVENT_IOC_RESET); + ioctl(fd1, PERF_EVENT_IOC_ENABLE); + ioctl(fd2, PERF_EVENT_IOC_ENABLE); + multi_dawr_workload(); + ioctl(fd1, PERF_EVENT_IOC_DISABLE); + ioctl(fd2, PERF_EVENT_IOC_DISABLE); + + res = read(fd1, &breaks1, sizeof(breaks1)); + assert(res == sizeof(unsigned long long)); + res = read(fd2, &breaks2, sizeof(breaks2)); + assert(res == sizeof(unsigned long long)); + + close(fd1); + close(fd2); + + if (breaks1 != 2 || breaks2 != 2) { + printf("FAILED: %s: %lld != 2 || %lld != 2\n", desc, breaks1, breaks2); + return 1; + } + + printf("TESTED: %s\n", desc); + return 0; +} + +static int test_process_multi_diff_addr_ro_wo(void) +{ + unsigned long long breaks1 = 0, breaks2 = 0; + int fd1, fd2; + char *desc = "Process specific, Two events, diff addr, one is RO, other is WO"; + size_t res; + + fd1 = perf_process_event_open(HW_BREAKPOINT_W, (__u64)&a, (__u64)sizeof(a)); + if (fd1 < 0) { + perror("perf_process_event_open"); + exit(EXIT_FAILURE); + } + + fd2 = perf_process_event_open(HW_BREAKPOINT_R, (__u64)&b, (__u64)sizeof(b)); + if (fd2 < 0) { + close(fd1); + perror("perf_process_event_open"); + exit(EXIT_FAILURE); + } + + ioctl(fd1, PERF_EVENT_IOC_RESET); + ioctl(fd2, PERF_EVENT_IOC_RESET); + ioctl(fd1, PERF_EVENT_IOC_ENABLE); + ioctl(fd2, PERF_EVENT_IOC_ENABLE); + multi_dawr_workload(); + ioctl(fd1, PERF_EVENT_IOC_DISABLE); + ioctl(fd2, PERF_EVENT_IOC_DISABLE); + + res = read(fd1, &breaks1, sizeof(breaks1)); + assert(res == sizeof(unsigned long long)); + res = read(fd2, &breaks2, sizeof(breaks2)); + assert(res == sizeof(unsigned long long)); + + close(fd1); + close(fd2); + + if (breaks1 != 1 || breaks2 != 1) { + printf("FAILED: %s: %lld != 1 || %lld != 1\n", desc, breaks1, breaks2); + return 1; + } + + printf("TESTED: %s\n", desc); + return 0; +} + +static int test_process_multi_same_addr_ro_wo(void) +{ + unsigned long long breaks1 = 0, breaks2 = 0; + int fd1, fd2; + char *desc = "Process specific, Two events, same addr, one is RO, other is WO"; + size_t res; + + fd1 = perf_process_event_open(HW_BREAKPOINT_R, (__u64)&a, (__u64)sizeof(a)); + if (fd1 < 0) { + perror("perf_process_event_open"); + exit(EXIT_FAILURE); + } + + fd2 = perf_process_event_open(HW_BREAKPOINT_W, (__u64)&a, (__u64)sizeof(a)); + if (fd2 < 0) { + close(fd1); + perror("perf_process_event_open"); + exit(EXIT_FAILURE); + } + + ioctl(fd1, PERF_EVENT_IOC_RESET); + ioctl(fd2, PERF_EVENT_IOC_RESET); + ioctl(fd1, PERF_EVENT_IOC_ENABLE); + ioctl(fd2, PERF_EVENT_IOC_ENABLE); + multi_dawr_workload(); + ioctl(fd1, PERF_EVENT_IOC_DISABLE); + ioctl(fd2, PERF_EVENT_IOC_DISABLE); + + res = read(fd1, &breaks1, sizeof(breaks1)); + assert(res == sizeof(unsigned long long)); + res = read(fd2, &breaks2, sizeof(breaks2)); + assert(res == sizeof(unsigned long long)); + + close(fd1); + close(fd2); + + if (breaks1 != 1 || breaks2 != 1) { + printf("FAILED: %s: %lld != 1 || %lld != 1\n", desc, breaks1, breaks2); + return 1; + } + + printf("TESTED: %s\n", desc); + return 0; +} + +static int test_syswide_multi_diff_addr(void) +{ + unsigned long long breaks1 = 0, breaks2 = 0; + int *fd1 = malloc(nprocs * sizeof(int)); + int *fd2 = malloc(nprocs * sizeof(int)); + char *desc = "Systemwide, Two events, diff addr"; + int ret; + + ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a)); + if (ret) { + perror("perf_systemwide_event_open"); + exit(EXIT_FAILURE); + } + + ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_RW, (__u64)&b, (__u64)sizeof(b)); + if (ret) { + close_fds(fd1, nprocs); + perror("perf_systemwide_event_open"); + exit(EXIT_FAILURE); + } + + reset_fds(fd1, nprocs); + reset_fds(fd2, nprocs); + enable_fds(fd1, nprocs); + enable_fds(fd2, nprocs); + multi_dawr_workload(); + disable_fds(fd1, nprocs); + disable_fds(fd2, nprocs); + + breaks1 = read_fds(fd1, nprocs); + breaks2 = read_fds(fd2, nprocs); + + close_fds(fd1, nprocs); + close_fds(fd2, nprocs); + + free(fd1); + free(fd2); + + if (breaks1 != 2 || breaks2 != 2) { + printf("FAILED: %s: %lld != 2 || %lld != 2\n", desc, breaks1, breaks2); + return 1; + } + + printf("TESTED: %s\n", desc); + return 0; +} + +static int test_syswide_multi_same_addr(void) +{ + unsigned long long breaks1 = 0, breaks2 = 0; + int *fd1 = malloc(nprocs * sizeof(int)); + int *fd2 = malloc(nprocs * sizeof(int)); + char *desc = "Systemwide, Two events, same addr"; + int ret; + + ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a)); + if (ret) { + perror("perf_systemwide_event_open"); + exit(EXIT_FAILURE); + } + + ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a)); + if (ret) { + close_fds(fd1, nprocs); + perror("perf_systemwide_event_open"); + exit(EXIT_FAILURE); + } + + reset_fds(fd1, nprocs); + reset_fds(fd2, nprocs); + enable_fds(fd1, nprocs); + enable_fds(fd2, nprocs); + multi_dawr_workload(); + disable_fds(fd1, nprocs); + disable_fds(fd2, nprocs); + + breaks1 = read_fds(fd1, nprocs); + breaks2 = read_fds(fd2, nprocs); + + close_fds(fd1, nprocs); + close_fds(fd2, nprocs); + + free(fd1); + free(fd2); + + if (breaks1 != 2 || breaks2 != 2) { + printf("FAILED: %s: %lld != 2 || %lld != 2\n", desc, breaks1, breaks2); + return 1; + } + + printf("TESTED: %s\n", desc); + return 0; +} + +static int test_syswide_multi_diff_addr_ro_wo(void) +{ + unsigned long long breaks1 = 0, breaks2 = 0; + int *fd1 = malloc(nprocs * sizeof(int)); + int *fd2 = malloc(nprocs * sizeof(int)); + char *desc = "Systemwide, Two events, diff addr, one is RO, other is WO"; + int ret; + + ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_W, (__u64)&a, (__u64)sizeof(a)); + if (ret) { + perror("perf_systemwide_event_open"); + exit(EXIT_FAILURE); + } + + ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_R, (__u64)&b, (__u64)sizeof(b)); + if (ret) { + close_fds(fd1, nprocs); + perror("perf_systemwide_event_open"); + exit(EXIT_FAILURE); + } + + reset_fds(fd1, nprocs); + reset_fds(fd2, nprocs); + enable_fds(fd1, nprocs); + enable_fds(fd2, nprocs); + multi_dawr_workload(); + disable_fds(fd1, nprocs); + disable_fds(fd2, nprocs); + + breaks1 = read_fds(fd1, nprocs); + breaks2 = read_fds(fd2, nprocs); + + close_fds(fd1, nprocs); + close_fds(fd2, nprocs); + + free(fd1); + free(fd2); + + if (breaks1 != 1 || breaks2 != 1) { + printf("FAILED: %s: %lld != 1 || %lld != 1\n", desc, breaks1, breaks2); + return 1; + } + + printf("TESTED: %s\n", desc); + return 0; +} + +static int test_syswide_multi_same_addr_ro_wo(void) +{ + unsigned long long breaks1 = 0, breaks2 = 0; + int *fd1 = malloc(nprocs * sizeof(int)); + int *fd2 = malloc(nprocs * sizeof(int)); + char *desc = "Systemwide, Two events, same addr, one is RO, other is WO"; + int ret; + + ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_W, (__u64)&a, (__u64)sizeof(a)); + if (ret) { + perror("perf_systemwide_event_open"); + exit(EXIT_FAILURE); + } + + ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_R, (__u64)&a, (__u64)sizeof(a)); + if (ret) { + close_fds(fd1, nprocs); + perror("perf_systemwide_event_open"); + exit(EXIT_FAILURE); + } + + reset_fds(fd1, nprocs); + reset_fds(fd2, nprocs); + enable_fds(fd1, nprocs); + enable_fds(fd2, nprocs); + multi_dawr_workload(); + disable_fds(fd1, nprocs); + disable_fds(fd2, nprocs); + + breaks1 = read_fds(fd1, nprocs); + breaks2 = read_fds(fd2, nprocs); + + close_fds(fd1, nprocs); + close_fds(fd2, nprocs); + + free(fd1); + free(fd2); + + if (breaks1 != 1 || breaks2 != 1) { + printf("FAILED: %s: %lld != 1 || %lld != 1\n", desc, breaks1, breaks2); + return 1; + } + + printf("TESTED: %s\n", desc); + return 0; +} + +static int runtest_multi_dawr(void) +{ + int ret = 0; + + ret |= test_process_multi_diff_addr(); + ret |= test_process_multi_same_addr(); + ret |= test_process_multi_diff_addr_ro_wo(); + ret |= test_process_multi_same_addr_ro_wo(); + ret |= test_syswide_multi_diff_addr(); + ret |= test_syswide_multi_same_addr(); + ret |= test_syswide_multi_diff_addr_ro_wo(); + ret |= test_syswide_multi_same_addr_ro_wo(); + + return ret; +} + +static int runtest_unaligned_512bytes(void) +{ + unsigned long long breaks = 0; + int fd; + char *desc = "Process specific, 512 bytes, unaligned"; + __u64 addr = (__u64)&c + 8; + size_t res; + + fd = perf_process_event_open(HW_BREAKPOINT_RW, addr, 512); + if (fd < 0) { + perror("perf_process_event_open"); + exit(EXIT_FAILURE); + } + + ioctl(fd, PERF_EVENT_IOC_RESET); + ioctl(fd, PERF_EVENT_IOC_ENABLE); + multi_dawr_workload(); + ioctl(fd, PERF_EVENT_IOC_DISABLE); + + res = read(fd, &breaks, sizeof(breaks)); + assert(res == sizeof(unsigned long long)); + + close(fd); + + if (breaks != 2) { + printf("FAILED: %s: %lld != 2\n", desc, breaks); + return 1; + } + + printf("TESTED: %s\n", desc); + return 0; +} + +/* There is no perf api to find number of available watchpoints. Use ptrace. */ +static int get_nr_wps(bool *arch_31) +{ + struct ppc_debug_info dbginfo; + int child_pid; + + child_pid = fork(); + if (!child_pid) { + int ret = ptrace(PTRACE_TRACEME, 0, NULL, 0); + if (ret) { + perror("PTRACE_TRACEME failed\n"); + exit(EXIT_FAILURE); + } + kill(getpid(), SIGUSR1); + + sleep(1); + exit(EXIT_SUCCESS); + } + + wait(NULL); + if (ptrace(PPC_PTRACE_GETHWDBGINFO, child_pid, NULL, &dbginfo)) { + perror("Can't get breakpoint info"); + exit(EXIT_FAILURE); + } + + *arch_31 = !!(dbginfo.features & PPC_DEBUG_FEATURE_DATA_BP_ARCH_31); + return dbginfo.num_data_bps; +} + static int runtest(void) { int rwflag; int exclude_user; int ret; + bool dawr = dawr_supported(); + bool arch_31 = false; + int nr_wps = get_nr_wps(&arch_31); /* * perf defines rwflag as two bits read and write and at least @@ -283,7 +820,7 @@ static int runtest(void) return ret; /* if we have the dawr, we can do an array test */ - if (!dawr_supported()) + if (!dawr) continue; ret = runtestsingle(rwflag, exclude_user, 1); if (ret) @@ -292,6 +829,19 @@ static int runtest(void) } ret = runtest_dar_outside(); + if (ret) + return ret; + + if (dawr && nr_wps > 1) { + nprocs = get_nprocs(); + ret = runtest_multi_dawr(); + if (ret) + return ret; + } + + if (dawr && arch_31) + ret = runtest_unaligned_512bytes(); + return ret; } -- cgit v1.2.3 From 290f7d8ce2b1eea5413bb120e0d9d610675b7fba Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 12 Apr 2021 16:52:18 +0530 Subject: powerpc/selftests: Add selftest to test concurrent perf/ptrace events ptrace and perf watchpoints can't co-exists if their address range overlaps. See commit 29da4f91c0c1 ("powerpc/watchpoint: Don't allow concurrent perf and ptrace events") for more detail. Add selftest for the same. Sample o/p: # ./ptrace-perf-hwbreak test: ptrace-perf-hwbreak tags: git_version:powerpc-5.8-7-118-g937fa174a15d-dirty perf cpu event -> ptrace thread event (Overlapping): Ok perf cpu event -> ptrace thread event (Non-overlapping): Ok perf thread event -> ptrace same thread event (Overlapping): Ok perf thread event -> ptrace same thread event (Non-overlapping): Ok perf thread event -> ptrace other thread event: Ok ptrace thread event -> perf kernel event: Ok ptrace thread event -> perf same thread event (Overlapping): Ok ptrace thread event -> perf same thread event (Non-overlapping): Ok ptrace thread event -> perf other thread event: Ok ptrace thread event -> perf cpu event (Overlapping): Ok ptrace thread event -> perf cpu event (Non-overlapping): Ok ptrace thread event -> perf same thread & cpu event (Overlapping): Ok ptrace thread event -> perf same thread & cpu event (Non-overlapping): Ok ptrace thread event -> perf other thread & cpu event: Ok success: ptrace-perf-hwbreak Signed-off-by: Ravi Bangoria Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210412112218.128183-5-ravi.bangoria@linux.ibm.com --- tools/testing/selftests/powerpc/ptrace/.gitignore | 1 + tools/testing/selftests/powerpc/ptrace/Makefile | 2 +- .../selftests/powerpc/ptrace/ptrace-perf-hwbreak.c | 659 +++++++++++++++++++++ 3 files changed, 661 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/ptrace/.gitignore b/tools/testing/selftests/powerpc/ptrace/.gitignore index 0e96150b7c7e..eb75e5360e31 100644 --- a/tools/testing/selftests/powerpc/ptrace/.gitignore +++ b/tools/testing/selftests/powerpc/ptrace/.gitignore @@ -14,3 +14,4 @@ perf-hwbreak core-pkey ptrace-pkey ptrace-syscall +ptrace-perf-hwbreak diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 8d3f006c98cc..a500639da97a 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -2,7 +2,7 @@ TEST_GEN_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \ ptrace-tar ptrace-tm-tar ptrace-tm-spd-tar ptrace-vsx ptrace-tm-vsx \ ptrace-tm-spd-vsx ptrace-tm-spr ptrace-hwbreak ptrace-pkey core-pkey \ - perf-hwbreak ptrace-syscall + perf-hwbreak ptrace-syscall ptrace-perf-hwbreak top_srcdir = ../../../../.. include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c new file mode 100644 index 000000000000..3344e74a97b4 --- /dev/null +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c @@ -0,0 +1,659 @@ +// SPDX-License-Identifier: GPL-2.0+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ptrace.h" + +char data[16]; + +/* Overlapping address range */ +volatile __u64 *ptrace_data1 = (__u64 *)&data[0]; +volatile __u64 *perf_data1 = (__u64 *)&data[4]; + +/* Non-overlapping address range */ +volatile __u64 *ptrace_data2 = (__u64 *)&data[0]; +volatile __u64 *perf_data2 = (__u64 *)&data[8]; + +static unsigned long pid_max_addr(void) +{ + FILE *fp; + char *line, *c; + char addr[100]; + size_t len = 0; + + fp = fopen("/proc/kallsyms", "r"); + if (!fp) { + printf("Failed to read /proc/kallsyms. Exiting..\n"); + exit(EXIT_FAILURE); + } + + while (getline(&line, &len, fp) != -1) { + if (!strstr(line, "pid_max") || strstr(line, "pid_max_max") || + strstr(line, "pid_max_min")) + continue; + + strncpy(addr, line, len < 100 ? len : 100); + c = strchr(addr, ' '); + *c = '\0'; + return strtoul(addr, &c, 16); + } + fclose(fp); + printf("Could not find pix_max. Exiting..\n"); + exit(EXIT_FAILURE); + return -1; +} + +static void perf_user_event_attr_set(struct perf_event_attr *attr, __u64 addr, __u64 len) +{ + memset(attr, 0, sizeof(struct perf_event_attr)); + attr->type = PERF_TYPE_BREAKPOINT; + attr->size = sizeof(struct perf_event_attr); + attr->bp_type = HW_BREAKPOINT_R; + attr->bp_addr = addr; + attr->bp_len = len; + attr->exclude_kernel = 1; + attr->exclude_hv = 1; +} + +static void perf_kernel_event_attr_set(struct perf_event_attr *attr) +{ + memset(attr, 0, sizeof(struct perf_event_attr)); + attr->type = PERF_TYPE_BREAKPOINT; + attr->size = sizeof(struct perf_event_attr); + attr->bp_type = HW_BREAKPOINT_R; + attr->bp_addr = pid_max_addr(); + attr->bp_len = sizeof(unsigned long); + attr->exclude_user = 1; + attr->exclude_hv = 1; +} + +static int perf_cpu_event_open(int cpu, __u64 addr, __u64 len) +{ + struct perf_event_attr attr; + + perf_user_event_attr_set(&attr, addr, len); + return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0); +} + +static int perf_thread_event_open(pid_t child_pid, __u64 addr, __u64 len) +{ + struct perf_event_attr attr; + + perf_user_event_attr_set(&attr, addr, len); + return syscall(__NR_perf_event_open, &attr, child_pid, -1, -1, 0); +} + +static int perf_thread_cpu_event_open(pid_t child_pid, int cpu, __u64 addr, __u64 len) +{ + struct perf_event_attr attr; + + perf_user_event_attr_set(&attr, addr, len); + return syscall(__NR_perf_event_open, &attr, child_pid, cpu, -1, 0); +} + +static int perf_thread_kernel_event_open(pid_t child_pid) +{ + struct perf_event_attr attr; + + perf_kernel_event_attr_set(&attr); + return syscall(__NR_perf_event_open, &attr, child_pid, -1, -1, 0); +} + +static int perf_cpu_kernel_event_open(int cpu) +{ + struct perf_event_attr attr; + + perf_kernel_event_attr_set(&attr); + return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0); +} + +static int child(void) +{ + int ret; + + ret = ptrace(PTRACE_TRACEME, 0, NULL, 0); + if (ret) { + printf("Error: PTRACE_TRACEME failed\n"); + return 0; + } + kill(getpid(), SIGUSR1); /* --> parent (SIGUSR1) */ + + return 0; +} + +static void ptrace_ppc_hw_breakpoint(struct ppc_hw_breakpoint *info, int type, + __u64 addr, int len) +{ + info->version = 1; + info->trigger_type = type; + info->condition_mode = PPC_BREAKPOINT_CONDITION_NONE; + info->addr = addr; + info->addr2 = addr + len; + info->condition_value = 0; + if (!len) + info->addr_mode = PPC_BREAKPOINT_MODE_EXACT; + else + info->addr_mode = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE; +} + +static int ptrace_open(pid_t child_pid, __u64 wp_addr, int len) +{ + struct ppc_hw_breakpoint info; + + ptrace_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_RW, wp_addr, len); + return ptrace(PPC_PTRACE_SETHWDEBUG, child_pid, 0, &info); +} + +static int test1(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int ret = 0; + + /* Test: + * if (new per thread event by ptrace) + * if (existing cpu event by perf) + * if (addr range overlaps) + * fail; + */ + + perf_fd = perf_cpu_event_open(0, (__u64)perf_data1, sizeof(*perf_data1)); + if (perf_fd < 0) + return -1; + + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1)); + if (ptrace_fd > 0 || errno != ENOSPC) + ret = -1; + + close(perf_fd); + return ret; +} + +static int test2(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int ret = 0; + + /* Test: + * if (new per thread event by ptrace) + * if (existing cpu event by perf) + * if (addr range does not overlaps) + * allow; + */ + + perf_fd = perf_cpu_event_open(0, (__u64)perf_data2, sizeof(*perf_data2)); + if (perf_fd < 0) + return -1; + + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data2, sizeof(*ptrace_data2)); + if (ptrace_fd < 0) { + ret = -1; + goto perf_close; + } + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); + +perf_close: + close(perf_fd); + return ret; +} + +static int test3(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int ret = 0; + + /* Test: + * if (new per thread event by ptrace) + * if (existing thread event by perf on the same thread) + * if (addr range overlaps) + * fail; + */ + perf_fd = perf_thread_event_open(child_pid, (__u64)perf_data1, + sizeof(*perf_data1)); + if (perf_fd < 0) + return -1; + + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1)); + if (ptrace_fd > 0 || errno != ENOSPC) + ret = -1; + + close(perf_fd); + return ret; +} + +static int test4(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int ret = 0; + + /* Test: + * if (new per thread event by ptrace) + * if (existing thread event by perf on the same thread) + * if (addr range does not overlaps) + * fail; + */ + perf_fd = perf_thread_event_open(child_pid, (__u64)perf_data2, + sizeof(*perf_data2)); + if (perf_fd < 0) + return -1; + + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data2, sizeof(*ptrace_data2)); + if (ptrace_fd < 0) { + ret = -1; + goto perf_close; + } + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); + +perf_close: + close(perf_fd); + return ret; +} + +static int test5(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int cpid; + int ret = 0; + + /* Test: + * if (new per thread event by ptrace) + * if (existing thread event by perf on the different thread) + * allow; + */ + cpid = fork(); + if (!cpid) { + /* Temporary Child */ + pause(); + exit(EXIT_SUCCESS); + } + + perf_fd = perf_thread_event_open(cpid, (__u64)perf_data1, sizeof(*perf_data1)); + if (perf_fd < 0) { + ret = -1; + goto kill_child; + } + + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1)); + if (ptrace_fd < 0) { + ret = -1; + goto perf_close; + } + + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); +perf_close: + close(perf_fd); +kill_child: + kill(cpid, SIGINT); + return ret; +} + +static int test6(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int ret = 0; + + /* Test: + * if (new per thread kernel event by perf) + * if (existing thread event by ptrace on the same thread) + * allow; + * -- OR -- + * if (new per cpu kernel event by perf) + * if (existing thread event by ptrace) + * allow; + */ + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1)); + if (ptrace_fd < 0) + return -1; + + perf_fd = perf_thread_kernel_event_open(child_pid); + if (perf_fd < 0) { + ret = -1; + goto ptrace_close; + } + close(perf_fd); + + perf_fd = perf_cpu_kernel_event_open(0); + if (perf_fd < 0) { + ret = -1; + goto ptrace_close; + } + close(perf_fd); + +ptrace_close: + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); + return ret; +} + +static int test7(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int ret = 0; + + /* Test: + * if (new per thread event by perf) + * if (existing thread event by ptrace on the same thread) + * if (addr range overlaps) + * fail; + */ + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1)); + if (ptrace_fd < 0) + return -1; + + perf_fd = perf_thread_event_open(child_pid, (__u64)perf_data1, + sizeof(*perf_data1)); + if (perf_fd > 0 || errno != ENOSPC) + ret = -1; + + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); + return ret; +} + +static int test8(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int ret = 0; + + /* Test: + * if (new per thread event by perf) + * if (existing thread event by ptrace on the same thread) + * if (addr range does not overlaps) + * allow; + */ + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data2, sizeof(*ptrace_data2)); + if (ptrace_fd < 0) + return -1; + + perf_fd = perf_thread_event_open(child_pid, (__u64)perf_data2, + sizeof(*perf_data2)); + if (perf_fd < 0) { + ret = -1; + goto ptrace_close; + } + close(perf_fd); + +ptrace_close: + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); + return ret; +} + +static int test9(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int cpid; + int ret = 0; + + /* Test: + * if (new per thread event by perf) + * if (existing thread event by ptrace on the other thread) + * allow; + */ + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1)); + if (ptrace_fd < 0) + return -1; + + cpid = fork(); + if (!cpid) { + /* Temporary Child */ + pause(); + exit(EXIT_SUCCESS); + } + + perf_fd = perf_thread_event_open(cpid, (__u64)perf_data1, sizeof(*perf_data1)); + if (perf_fd < 0) { + ret = -1; + goto kill_child; + } + close(perf_fd); + +kill_child: + kill(cpid, SIGINT); + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); + return ret; +} + +static int test10(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int ret = 0; + + /* Test: + * if (new per cpu event by perf) + * if (existing thread event by ptrace on the same thread) + * if (addr range overlaps) + * fail; + */ + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1)); + if (ptrace_fd < 0) + return -1; + + perf_fd = perf_cpu_event_open(0, (__u64)perf_data1, sizeof(*perf_data1)); + if (perf_fd > 0 || errno != ENOSPC) + ret = -1; + + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); + return ret; +} + +static int test11(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int ret = 0; + + /* Test: + * if (new per cpu event by perf) + * if (existing thread event by ptrace on the same thread) + * if (addr range does not overlap) + * allow; + */ + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data2, sizeof(*ptrace_data2)); + if (ptrace_fd < 0) + return -1; + + perf_fd = perf_cpu_event_open(0, (__u64)perf_data2, sizeof(*perf_data2)); + if (perf_fd < 0) { + ret = -1; + goto ptrace_close; + } + close(perf_fd); + +ptrace_close: + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); + return ret; +} + +static int test12(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int ret = 0; + + /* Test: + * if (new per thread and per cpu event by perf) + * if (existing thread event by ptrace on the same thread) + * if (addr range overlaps) + * fail; + */ + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1)); + if (ptrace_fd < 0) + return -1; + + perf_fd = perf_thread_cpu_event_open(child_pid, 0, (__u64)perf_data1, sizeof(*perf_data1)); + if (perf_fd > 0 || errno != ENOSPC) + ret = -1; + + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); + return ret; +} + +static int test13(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int ret = 0; + + /* Test: + * if (new per thread and per cpu event by perf) + * if (existing thread event by ptrace on the same thread) + * if (addr range does not overlap) + * allow; + */ + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data2, sizeof(*ptrace_data2)); + if (ptrace_fd < 0) + return -1; + + perf_fd = perf_thread_cpu_event_open(child_pid, 0, (__u64)perf_data2, sizeof(*perf_data2)); + if (perf_fd < 0) { + ret = -1; + goto ptrace_close; + } + close(perf_fd); + +ptrace_close: + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); + return ret; +} + +static int test14(pid_t child_pid) +{ + int perf_fd; + int ptrace_fd; + int cpid; + int ret = 0; + + /* Test: + * if (new per thread and per cpu event by perf) + * if (existing thread event by ptrace on the other thread) + * allow; + */ + ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1)); + if (ptrace_fd < 0) + return -1; + + cpid = fork(); + if (!cpid) { + /* Temporary Child */ + pause(); + exit(EXIT_SUCCESS); + } + + perf_fd = perf_thread_cpu_event_open(cpid, 0, (__u64)perf_data1, + sizeof(*perf_data1)); + if (perf_fd < 0) { + ret = -1; + goto kill_child; + } + close(perf_fd); + +kill_child: + kill(cpid, SIGINT); + ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd); + return ret; +} + +static int do_test(const char *msg, int (*fun)(pid_t arg), pid_t arg) +{ + int ret; + + ret = fun(arg); + if (ret) + printf("%s: Error\n", msg); + else + printf("%s: Ok\n", msg); + return ret; +} + +char *desc[14] = { + "perf cpu event -> ptrace thread event (Overlapping)", + "perf cpu event -> ptrace thread event (Non-overlapping)", + "perf thread event -> ptrace same thread event (Overlapping)", + "perf thread event -> ptrace same thread event (Non-overlapping)", + "perf thread event -> ptrace other thread event", + "ptrace thread event -> perf kernel event", + "ptrace thread event -> perf same thread event (Overlapping)", + "ptrace thread event -> perf same thread event (Non-overlapping)", + "ptrace thread event -> perf other thread event", + "ptrace thread event -> perf cpu event (Overlapping)", + "ptrace thread event -> perf cpu event (Non-overlapping)", + "ptrace thread event -> perf same thread & cpu event (Overlapping)", + "ptrace thread event -> perf same thread & cpu event (Non-overlapping)", + "ptrace thread event -> perf other thread & cpu event", +}; + +static int test(pid_t child_pid) +{ + int ret = TEST_PASS; + + ret |= do_test(desc[0], test1, child_pid); + ret |= do_test(desc[1], test2, child_pid); + ret |= do_test(desc[2], test3, child_pid); + ret |= do_test(desc[3], test4, child_pid); + ret |= do_test(desc[4], test5, child_pid); + ret |= do_test(desc[5], test6, child_pid); + ret |= do_test(desc[6], test7, child_pid); + ret |= do_test(desc[7], test8, child_pid); + ret |= do_test(desc[8], test9, child_pid); + ret |= do_test(desc[9], test10, child_pid); + ret |= do_test(desc[10], test11, child_pid); + ret |= do_test(desc[11], test12, child_pid); + ret |= do_test(desc[12], test13, child_pid); + ret |= do_test(desc[13], test14, child_pid); + + return ret; +} + +static void get_dbginfo(pid_t child_pid, struct ppc_debug_info *dbginfo) +{ + if (ptrace(PPC_PTRACE_GETHWDBGINFO, child_pid, NULL, dbginfo)) { + perror("Can't get breakpoint info"); + exit(-1); + } +} + +static int ptrace_perf_hwbreak(void) +{ + int ret; + pid_t child_pid; + struct ppc_debug_info dbginfo; + + child_pid = fork(); + if (!child_pid) + return child(); + + /* parent */ + wait(NULL); /* <-- child (SIGUSR1) */ + + get_dbginfo(child_pid, &dbginfo); + SKIP_IF(dbginfo.num_data_bps <= 1); + + ret = perf_cpu_event_open(0, (__u64)perf_data1, sizeof(*perf_data1)); + SKIP_IF(ret < 0); + close(ret); + + ret = test(child_pid); + + ptrace(PTRACE_CONT, child_pid, NULL, 0); + return ret; +} + +int main(int argc, char *argv[]) +{ + return test_harness(ptrace_perf_hwbreak, "ptrace-perf-hwbreak"); +} -- cgit v1.2.3 From 0db11461677aa5105b9ebbd939aee0ceb77a988b Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 8 Feb 2021 18:41:10 +0800 Subject: selftests/powerpc: remove unneeded semicolon Eliminate the following coccicheck warning: ./tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c:327:4-5: Unneeded semicolon Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1612780870-95890-1-git-send-email-yang.lee@linux.alibaba.com --- tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c index 02dffb65de48..b099753b50e4 100644 --- a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c +++ b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c @@ -324,7 +324,7 @@ int compress_file(int argc, char **argv, void *handle) fprintf(stderr, "error: cannot progress; "); fprintf(stderr, "too many faults\n"); exit(-1); - }; + } } fault_tries = NX_MAX_FAULTS; /* Reset for the next chunk */ -- cgit v1.2.3 From a4b0fccfbdb4a2004b97cae3872088570495e274 Mon Sep 17 00:00:00 2001 From: Ray Kinsella Date: Wed, 21 Apr 2021 10:10:09 +0100 Subject: perf tools: Update topdown documentation to permit rdpmc calls Update Topdown documentation to permit calls to rdpmc, and describe interaction with system calls. Signed-off-by: Ray Kinsella Reviewed-by: Andi Kleen Cc: Jiri Olsa Cc: Kan Liang Link: http://lore.kernel.org/lkml/20210421091009.1711565-1-mdr@ashroe.eu Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/topdown.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/topdown.txt b/tools/perf/Documentation/topdown.txt index 10f07f9455b8..c6302df4cf29 100644 --- a/tools/perf/Documentation/topdown.txt +++ b/tools/perf/Documentation/topdown.txt @@ -72,6 +72,7 @@ For example, the perf_event_attr structure can be initialized with The Fixed counter 3 must be the leader of the group. #include +#include #include #include @@ -95,6 +96,11 @@ int slots_fd = perf_event_open(&slots, 0, -1, -1, 0); if (slots_fd < 0) ... error ... +/* Memory mapping the fd permits _rdpmc calls from userspace */ +void *slots_p = mmap(0, getpagesize(), PROT_READ, MAP_SHARED, slots_fd, 0); +if (!slot_p) + .... error ... + /* * Open metrics event file descriptor for current task. * Set slots event as the leader of the group. @@ -110,6 +116,14 @@ int metrics_fd = perf_event_open(&metrics, 0, -1, slots_fd, 0); if (metrics_fd < 0) ... error ... +/* Memory mapping the fd permits _rdpmc calls from userspace */ +void *metrics_p = mmap(0, getpagesize(), PROT_READ, MAP_SHARED, metrics_fd, 0); +if (!metrics_p) + ... error ... + +Note: the file descriptors returned by the perf_event_open calls must be memory +mapped to permit calls to the _rdpmd instruction. Permission may also be granted +by writing the /sys/devices/cpu/rdpmc sysfs node. The RDPMC instruction (or _rdpmc compiler intrinsic) can now be used to read slots and the topdown metrics at different points of the program: @@ -141,6 +155,10 @@ as the parallelism and overlap in the CPU program execution will cause too much measurement inaccuracy. For example instrumenting individual basic blocks is definitely too fine grained. +_rdpmc calls should not be mixed with reading the metrics and slots counters +through system calls, as the kernel will reset these counters after each system +call. + Decoding metrics values ======================= -- cgit v1.2.3 From e1199815b47be83346c03e20a3de76f934e4bb34 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Thu, 22 Apr 2021 17:41:20 +0200 Subject: selftests/landlock: Add user space tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test all Landlock system calls, ptrace hooks semantic and filesystem access-control with multiple layouts. Test coverage for security/landlock/ is 93.6% of lines. The code not covered only deals with internal kernel errors (e.g. memory allocation) and race conditions. Cc: James Morris Cc: Jann Horn Cc: Serge E. Hallyn Cc: Shuah Khan Signed-off-by: Mickaël Salaün Reviewed-by: Vincent Dagonneau Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20210422154123.13086-11-mic@digikod.net Signed-off-by: James Morris --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + tools/testing/selftests/landlock/.gitignore | 2 + tools/testing/selftests/landlock/Makefile | 24 + tools/testing/selftests/landlock/base_test.c | 219 ++ tools/testing/selftests/landlock/common.h | 183 ++ tools/testing/selftests/landlock/config | 7 + tools/testing/selftests/landlock/fs_test.c | 2791 ++++++++++++++++++++++++ tools/testing/selftests/landlock/ptrace_test.c | 337 +++ tools/testing/selftests/landlock/true.c | 5 + 10 files changed, 3570 insertions(+) create mode 100644 tools/testing/selftests/landlock/.gitignore create mode 100644 tools/testing/selftests/landlock/Makefile create mode 100644 tools/testing/selftests/landlock/base_test.c create mode 100644 tools/testing/selftests/landlock/common.h create mode 100644 tools/testing/selftests/landlock/config create mode 100644 tools/testing/selftests/landlock/fs_test.c create mode 100644 tools/testing/selftests/landlock/ptrace_test.c create mode 100644 tools/testing/selftests/landlock/true.c (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index 70ec117efa8a..8cab5854844e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10005,6 +10005,7 @@ W: https://landlock.io T: git https://github.com/landlock-lsm/linux.git F: include/uapi/linux/landlock.h F: security/landlock/ +F: tools/testing/selftests/landlock/ K: landlock K: LANDLOCK diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 6c575cf34a71..bc3299a20338 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -25,6 +25,7 @@ TARGETS += ir TARGETS += kcmp TARGETS += kexec TARGETS += kvm +TARGETS += landlock TARGETS += lib TARGETS += livepatch TARGETS += lkdtm diff --git a/tools/testing/selftests/landlock/.gitignore b/tools/testing/selftests/landlock/.gitignore new file mode 100644 index 000000000000..470203a7cd73 --- /dev/null +++ b/tools/testing/selftests/landlock/.gitignore @@ -0,0 +1,2 @@ +/*_test +/true diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile new file mode 100644 index 000000000000..a99596ca9882 --- /dev/null +++ b/tools/testing/selftests/landlock/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS += -Wall -O2 + +src_test := $(wildcard *_test.c) + +TEST_GEN_PROGS := $(src_test:.c=) + +TEST_GEN_PROGS_EXTENDED := true + +KSFT_KHDR_INSTALL := 1 +OVERRIDE_TARGETS := 1 +include ../lib.mk + +khdr_dir = $(top_srcdir)/usr/include + +$(khdr_dir)/linux/landlock.h: khdr + @: + +$(OUTPUT)/true: true.c + $(LINK.c) $< $(LDLIBS) -o $@ -static + +$(OUTPUT)/%_test: %_test.c $(khdr_dir)/linux/landlock.h ../kselftest_harness.h common.h + $(LINK.c) $< $(LDLIBS) -o $@ -lcap -I$(khdr_dir) diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c new file mode 100644 index 000000000000..262c3c8d953a --- /dev/null +++ b/tools/testing/selftests/landlock/base_test.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Landlock tests - Common user space base + * + * Copyright © 2017-2020 Mickaël Salaün + * Copyright © 2019-2020 ANSSI + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#ifndef O_PATH +#define O_PATH 010000000 +#endif + +TEST(inconsistent_attr) { + const long page_size = sysconf(_SC_PAGESIZE); + char *const buf = malloc(page_size + 1); + struct landlock_ruleset_attr *const ruleset_attr = (void *)buf; + + ASSERT_NE(NULL, buf); + + /* Checks copy_from_user(). */ + ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, 0, 0)); + /* The size if less than sizeof(struct landlock_attr_enforce). */ + ASSERT_EQ(EINVAL, errno); + ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, 1, 0)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(NULL, 1, 0)); + /* The size if less than sizeof(struct landlock_attr_enforce). */ + ASSERT_EQ(EFAULT, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(NULL, + sizeof(struct landlock_ruleset_attr), 0)); + ASSERT_EQ(EFAULT, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, page_size + 1, 0)); + ASSERT_EQ(E2BIG, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, + sizeof(struct landlock_ruleset_attr), 0)); + ASSERT_EQ(ENOMSG, errno); + ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, page_size, 0)); + ASSERT_EQ(ENOMSG, errno); + + /* Checks non-zero value. */ + buf[page_size - 2] = '.'; + ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, page_size, 0)); + ASSERT_EQ(E2BIG, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(ruleset_attr, page_size + 1, 0)); + ASSERT_EQ(E2BIG, errno); + + free(buf); +} + +TEST(empty_path_beneath_attr) { + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_EXECUTE, + }; + const int ruleset_fd = landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), 0); + + ASSERT_LE(0, ruleset_fd); + + /* Similar to struct landlock_path_beneath_attr.parent_fd = 0 */ + ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + NULL, 0)); + ASSERT_EQ(EFAULT, errno); + ASSERT_EQ(0, close(ruleset_fd)); +} + +TEST(inval_fd_enforce) { + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + + ASSERT_EQ(-1, landlock_restrict_self(-1, 0)); + ASSERT_EQ(EBADF, errno); +} + +TEST(unpriv_enforce_without_no_new_privs) { + int err; + + drop_caps(_metadata); + err = landlock_restrict_self(-1, 0); + ASSERT_EQ(EPERM, errno); + ASSERT_EQ(err, -1); +} + +TEST(ruleset_fd_io) +{ + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, + }; + int ruleset_fd; + char buf; + + drop_caps(_metadata); + ruleset_fd = landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + + ASSERT_EQ(-1, write(ruleset_fd, ".", 1)); + ASSERT_EQ(EINVAL, errno); + ASSERT_EQ(-1, read(ruleset_fd, &buf, 1)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(0, close(ruleset_fd)); +} + +/* Tests enforcement of a ruleset FD transferred through a UNIX socket. */ +TEST(ruleset_fd_transfer) +{ + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR, + }; + struct landlock_path_beneath_attr path_beneath_attr = { + .allowed_access = LANDLOCK_ACCESS_FS_READ_DIR, + }; + int ruleset_fd_tx, dir_fd; + union { + /* Aligned ancillary data buffer. */ + char buf[CMSG_SPACE(sizeof(ruleset_fd_tx))]; + struct cmsghdr _align; + } cmsg_tx = {}; + char data_tx = '.'; + struct iovec io = { + .iov_base = &data_tx, + .iov_len = sizeof(data_tx), + }; + struct msghdr msg = { + .msg_iov = &io, + .msg_iovlen = 1, + .msg_control = &cmsg_tx.buf, + .msg_controllen = sizeof(cmsg_tx.buf), + }; + struct cmsghdr *cmsg; + int socket_fds[2]; + pid_t child; + int status; + + drop_caps(_metadata); + + /* Creates a test ruleset with a simple rule. */ + ruleset_fd_tx = landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd_tx); + path_beneath_attr.parent_fd = open("/tmp", O_PATH | O_NOFOLLOW | + O_DIRECTORY | O_CLOEXEC); + ASSERT_LE(0, path_beneath_attr.parent_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd_tx, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath_attr, 0)); + ASSERT_EQ(0, close(path_beneath_attr.parent_fd)); + + cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(NULL, cmsg); + cmsg->cmsg_len = CMSG_LEN(sizeof(ruleset_fd_tx)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), &ruleset_fd_tx, sizeof(ruleset_fd_tx)); + + /* Sends the ruleset FD over a socketpair and then close it. */ + ASSERT_EQ(0, socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, socket_fds)); + ASSERT_EQ(sizeof(data_tx), sendmsg(socket_fds[0], &msg, 0)); + ASSERT_EQ(0, close(socket_fds[0])); + ASSERT_EQ(0, close(ruleset_fd_tx)); + + child = fork(); + ASSERT_LE(0, child); + if (child == 0) { + int ruleset_fd_rx; + + *(char *)msg.msg_iov->iov_base = '\0'; + ASSERT_EQ(sizeof(data_tx), recvmsg(socket_fds[1], &msg, MSG_CMSG_CLOEXEC)); + ASSERT_EQ('.', *(char *)msg.msg_iov->iov_base); + ASSERT_EQ(0, close(socket_fds[1])); + cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(ruleset_fd_tx))); + memcpy(&ruleset_fd_rx, CMSG_DATA(cmsg), sizeof(ruleset_fd_tx)); + + /* Enforces the received ruleset on the child. */ + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, landlock_restrict_self(ruleset_fd_rx, 0)); + ASSERT_EQ(0, close(ruleset_fd_rx)); + + /* Checks that the ruleset enforcement. */ + ASSERT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + ASSERT_EQ(EACCES, errno); + dir_fd = open("/tmp", O_RDONLY | O_DIRECTORY | O_CLOEXEC); + ASSERT_LE(0, dir_fd); + ASSERT_EQ(0, close(dir_fd)); + _exit(_metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE); + return; + } + + ASSERT_EQ(0, close(socket_fds[1])); + + /* Checks that the parent is unrestricted. */ + dir_fd = open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC); + ASSERT_LE(0, dir_fd); + ASSERT_EQ(0, close(dir_fd)); + dir_fd = open("/tmp", O_RDONLY | O_DIRECTORY | O_CLOEXEC); + ASSERT_LE(0, dir_fd); + ASSERT_EQ(0, close(dir_fd)); + + ASSERT_EQ(child, waitpid(child, &status, 0)); + ASSERT_EQ(1, WIFEXITED(status)); + ASSERT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h new file mode 100644 index 000000000000..20e2a9286d71 --- /dev/null +++ b/tools/testing/selftests/landlock/common.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Landlock test helpers + * + * Copyright © 2017-2020 Mickaël Salaün + * Copyright © 2019-2020 ANSSI + * Copyright © 2021 Microsoft Corporation + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + +/* + * TEST_F_FORK() is useful when a test drop privileges but the corresponding + * FIXTURE_TEARDOWN() requires them (e.g. to remove files from a directory + * where write actions are denied). For convenience, FIXTURE_TEARDOWN() is + * also called when the test failed, but not when FIXTURE_SETUP() failed. For + * this to be possible, we must not call abort() but instead exit smoothly + * (hence the step print). + */ +#define TEST_F_FORK(fixture_name, test_name) \ + static void fixture_name##_##test_name##_child( \ + struct __test_metadata *_metadata, \ + FIXTURE_DATA(fixture_name) *self, \ + const FIXTURE_VARIANT(fixture_name) *variant); \ + TEST_F(fixture_name, test_name) \ + { \ + int status; \ + const pid_t child = fork(); \ + if (child < 0) \ + abort(); \ + if (child == 0) { \ + _metadata->no_print = 1; \ + fixture_name##_##test_name##_child(_metadata, self, variant); \ + if (_metadata->skip) \ + _exit(255); \ + if (_metadata->passed) \ + _exit(0); \ + _exit(_metadata->step); \ + } \ + if (child != waitpid(child, &status, 0)) \ + abort(); \ + if (WIFSIGNALED(status) || !WIFEXITED(status)) { \ + _metadata->passed = 0; \ + _metadata->step = 1; \ + return; \ + } \ + switch (WEXITSTATUS(status)) { \ + case 0: \ + _metadata->passed = 1; \ + break; \ + case 255: \ + _metadata->passed = 1; \ + _metadata->skip = 1; \ + break; \ + default: \ + _metadata->passed = 0; \ + _metadata->step = WEXITSTATUS(status); \ + break; \ + } \ + } \ + static void fixture_name##_##test_name##_child( \ + struct __test_metadata __attribute__((unused)) *_metadata, \ + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) + +#ifndef landlock_create_ruleset +static inline int landlock_create_ruleset( + const struct landlock_ruleset_attr *const attr, + const size_t size, const __u32 flags) +{ + return syscall(__NR_landlock_create_ruleset, attr, size, flags); +} +#endif + +#ifndef landlock_add_rule +static inline int landlock_add_rule(const int ruleset_fd, + const enum landlock_rule_type rule_type, + const void *const rule_attr, const __u32 flags) +{ + return syscall(__NR_landlock_add_rule, ruleset_fd, rule_type, + rule_attr, flags); +} +#endif + +#ifndef landlock_restrict_self +static inline int landlock_restrict_self(const int ruleset_fd, + const __u32 flags) +{ + return syscall(__NR_landlock_restrict_self, ruleset_fd, flags); +} +#endif + +static void _init_caps(struct __test_metadata *const _metadata, bool drop_all) +{ + cap_t cap_p; + /* Only these three capabilities are useful for the tests. */ + const cap_value_t caps[] = { + CAP_DAC_OVERRIDE, + CAP_MKNOD, + CAP_SYS_ADMIN, + CAP_SYS_CHROOT, + }; + + cap_p = cap_get_proc(); + EXPECT_NE(NULL, cap_p) { + TH_LOG("Failed to cap_get_proc: %s", strerror(errno)); + } + EXPECT_NE(-1, cap_clear(cap_p)) { + TH_LOG("Failed to cap_clear: %s", strerror(errno)); + } + if (!drop_all) { + EXPECT_NE(-1, cap_set_flag(cap_p, CAP_PERMITTED, + ARRAY_SIZE(caps), caps, CAP_SET)) { + TH_LOG("Failed to cap_set_flag: %s", strerror(errno)); + } + } + EXPECT_NE(-1, cap_set_proc(cap_p)) { + TH_LOG("Failed to cap_set_proc: %s", strerror(errno)); + } + EXPECT_NE(-1, cap_free(cap_p)) { + TH_LOG("Failed to cap_free: %s", strerror(errno)); + } +} + +/* We cannot put such helpers in a library because of kselftest_harness.h . */ +__attribute__((__unused__)) +static void disable_caps(struct __test_metadata *const _metadata) +{ + _init_caps(_metadata, false); +} + +__attribute__((__unused__)) +static void drop_caps(struct __test_metadata *const _metadata) +{ + _init_caps(_metadata, true); +} + +static void _effective_cap(struct __test_metadata *const _metadata, + const cap_value_t caps, const cap_flag_value_t value) +{ + cap_t cap_p; + + cap_p = cap_get_proc(); + EXPECT_NE(NULL, cap_p) { + TH_LOG("Failed to cap_get_proc: %s", strerror(errno)); + } + EXPECT_NE(-1, cap_set_flag(cap_p, CAP_EFFECTIVE, 1, &caps, value)) { + TH_LOG("Failed to cap_set_flag: %s", strerror(errno)); + } + EXPECT_NE(-1, cap_set_proc(cap_p)) { + TH_LOG("Failed to cap_set_proc: %s", strerror(errno)); + } + EXPECT_NE(-1, cap_free(cap_p)) { + TH_LOG("Failed to cap_free: %s", strerror(errno)); + } +} + +__attribute__((__unused__)) +static void set_cap(struct __test_metadata *const _metadata, + const cap_value_t caps) +{ + _effective_cap(_metadata, caps, CAP_SET); +} + +__attribute__((__unused__)) +static void clear_cap(struct __test_metadata *const _metadata, + const cap_value_t caps) +{ + _effective_cap(_metadata, caps, CAP_CLEAR); +} diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config new file mode 100644 index 000000000000..0f0a65287bac --- /dev/null +++ b/tools/testing/selftests/landlock/config @@ -0,0 +1,7 @@ +CONFIG_OVERLAY_FS=y +CONFIG_SECURITY_LANDLOCK=y +CONFIG_SECURITY_PATH=y +CONFIG_SECURITY=y +CONFIG_SHMEM=y +CONFIG_TMPFS_XATTR=y +CONFIG_TMPFS=y diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c new file mode 100644 index 000000000000..10c9a1e4ebd9 --- /dev/null +++ b/tools/testing/selftests/landlock/fs_test.c @@ -0,0 +1,2791 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Landlock tests - Filesystem + * + * Copyright © 2017-2020 Mickaël Salaün + * Copyright © 2020 ANSSI + * Copyright © 2020-2021 Microsoft Corporation + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define TMP_DIR "tmp" +#define BINARY_PATH "./true" + +/* Paths (sibling number and depth) */ +static const char dir_s1d1[] = TMP_DIR "/s1d1"; +static const char file1_s1d1[] = TMP_DIR "/s1d1/f1"; +static const char file2_s1d1[] = TMP_DIR "/s1d1/f2"; +static const char dir_s1d2[] = TMP_DIR "/s1d1/s1d2"; +static const char file1_s1d2[] = TMP_DIR "/s1d1/s1d2/f1"; +static const char file2_s1d2[] = TMP_DIR "/s1d1/s1d2/f2"; +static const char dir_s1d3[] = TMP_DIR "/s1d1/s1d2/s1d3"; +static const char file1_s1d3[] = TMP_DIR "/s1d1/s1d2/s1d3/f1"; +static const char file2_s1d3[] = TMP_DIR "/s1d1/s1d2/s1d3/f2"; + +static const char dir_s2d1[] = TMP_DIR "/s2d1"; +static const char file1_s2d1[] = TMP_DIR "/s2d1/f1"; +static const char dir_s2d2[] = TMP_DIR "/s2d1/s2d2"; +static const char file1_s2d2[] = TMP_DIR "/s2d1/s2d2/f1"; +static const char dir_s2d3[] = TMP_DIR "/s2d1/s2d2/s2d3"; +static const char file1_s2d3[] = TMP_DIR "/s2d1/s2d2/s2d3/f1"; +static const char file2_s2d3[] = TMP_DIR "/s2d1/s2d2/s2d3/f2"; + +static const char dir_s3d1[] = TMP_DIR "/s3d1"; +/* dir_s3d2 is a mount point. */ +static const char dir_s3d2[] = TMP_DIR "/s3d1/s3d2"; +static const char dir_s3d3[] = TMP_DIR "/s3d1/s3d2/s3d3"; + +/* + * layout1 hierarchy: + * + * tmp + * ├── s1d1 + * │   ├── f1 + * │   ├── f2 + * │   └── s1d2 + * │   ├── f1 + * │   ├── f2 + * │   └── s1d3 + * │   ├── f1 + * │   └── f2 + * ├── s2d1 + * │   ├── f1 + * │   └── s2d2 + * │   ├── f1 + * │   └── s2d3 + * │   ├── f1 + * │   └── f2 + * └── s3d1 + * └── s3d2 + * └── s3d3 + */ + +static void mkdir_parents(struct __test_metadata *const _metadata, + const char *const path) +{ + char *walker; + const char *parent; + int i, err; + + ASSERT_NE(path[0], '\0'); + walker = strdup(path); + ASSERT_NE(NULL, walker); + parent = walker; + for (i = 1; walker[i]; i++) { + if (walker[i] != '/') + continue; + walker[i] = '\0'; + err = mkdir(parent, 0700); + ASSERT_FALSE(err && errno != EEXIST) { + TH_LOG("Failed to create directory \"%s\": %s", + parent, strerror(errno)); + } + walker[i] = '/'; + } + free(walker); +} + +static void create_directory(struct __test_metadata *const _metadata, + const char *const path) +{ + mkdir_parents(_metadata, path); + ASSERT_EQ(0, mkdir(path, 0700)) { + TH_LOG("Failed to create directory \"%s\": %s", path, + strerror(errno)); + } +} + +static void create_file(struct __test_metadata *const _metadata, + const char *const path) +{ + mkdir_parents(_metadata, path); + ASSERT_EQ(0, mknod(path, S_IFREG | 0700, 0)) { + TH_LOG("Failed to create file \"%s\": %s", path, + strerror(errno)); + } +} + +static int remove_path(const char *const path) +{ + char *walker; + int i, ret, err = 0; + + walker = strdup(path); + if (!walker) { + err = ENOMEM; + goto out; + } + if (unlink(path) && rmdir(path)) { + if (errno != ENOENT) + err = errno; + goto out; + } + for (i = strlen(walker); i > 0; i--) { + if (walker[i] != '/') + continue; + walker[i] = '\0'; + ret = rmdir(walker); + if (ret) { + if (errno != ENOTEMPTY && errno != EBUSY) + err = errno; + goto out; + } + if (strcmp(walker, TMP_DIR) == 0) + goto out; + } + +out: + free(walker); + return err; +} + +static void prepare_layout(struct __test_metadata *const _metadata) +{ + disable_caps(_metadata); + umask(0077); + create_directory(_metadata, TMP_DIR); + + /* + * Do not pollute the rest of the system: creates a private mount point + * for tests relying on pivot_root(2) and move_mount(2). + */ + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(0, unshare(CLONE_NEWNS)); + ASSERT_EQ(0, mount("tmp", TMP_DIR, "tmpfs", 0, "size=4m,mode=700")); + ASSERT_EQ(0, mount(NULL, TMP_DIR, NULL, MS_PRIVATE | MS_REC, NULL)); + clear_cap(_metadata, CAP_SYS_ADMIN); +} + +static void cleanup_layout(struct __test_metadata *const _metadata) +{ + set_cap(_metadata, CAP_SYS_ADMIN); + EXPECT_EQ(0, umount(TMP_DIR)); + clear_cap(_metadata, CAP_SYS_ADMIN); + EXPECT_EQ(0, remove_path(TMP_DIR)); +} + +static void create_layout1(struct __test_metadata *const _metadata) +{ + create_file(_metadata, file1_s1d1); + create_file(_metadata, file1_s1d2); + create_file(_metadata, file1_s1d3); + create_file(_metadata, file2_s1d1); + create_file(_metadata, file2_s1d2); + create_file(_metadata, file2_s1d3); + + create_file(_metadata, file1_s2d1); + create_file(_metadata, file1_s2d2); + create_file(_metadata, file1_s2d3); + create_file(_metadata, file2_s2d3); + + create_directory(_metadata, dir_s3d2); + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(0, mount("tmp", dir_s3d2, "tmpfs", 0, "size=4m,mode=700")); + clear_cap(_metadata, CAP_SYS_ADMIN); + + ASSERT_EQ(0, mkdir(dir_s3d3, 0700)); +} + +static void remove_layout1(struct __test_metadata *const _metadata) +{ + EXPECT_EQ(0, remove_path(file2_s1d3)); + EXPECT_EQ(0, remove_path(file2_s1d2)); + EXPECT_EQ(0, remove_path(file2_s1d1)); + EXPECT_EQ(0, remove_path(file1_s1d3)); + EXPECT_EQ(0, remove_path(file1_s1d2)); + EXPECT_EQ(0, remove_path(file1_s1d1)); + + EXPECT_EQ(0, remove_path(file2_s2d3)); + EXPECT_EQ(0, remove_path(file1_s2d3)); + EXPECT_EQ(0, remove_path(file1_s2d2)); + EXPECT_EQ(0, remove_path(file1_s2d1)); + + EXPECT_EQ(0, remove_path(dir_s3d3)); + set_cap(_metadata, CAP_SYS_ADMIN); + umount(dir_s3d2); + clear_cap(_metadata, CAP_SYS_ADMIN); + EXPECT_EQ(0, remove_path(dir_s3d2)); +} + +FIXTURE(layout1) { +}; + +FIXTURE_SETUP(layout1) +{ + prepare_layout(_metadata); + + create_layout1(_metadata); +} + +FIXTURE_TEARDOWN(layout1) +{ + remove_layout1(_metadata); + + cleanup_layout(_metadata); +} + +/* + * This helper enables to use the ASSERT_* macros and print the line number + * pointing to the test caller. + */ +static int test_open_rel(const int dirfd, const char *const path, const int flags) +{ + int fd; + + /* Works with file and directories. */ + fd = openat(dirfd, path, flags | O_CLOEXEC); + if (fd < 0) + return errno; + /* + * Mixing error codes from close(2) and open(2) should not lead to any + * (access type) confusion for this test. + */ + if (close(fd) != 0) + return errno; + return 0; +} + +static int test_open(const char *const path, const int flags) +{ + return test_open_rel(AT_FDCWD, path, flags); +} + +TEST_F_FORK(layout1, no_restriction) +{ + ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d1, O_RDONLY)); + ASSERT_EQ(0, test_open(file2_s1d1, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY)); + ASSERT_EQ(0, test_open(file2_s1d2, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); + + ASSERT_EQ(0, test_open(dir_s2d1, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s2d1, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s2d2, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s2d2, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s2d3, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s2d3, O_RDONLY)); + + ASSERT_EQ(0, test_open(dir_s3d1, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s3d2, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s3d3, O_RDONLY)); +} + +TEST_F_FORK(layout1, inval) +{ + struct landlock_path_beneath_attr path_beneath = { + .allowed_access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + .parent_fd = -1, + }; + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }; + int ruleset_fd; + + path_beneath.parent_fd = open(dir_s1d2, O_PATH | O_DIRECTORY | + O_CLOEXEC); + ASSERT_LE(0, path_beneath.parent_fd); + + ruleset_fd = open(dir_s1d1, O_PATH | O_DIRECTORY | O_CLOEXEC); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)); + /* Returns EBADF because ruleset_fd is not a landlock-ruleset FD. */ + ASSERT_EQ(EBADF, errno); + ASSERT_EQ(0, close(ruleset_fd)); + + ruleset_fd = open(dir_s1d1, O_DIRECTORY | O_CLOEXEC); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)); + /* Returns EBADFD because ruleset_fd is not a valid ruleset. */ + ASSERT_EQ(EBADFD, errno); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Gets a real ruleset. */ + ruleset_fd = landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)); + ASSERT_EQ(0, close(path_beneath.parent_fd)); + + /* Tests without O_PATH. */ + path_beneath.parent_fd = open(dir_s1d2, O_DIRECTORY | O_CLOEXEC); + ASSERT_LE(0, path_beneath.parent_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)); + ASSERT_EQ(0, close(path_beneath.parent_fd)); + + /* Tests with a ruleset FD. */ + path_beneath.parent_fd = ruleset_fd; + ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)); + ASSERT_EQ(EBADFD, errno); + + /* Checks unhandled allowed_access. */ + path_beneath.parent_fd = open(dir_s1d2, O_PATH | O_DIRECTORY | + O_CLOEXEC); + ASSERT_LE(0, path_beneath.parent_fd); + + /* Test with legitimate values. */ + path_beneath.allowed_access |= LANDLOCK_ACCESS_FS_EXECUTE; + ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)); + ASSERT_EQ(EINVAL, errno); + path_beneath.allowed_access &= ~LANDLOCK_ACCESS_FS_EXECUTE; + + /* Test with unknown (64-bits) value. */ + path_beneath.allowed_access |= (1ULL << 60); + ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)); + ASSERT_EQ(EINVAL, errno); + path_beneath.allowed_access &= ~(1ULL << 60); + + /* Test with no access. */ + path_beneath.allowed_access = 0; + ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)); + ASSERT_EQ(ENOMSG, errno); + path_beneath.allowed_access &= ~(1ULL << 60); + + ASSERT_EQ(0, close(path_beneath.parent_fd)); + + /* Enforces the ruleset. */ + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0)); + + ASSERT_EQ(0, close(ruleset_fd)); +} + +#define ACCESS_FILE ( \ + LANDLOCK_ACCESS_FS_EXECUTE | \ + LANDLOCK_ACCESS_FS_WRITE_FILE | \ + LANDLOCK_ACCESS_FS_READ_FILE) + +#define ACCESS_LAST LANDLOCK_ACCESS_FS_MAKE_SYM + +#define ACCESS_ALL ( \ + ACCESS_FILE | \ + LANDLOCK_ACCESS_FS_READ_DIR | \ + LANDLOCK_ACCESS_FS_REMOVE_DIR | \ + LANDLOCK_ACCESS_FS_REMOVE_FILE | \ + LANDLOCK_ACCESS_FS_MAKE_CHAR | \ + LANDLOCK_ACCESS_FS_MAKE_DIR | \ + LANDLOCK_ACCESS_FS_MAKE_REG | \ + LANDLOCK_ACCESS_FS_MAKE_SOCK | \ + LANDLOCK_ACCESS_FS_MAKE_FIFO | \ + LANDLOCK_ACCESS_FS_MAKE_BLOCK | \ + ACCESS_LAST) + +TEST_F_FORK(layout1, file_access_rights) +{ + __u64 access; + int err; + struct landlock_path_beneath_attr path_beneath = {}; + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = ACCESS_ALL, + }; + const int ruleset_fd = landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), 0); + + ASSERT_LE(0, ruleset_fd); + + /* Tests access rights for files. */ + path_beneath.parent_fd = open(file1_s1d2, O_PATH | O_CLOEXEC); + ASSERT_LE(0, path_beneath.parent_fd); + for (access = 1; access <= ACCESS_LAST; access <<= 1) { + path_beneath.allowed_access = access; + err = landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0); + if ((access | ACCESS_FILE) == ACCESS_FILE) { + ASSERT_EQ(0, err); + } else { + ASSERT_EQ(-1, err); + ASSERT_EQ(EINVAL, errno); + } + } + ASSERT_EQ(0, close(path_beneath.parent_fd)); +} + +static void add_path_beneath(struct __test_metadata *const _metadata, + const int ruleset_fd, const __u64 allowed_access, + const char *const path) +{ + struct landlock_path_beneath_attr path_beneath = { + .allowed_access = allowed_access, + }; + + path_beneath.parent_fd = open(path, O_PATH | O_CLOEXEC); + ASSERT_LE(0, path_beneath.parent_fd) { + TH_LOG("Failed to open directory \"%s\": %s", path, + strerror(errno)); + } + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)) { + TH_LOG("Failed to update the ruleset with \"%s\": %s", path, + strerror(errno)); + } + ASSERT_EQ(0, close(path_beneath.parent_fd)); +} + +struct rule { + const char *path; + __u64 access; +}; + +#define ACCESS_RO ( \ + LANDLOCK_ACCESS_FS_READ_FILE | \ + LANDLOCK_ACCESS_FS_READ_DIR) + +#define ACCESS_RW ( \ + ACCESS_RO | \ + LANDLOCK_ACCESS_FS_WRITE_FILE) + +static int create_ruleset(struct __test_metadata *const _metadata, + const __u64 handled_access_fs, const struct rule rules[]) +{ + int ruleset_fd, i; + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = handled_access_fs, + }; + + ASSERT_NE(NULL, rules) { + TH_LOG("No rule list"); + } + ASSERT_NE(NULL, rules[0].path) { + TH_LOG("Empty rule list"); + } + + ruleset_fd = landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd) { + TH_LOG("Failed to create a ruleset: %s", strerror(errno)); + } + + for (i = 0; rules[i].path; i++) { + add_path_beneath(_metadata, ruleset_fd, rules[i].access, + rules[i].path); + } + return ruleset_fd; +} + +static void enforce_ruleset(struct __test_metadata *const _metadata, + const int ruleset_fd) +{ + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0)) { + TH_LOG("Failed to enforce ruleset: %s", strerror(errno)); + } +} + +TEST_F_FORK(layout1, proc_nsfs) +{ + const struct rule rules[] = { + { + .path = "/dev/null", + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + {} + }; + struct landlock_path_beneath_attr path_beneath; + const int ruleset_fd = create_ruleset(_metadata, rules[0].access | + LANDLOCK_ACCESS_FS_READ_DIR, rules); + + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, test_open("/proc/self/ns/mnt", O_RDONLY)); + + enforce_ruleset(_metadata, ruleset_fd); + + ASSERT_EQ(EACCES, test_open("/", O_RDONLY)); + ASSERT_EQ(EACCES, test_open("/dev", O_RDONLY)); + ASSERT_EQ(0, test_open("/dev/null", O_RDONLY)); + ASSERT_EQ(EACCES, test_open("/dev/full", O_RDONLY)); + + ASSERT_EQ(EACCES, test_open("/proc", O_RDONLY)); + ASSERT_EQ(EACCES, test_open("/proc/self", O_RDONLY)); + ASSERT_EQ(EACCES, test_open("/proc/self/ns", O_RDONLY)); + /* + * Because nsfs is an internal filesystem, /proc/self/ns/mnt is a + * disconnected path. Such path cannot be identified and must then be + * allowed. + */ + ASSERT_EQ(0, test_open("/proc/self/ns/mnt", O_RDONLY)); + + /* + * Checks that it is not possible to add nsfs-like filesystem + * references to a ruleset. + */ + path_beneath.allowed_access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + path_beneath.parent_fd = open("/proc/self/ns/mnt", O_PATH | O_CLOEXEC); + ASSERT_LE(0, path_beneath.parent_fd); + ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath, 0)); + ASSERT_EQ(EBADFD, errno); + ASSERT_EQ(0, close(path_beneath.parent_fd)); +} + +TEST_F_FORK(layout1, unpriv) { + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = ACCESS_RO, + }, + {} + }; + int ruleset_fd; + + drop_caps(_metadata); + + ruleset_fd = create_ruleset(_metadata, ACCESS_RO, rules); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(-1, landlock_restrict_self(ruleset_fd, 0)); + ASSERT_EQ(EPERM, errno); + + /* enforce_ruleset() calls prctl(no_new_privs). */ + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); +} + +TEST_F_FORK(layout1, effective_access) +{ + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = ACCESS_RO, + }, + { + .path = file1_s2d2, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + char buf; + int reg_fd; + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Tests on a directory. */ + ASSERT_EQ(EACCES, test_open("/", O_RDONLY)); + ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); + + /* Tests on a file. */ + ASSERT_EQ(EACCES, test_open(dir_s2d2, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s2d2, O_RDONLY)); + + /* Checks effective read and write actions. */ + reg_fd = open(file1_s2d2, O_RDWR | O_CLOEXEC); + ASSERT_LE(0, reg_fd); + ASSERT_EQ(1, write(reg_fd, ".", 1)); + ASSERT_LE(0, lseek(reg_fd, 0, SEEK_SET)); + ASSERT_EQ(1, read(reg_fd, &buf, 1)); + ASSERT_EQ('.', buf); + ASSERT_EQ(0, close(reg_fd)); + + /* Just in case, double-checks effective actions. */ + reg_fd = open(file1_s2d2, O_RDONLY | O_CLOEXEC); + ASSERT_LE(0, reg_fd); + ASSERT_EQ(-1, write(reg_fd, &buf, 1)); + ASSERT_EQ(EBADF, errno); + ASSERT_EQ(0, close(reg_fd)); +} + +TEST_F_FORK(layout1, unhandled_access) +{ + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = ACCESS_RO, + }, + {} + }; + /* Here, we only handle read accesses, not write accesses. */ + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RO, rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Because the policy does not handle LANDLOCK_ACCESS_FS_WRITE_FILE, + * opening for write-only should be allowed, but not read-write. + */ + ASSERT_EQ(0, test_open(file1_s1d1, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDWR)); + + ASSERT_EQ(0, test_open(file1_s1d2, O_WRONLY)); + ASSERT_EQ(0, test_open(file1_s1d2, O_RDWR)); +} + +TEST_F_FORK(layout1, ruleset_overlap) +{ + const struct rule rules[] = { + /* These rules should be ORed among them. */ + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_READ_DIR, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks s1d1 hierarchy. */ + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDWR)); + ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY)); + + /* Checks s1d2 hierarchy. */ + ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d2, O_WRONLY)); + ASSERT_EQ(0, test_open(file1_s1d2, O_RDWR)); + ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY)); + + /* Checks s1d3 hierarchy. */ + ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d3, O_WRONLY)); + ASSERT_EQ(0, test_open(file1_s1d3, O_RDWR)); + ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY)); +} + +TEST_F_FORK(layout1, non_overlapping_accesses) +{ + const struct rule layer1[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_MAKE_REG, + }, + {} + }; + const struct rule layer2[] = { + { + .path = dir_s1d3, + .access = LANDLOCK_ACCESS_FS_REMOVE_FILE, + }, + {} + }; + int ruleset_fd; + + ASSERT_EQ(0, unlink(file1_s1d1)); + ASSERT_EQ(0, unlink(file1_s1d2)); + + ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_MAKE_REG, + layer1); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(-1, mknod(file1_s1d1, S_IFREG | 0700, 0)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(0, mknod(file1_s1d2, S_IFREG | 0700, 0)); + ASSERT_EQ(0, unlink(file1_s1d2)); + + ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_REMOVE_FILE, + layer2); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Unchanged accesses for file creation. */ + ASSERT_EQ(-1, mknod(file1_s1d1, S_IFREG | 0700, 0)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(0, mknod(file1_s1d2, S_IFREG | 0700, 0)); + + /* Checks file removing. */ + ASSERT_EQ(-1, unlink(file1_s1d2)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(0, unlink(file1_s1d3)); +} + +TEST_F_FORK(layout1, interleaved_masked_accesses) +{ + /* + * Checks overly restrictive rules: + * layer 1: allows R s1d1/s1d2/s1d3/file1 + * layer 2: allows RW s1d1/s1d2/s1d3 + * allows W s1d1/s1d2 + * denies R s1d1/s1d2 + * layer 3: allows R s1d1 + * layer 4: allows R s1d1/s1d2 + * denies W s1d1/s1d2 + * layer 5: allows R s1d1/s1d2 + * layer 6: allows X ---- + * layer 7: allows W s1d1/s1d2 + * denies R s1d1/s1d2 + */ + const struct rule layer1_read[] = { + /* Allows read access to file1_s1d3 with the first layer. */ + { + .path = file1_s1d3, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + {} + }; + /* First rule with write restrictions. */ + const struct rule layer2_read_write[] = { + /* Start by granting read-write access via its parent directory... */ + { + .path = dir_s1d3, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + /* ...but also denies read access via its grandparent directory. */ + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + {} + }; + const struct rule layer3_read[] = { + /* Allows read access via its great-grandparent directory. */ + { + .path = dir_s1d1, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + {} + }; + const struct rule layer4_read_write[] = { + /* + * Try to confuse the deny access by denying write (but not + * read) access via its grandparent directory. + */ + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + {} + }; + const struct rule layer5_read[] = { + /* + * Try to override layer2's deny read access by explicitly + * allowing read access via file1_s1d3's grandparent. + */ + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + {} + }; + const struct rule layer6_execute[] = { + /* + * Restricts an unrelated file hierarchy with a new access + * (non-overlapping) type. + */ + { + .path = dir_s2d1, + .access = LANDLOCK_ACCESS_FS_EXECUTE, + }, + {} + }; + const struct rule layer7_read_write[] = { + /* + * Finally, denies read access to file1_s1d3 via its + * grandparent. + */ + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + {} + }; + int ruleset_fd; + + ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE, + layer1_read); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks that read access is granted for file1_s1d3 with layer 1. */ + ASSERT_EQ(0, test_open(file1_s1d3, O_RDWR)); + ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY)); + ASSERT_EQ(0, test_open(file2_s1d3, O_WRONLY)); + + ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, layer2_read_write); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks that previous access rights are unchanged with layer 2. */ + ASSERT_EQ(0, test_open(file1_s1d3, O_RDWR)); + ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY)); + ASSERT_EQ(0, test_open(file2_s1d3, O_WRONLY)); + + ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE, + layer3_read); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks that previous access rights are unchanged with layer 3. */ + ASSERT_EQ(0, test_open(file1_s1d3, O_RDWR)); + ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY)); + ASSERT_EQ(0, test_open(file2_s1d3, O_WRONLY)); + + /* This time, denies write access for the file hierarchy. */ + ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, layer4_read_write); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Checks that the only change with layer 4 is that write access is + * denied. + */ + ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file2_s1d3, O_WRONLY)); + + ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE, + layer5_read); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks that previous access rights are unchanged with layer 5. */ + ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(file2_s1d3, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY)); + + ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_EXECUTE, + layer6_execute); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks that previous access rights are unchanged with layer 6. */ + ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(file2_s1d3, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY)); + + ruleset_fd = create_ruleset(_metadata, LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, layer7_read_write); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks read access is now denied with layer 7. */ + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(file2_s1d3, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(file2_s1d3, O_RDONLY)); +} + +TEST_F_FORK(layout1, inherit_subset) +{ + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_READ_DIR, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY)); + + /* Write access is forbidden. */ + ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY)); + /* Readdir access is allowed. */ + ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY)); + + /* Write access is forbidden. */ + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY)); + /* Readdir access is allowed. */ + ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY)); + + /* + * Tests shared rule extension: the following rules should not grant + * any new access, only remove some. Once enforced, these rules are + * ANDed with the previous ones. + */ + add_path_beneath(_metadata, ruleset_fd, LANDLOCK_ACCESS_FS_WRITE_FILE, + dir_s1d2); + /* + * According to ruleset_fd, dir_s1d2 should now have the + * LANDLOCK_ACCESS_FS_READ_FILE and LANDLOCK_ACCESS_FS_WRITE_FILE + * access rights (even if this directory is opened a second time). + * However, when enforcing this updated ruleset, the ruleset tied to + * the current process (i.e. its domain) will still only have the + * dir_s1d2 with LANDLOCK_ACCESS_FS_READ_FILE and + * LANDLOCK_ACCESS_FS_READ_DIR accesses, but + * LANDLOCK_ACCESS_FS_WRITE_FILE must not be allowed because it would + * be a privilege escalation. + */ + enforce_ruleset(_metadata, ruleset_fd); + + /* Same tests and results as above. */ + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY)); + + /* It is still forbidden to write in file1_s1d2. */ + ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY)); + /* Readdir access is still allowed. */ + ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY)); + + /* It is still forbidden to write in file1_s1d3. */ + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY)); + /* Readdir access is still allowed. */ + ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY)); + + /* + * Try to get more privileges by adding new access rights to the parent + * directory: dir_s1d1. + */ + add_path_beneath(_metadata, ruleset_fd, ACCESS_RW, dir_s1d1); + enforce_ruleset(_metadata, ruleset_fd); + + /* Same tests and results as above. */ + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY)); + + /* It is still forbidden to write in file1_s1d2. */ + ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY)); + /* Readdir access is still allowed. */ + ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY)); + + /* It is still forbidden to write in file1_s1d3. */ + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY)); + /* Readdir access is still allowed. */ + ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY)); + + /* + * Now, dir_s1d3 get a new rule tied to it, only allowing + * LANDLOCK_ACCESS_FS_WRITE_FILE. The (kernel internal) difference is + * that there was no rule tied to it before. + */ + add_path_beneath(_metadata, ruleset_fd, LANDLOCK_ACCESS_FS_WRITE_FILE, + dir_s1d3); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Same tests and results as above, except for open(dir_s1d3) which is + * now denied because the new rule mask the rule previously inherited + * from dir_s1d2. + */ + + /* Same tests and results as above. */ + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY)); + + /* It is still forbidden to write in file1_s1d2. */ + ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY)); + /* Readdir access is still allowed. */ + ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY)); + + /* It is still forbidden to write in file1_s1d3. */ + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY)); + /* + * Readdir of dir_s1d3 is still allowed because of the OR policy inside + * the same layer. + */ + ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY)); +} + +TEST_F_FORK(layout1, inherit_superset) +{ + const struct rule rules[] = { + { + .path = dir_s1d3, + .access = ACCESS_RO, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + + /* Readdir access is denied for dir_s1d2. */ + ASSERT_EQ(EACCES, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY)); + /* Readdir access is allowed for dir_s1d3. */ + ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY)); + /* File access is allowed for file1_s1d3. */ + ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); + + /* Now dir_s1d2, parent of dir_s1d3, gets a new rule tied to it. */ + add_path_beneath(_metadata, ruleset_fd, LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_READ_DIR, dir_s1d2); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Readdir access is still denied for dir_s1d2. */ + ASSERT_EQ(EACCES, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY)); + /* Readdir access is still allowed for dir_s1d3. */ + ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY)); + /* File access is still allowed for file1_s1d3. */ + ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); +} + +TEST_F_FORK(layout1, max_layers) +{ + int i, err; + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = ACCESS_RO, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + + ASSERT_LE(0, ruleset_fd); + for (i = 0; i < 64; i++) + enforce_ruleset(_metadata, ruleset_fd); + + for (i = 0; i < 2; i++) { + err = landlock_restrict_self(ruleset_fd, 0); + ASSERT_EQ(-1, err); + ASSERT_EQ(E2BIG, errno); + } + ASSERT_EQ(0, close(ruleset_fd)); +} + +TEST_F_FORK(layout1, empty_or_same_ruleset) +{ + struct landlock_ruleset_attr ruleset_attr = {}; + int ruleset_fd; + + /* Tests empty handled_access_fs. */ + ruleset_fd = landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), 0); + ASSERT_LE(-1, ruleset_fd); + ASSERT_EQ(ENOMSG, errno); + + /* Enforces policy which deny read access to all files. */ + ruleset_attr.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE; + ruleset_fd = landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY)); + + /* Nests a policy which deny read access to all directories. */ + ruleset_attr.handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR; + ruleset_fd = landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY)); + + /* Enforces a second time with the same ruleset. */ + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); +} + +TEST_F_FORK(layout1, rule_on_mountpoint) +{ + const struct rule rules[] = { + { + .path = dir_s1d1, + .access = ACCESS_RO, + }, + { + /* dir_s3d2 is a mount point. */ + .path = dir_s3d2, + .access = ACCESS_RO, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY)); + + ASSERT_EQ(EACCES, test_open(dir_s2d1, O_RDONLY)); + + ASSERT_EQ(EACCES, test_open(dir_s3d1, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s3d2, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s3d3, O_RDONLY)); +} + +TEST_F_FORK(layout1, rule_over_mountpoint) +{ + const struct rule rules[] = { + { + .path = dir_s1d1, + .access = ACCESS_RO, + }, + { + /* dir_s3d2 is a mount point. */ + .path = dir_s3d1, + .access = ACCESS_RO, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY)); + + ASSERT_EQ(EACCES, test_open(dir_s2d1, O_RDONLY)); + + ASSERT_EQ(0, test_open(dir_s3d1, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s3d2, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s3d3, O_RDONLY)); +} + +/* + * This test verifies that we can apply a landlock rule on the root directory + * (which might require special handling). + */ +TEST_F_FORK(layout1, rule_over_root_allow_then_deny) +{ + struct rule rules[] = { + { + .path = "/", + .access = ACCESS_RO, + }, + {} + }; + int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks allowed access. */ + ASSERT_EQ(0, test_open("/", O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY)); + + rules[0].access = LANDLOCK_ACCESS_FS_READ_FILE; + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks denied access (on a directory). */ + ASSERT_EQ(EACCES, test_open("/", O_RDONLY)); + ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY)); +} + +TEST_F_FORK(layout1, rule_over_root_deny) +{ + const struct rule rules[] = { + { + .path = "/", + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks denied access (on a directory). */ + ASSERT_EQ(EACCES, test_open("/", O_RDONLY)); + ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY)); +} + +TEST_F_FORK(layout1, rule_inside_mount_ns) +{ + const struct rule rules[] = { + { + .path = "s3d3", + .access = ACCESS_RO, + }, + {} + }; + int ruleset_fd; + + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(0, syscall(SYS_pivot_root, dir_s3d2, dir_s3d3)) { + TH_LOG("Failed to pivot root: %s", strerror(errno)); + }; + ASSERT_EQ(0, chdir("/")); + clear_cap(_metadata, CAP_SYS_ADMIN); + + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(0, test_open("s3d3", O_RDONLY)); + ASSERT_EQ(EACCES, test_open("/", O_RDONLY)); +} + +TEST_F_FORK(layout1, mount_and_pivot) +{ + const struct rule rules[] = { + { + .path = dir_s3d2, + .access = ACCESS_RO, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(-1, mount(NULL, dir_s3d2, NULL, MS_RDONLY, NULL)); + ASSERT_EQ(EPERM, errno); + ASSERT_EQ(-1, syscall(SYS_pivot_root, dir_s3d2, dir_s3d3)); + ASSERT_EQ(EPERM, errno); + clear_cap(_metadata, CAP_SYS_ADMIN); +} + +TEST_F_FORK(layout1, move_mount) +{ + const struct rule rules[] = { + { + .path = dir_s3d2, + .access = ACCESS_RO, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + + ASSERT_LE(0, ruleset_fd); + + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(0, syscall(SYS_move_mount, AT_FDCWD, dir_s3d2, AT_FDCWD, + dir_s1d2, 0)) { + TH_LOG("Failed to move mount: %s", strerror(errno)); + } + + ASSERT_EQ(0, syscall(SYS_move_mount, AT_FDCWD, dir_s1d2, AT_FDCWD, + dir_s3d2, 0)); + clear_cap(_metadata, CAP_SYS_ADMIN); + + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(-1, syscall(SYS_move_mount, AT_FDCWD, dir_s3d2, AT_FDCWD, + dir_s1d2, 0)); + ASSERT_EQ(EPERM, errno); + clear_cap(_metadata, CAP_SYS_ADMIN); +} + +TEST_F_FORK(layout1, release_inodes) +{ + const struct rule rules[] = { + { + .path = dir_s1d1, + .access = ACCESS_RO, + }, + { + .path = dir_s3d2, + .access = ACCESS_RO, + }, + { + .path = dir_s3d3, + .access = ACCESS_RO, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); + + ASSERT_LE(0, ruleset_fd); + /* Unmount a file hierarchy while it is being used by a ruleset. */ + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(0, umount(dir_s3d2)); + clear_cap(_metadata, CAP_SYS_ADMIN); + + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(0, test_open(file1_s1d1, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(dir_s3d2, O_RDONLY)); + /* This dir_s3d3 would not be allowed and does not exist anyway. */ + ASSERT_EQ(ENOENT, test_open(dir_s3d3, O_RDONLY)); +} + +enum relative_access { + REL_OPEN, + REL_CHDIR, + REL_CHROOT_ONLY, + REL_CHROOT_CHDIR, +}; + +static void test_relative_path(struct __test_metadata *const _metadata, + const enum relative_access rel) +{ + /* + * Common layer to check that chroot doesn't ignore it (i.e. a chroot + * is not a disconnected root directory). + */ + const struct rule layer1_base[] = { + { + .path = TMP_DIR, + .access = ACCESS_RO, + }, + {} + }; + const struct rule layer2_subs[] = { + { + .path = dir_s1d2, + .access = ACCESS_RO, + }, + { + .path = dir_s2d2, + .access = ACCESS_RO, + }, + {} + }; + int dirfd, ruleset_fd; + + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer1_base); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer2_subs); + + ASSERT_LE(0, ruleset_fd); + switch (rel) { + case REL_OPEN: + case REL_CHDIR: + break; + case REL_CHROOT_ONLY: + ASSERT_EQ(0, chdir(dir_s2d2)); + break; + case REL_CHROOT_CHDIR: + ASSERT_EQ(0, chdir(dir_s1d2)); + break; + default: + ASSERT_TRUE(false); + return; + } + + set_cap(_metadata, CAP_SYS_CHROOT); + enforce_ruleset(_metadata, ruleset_fd); + + switch (rel) { + case REL_OPEN: + dirfd = open(dir_s1d2, O_DIRECTORY); + ASSERT_LE(0, dirfd); + break; + case REL_CHDIR: + ASSERT_EQ(0, chdir(dir_s1d2)); + dirfd = AT_FDCWD; + break; + case REL_CHROOT_ONLY: + /* Do chroot into dir_s1d2 (relative to dir_s2d2). */ + ASSERT_EQ(0, chroot("../../s1d1/s1d2")) { + TH_LOG("Failed to chroot: %s", strerror(errno)); + } + dirfd = AT_FDCWD; + break; + case REL_CHROOT_CHDIR: + /* Do chroot into dir_s1d2. */ + ASSERT_EQ(0, chroot(".")) { + TH_LOG("Failed to chroot: %s", strerror(errno)); + } + dirfd = AT_FDCWD; + break; + } + + ASSERT_EQ((rel == REL_CHROOT_CHDIR) ? 0 : EACCES, + test_open_rel(dirfd, "..", O_RDONLY)); + ASSERT_EQ(0, test_open_rel(dirfd, ".", O_RDONLY)); + + if (rel == REL_CHROOT_ONLY) { + /* The current directory is dir_s2d2. */ + ASSERT_EQ(0, test_open_rel(dirfd, "./s2d3", O_RDONLY)); + } else { + /* The current directory is dir_s1d2. */ + ASSERT_EQ(0, test_open_rel(dirfd, "./s1d3", O_RDONLY)); + } + + if (rel == REL_CHROOT_ONLY || rel == REL_CHROOT_CHDIR) { + /* Checks the root dir_s1d2. */ + ASSERT_EQ(0, test_open_rel(dirfd, "/..", O_RDONLY)); + ASSERT_EQ(0, test_open_rel(dirfd, "/", O_RDONLY)); + ASSERT_EQ(0, test_open_rel(dirfd, "/f1", O_RDONLY)); + ASSERT_EQ(0, test_open_rel(dirfd, "/s1d3", O_RDONLY)); + } + + if (rel != REL_CHROOT_CHDIR) { + ASSERT_EQ(EACCES, test_open_rel(dirfd, "../../s1d1", O_RDONLY)); + ASSERT_EQ(0, test_open_rel(dirfd, "../../s1d1/s1d2", O_RDONLY)); + ASSERT_EQ(0, test_open_rel(dirfd, "../../s1d1/s1d2/s1d3", O_RDONLY)); + + ASSERT_EQ(EACCES, test_open_rel(dirfd, "../../s2d1", O_RDONLY)); + ASSERT_EQ(0, test_open_rel(dirfd, "../../s2d1/s2d2", O_RDONLY)); + ASSERT_EQ(0, test_open_rel(dirfd, "../../s2d1/s2d2/s2d3", O_RDONLY)); + } + + if (rel == REL_OPEN) + ASSERT_EQ(0, close(dirfd)); + ASSERT_EQ(0, close(ruleset_fd)); +} + +TEST_F_FORK(layout1, relative_open) +{ + test_relative_path(_metadata, REL_OPEN); +} + +TEST_F_FORK(layout1, relative_chdir) +{ + test_relative_path(_metadata, REL_CHDIR); +} + +TEST_F_FORK(layout1, relative_chroot_only) +{ + test_relative_path(_metadata, REL_CHROOT_ONLY); +} + +TEST_F_FORK(layout1, relative_chroot_chdir) +{ + test_relative_path(_metadata, REL_CHROOT_CHDIR); +} + +static void copy_binary(struct __test_metadata *const _metadata, + const char *const dst_path) +{ + int dst_fd, src_fd; + struct stat statbuf; + + dst_fd = open(dst_path, O_WRONLY | O_TRUNC | O_CLOEXEC); + ASSERT_LE(0, dst_fd) { + TH_LOG("Failed to open \"%s\": %s", dst_path, + strerror(errno)); + } + src_fd = open(BINARY_PATH, O_RDONLY | O_CLOEXEC); + ASSERT_LE(0, src_fd) { + TH_LOG("Failed to open \"" BINARY_PATH "\": %s", + strerror(errno)); + } + ASSERT_EQ(0, fstat(src_fd, &statbuf)); + ASSERT_EQ(statbuf.st_size, sendfile(dst_fd, src_fd, 0, + statbuf.st_size)); + ASSERT_EQ(0, close(src_fd)); + ASSERT_EQ(0, close(dst_fd)); +} + +static void test_execute(struct __test_metadata *const _metadata, + const int err, const char *const path) +{ + int status; + char *const argv[] = {(char *)path, NULL}; + const pid_t child = fork(); + + ASSERT_LE(0, child); + if (child == 0) { + ASSERT_EQ(err ? -1 : 0, execve(path, argv, NULL)) { + TH_LOG("Failed to execute \"%s\": %s", path, + strerror(errno)); + }; + ASSERT_EQ(err, errno); + _exit(_metadata->passed ? 2 : 1); + return; + } + ASSERT_EQ(child, waitpid(child, &status, 0)); + ASSERT_EQ(1, WIFEXITED(status)); + ASSERT_EQ(err ? 2 : 0, WEXITSTATUS(status)) { + TH_LOG("Unexpected return code for \"%s\": %s", path, + strerror(errno)); + }; +} + +TEST_F_FORK(layout1, execute) +{ + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_EXECUTE, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, rules[0].access, + rules); + + ASSERT_LE(0, ruleset_fd); + copy_binary(_metadata, file1_s1d1); + copy_binary(_metadata, file1_s1d2); + copy_binary(_metadata, file1_s1d3); + + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d1, O_RDONLY)); + test_execute(_metadata, EACCES, file1_s1d1); + + ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY)); + test_execute(_metadata, 0, file1_s1d2); + + ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); + test_execute(_metadata, 0, file1_s1d3); +} + +TEST_F_FORK(layout1, link) +{ + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_MAKE_REG, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, rules[0].access, + rules); + + ASSERT_LE(0, ruleset_fd); + + ASSERT_EQ(0, unlink(file1_s1d1)); + ASSERT_EQ(0, unlink(file1_s1d2)); + ASSERT_EQ(0, unlink(file1_s1d3)); + + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(-1, link(file2_s1d1, file1_s1d1)); + ASSERT_EQ(EACCES, errno); + /* Denies linking because of reparenting. */ + ASSERT_EQ(-1, link(file1_s2d1, file1_s1d2)); + ASSERT_EQ(EXDEV, errno); + ASSERT_EQ(-1, link(file2_s1d2, file1_s1d3)); + ASSERT_EQ(EXDEV, errno); + + ASSERT_EQ(0, link(file2_s1d2, file1_s1d2)); + ASSERT_EQ(0, link(file2_s1d3, file1_s1d3)); +} + +TEST_F_FORK(layout1, rename_file) +{ + const struct rule rules[] = { + { + .path = dir_s1d3, + .access = LANDLOCK_ACCESS_FS_REMOVE_FILE, + }, + { + .path = dir_s2d2, + .access = LANDLOCK_ACCESS_FS_REMOVE_FILE, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, rules[0].access, + rules); + + ASSERT_LE(0, ruleset_fd); + + ASSERT_EQ(0, unlink(file1_s1d1)); + ASSERT_EQ(0, unlink(file1_s1d2)); + + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* + * Tries to replace a file, from a directory that allows file removal, + * but to a different directory (which also allows file removal). + */ + ASSERT_EQ(-1, rename(file1_s2d3, file1_s1d3)); + ASSERT_EQ(EXDEV, errno); + ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s2d3, AT_FDCWD, file1_s1d3, + RENAME_EXCHANGE)); + ASSERT_EQ(EXDEV, errno); + ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s2d3, AT_FDCWD, dir_s1d3, + RENAME_EXCHANGE)); + ASSERT_EQ(EXDEV, errno); + + /* + * Tries to replace a file, from a directory that denies file removal, + * to a different directory (which allows file removal). + */ + ASSERT_EQ(-1, rename(file1_s2d1, file1_s1d3)); + ASSERT_EQ(EXDEV, errno); + ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s2d1, AT_FDCWD, file1_s1d3, + RENAME_EXCHANGE)); + ASSERT_EQ(EXDEV, errno); + ASSERT_EQ(-1, renameat2(AT_FDCWD, dir_s2d2, AT_FDCWD, file1_s1d3, + RENAME_EXCHANGE)); + ASSERT_EQ(EXDEV, errno); + + /* Exchanges files and directories that partially allow removal. */ + ASSERT_EQ(-1, renameat2(AT_FDCWD, dir_s2d2, AT_FDCWD, file1_s2d1, + RENAME_EXCHANGE)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s2d1, AT_FDCWD, dir_s2d2, + RENAME_EXCHANGE)); + ASSERT_EQ(EACCES, errno); + + /* Renames files with different parents. */ + ASSERT_EQ(-1, rename(file1_s2d2, file1_s1d2)); + ASSERT_EQ(EXDEV, errno); + ASSERT_EQ(0, unlink(file1_s1d3)); + ASSERT_EQ(-1, rename(file1_s2d1, file1_s1d3)); + ASSERT_EQ(EXDEV, errno); + + /* Exchanges and renames files with same parent. */ + ASSERT_EQ(0, renameat2(AT_FDCWD, file2_s2d3, AT_FDCWD, file1_s2d3, + RENAME_EXCHANGE)); + ASSERT_EQ(0, rename(file2_s2d3, file1_s2d3)); + + /* Exchanges files and directories with same parent, twice. */ + ASSERT_EQ(0, renameat2(AT_FDCWD, file1_s2d2, AT_FDCWD, dir_s2d3, + RENAME_EXCHANGE)); + ASSERT_EQ(0, renameat2(AT_FDCWD, file1_s2d2, AT_FDCWD, dir_s2d3, + RENAME_EXCHANGE)); +} + +TEST_F_FORK(layout1, rename_dir) +{ + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_REMOVE_DIR, + }, + { + .path = dir_s2d1, + .access = LANDLOCK_ACCESS_FS_REMOVE_DIR, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, rules[0].access, + rules); + + ASSERT_LE(0, ruleset_fd); + + /* Empties dir_s1d3 to allow renaming. */ + ASSERT_EQ(0, unlink(file1_s1d3)); + ASSERT_EQ(0, unlink(file2_s1d3)); + + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Exchanges and renames directory to a different parent. */ + ASSERT_EQ(-1, renameat2(AT_FDCWD, dir_s2d3, AT_FDCWD, dir_s1d3, + RENAME_EXCHANGE)); + ASSERT_EQ(EXDEV, errno); + ASSERT_EQ(-1, rename(dir_s2d3, dir_s1d3)); + ASSERT_EQ(EXDEV, errno); + ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s2d2, AT_FDCWD, dir_s1d3, + RENAME_EXCHANGE)); + ASSERT_EQ(EXDEV, errno); + + /* + * Exchanges directory to the same parent, which doesn't allow + * directory removal. + */ + ASSERT_EQ(-1, renameat2(AT_FDCWD, dir_s1d1, AT_FDCWD, dir_s2d1, + RENAME_EXCHANGE)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s1d1, AT_FDCWD, dir_s1d2, + RENAME_EXCHANGE)); + ASSERT_EQ(EACCES, errno); + + /* + * Exchanges and renames directory to the same parent, which allows + * directory removal. + */ + ASSERT_EQ(0, renameat2(AT_FDCWD, dir_s1d3, AT_FDCWD, file1_s1d2, + RENAME_EXCHANGE)); + ASSERT_EQ(0, unlink(dir_s1d3)); + ASSERT_EQ(0, mkdir(dir_s1d3, 0700)); + ASSERT_EQ(0, rename(file1_s1d2, dir_s1d3)); + ASSERT_EQ(0, rmdir(dir_s1d3)); +} + +TEST_F_FORK(layout1, remove_dir) +{ + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_REMOVE_DIR, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, rules[0].access, + rules); + + ASSERT_LE(0, ruleset_fd); + + ASSERT_EQ(0, unlink(file1_s1d1)); + ASSERT_EQ(0, unlink(file1_s1d2)); + ASSERT_EQ(0, unlink(file1_s1d3)); + ASSERT_EQ(0, unlink(file2_s1d3)); + + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(0, rmdir(dir_s1d3)); + ASSERT_EQ(0, mkdir(dir_s1d3, 0700)); + ASSERT_EQ(0, unlinkat(AT_FDCWD, dir_s1d3, AT_REMOVEDIR)); + + /* dir_s1d2 itself cannot be removed. */ + ASSERT_EQ(-1, rmdir(dir_s1d2)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(-1, unlinkat(AT_FDCWD, dir_s1d2, AT_REMOVEDIR)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(-1, rmdir(dir_s1d1)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(-1, unlinkat(AT_FDCWD, dir_s1d1, AT_REMOVEDIR)); + ASSERT_EQ(EACCES, errno); +} + +TEST_F_FORK(layout1, remove_file) +{ + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_REMOVE_FILE, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, rules[0].access, + rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(-1, unlink(file1_s1d1)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(-1, unlinkat(AT_FDCWD, file1_s1d1, 0)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(0, unlink(file1_s1d2)); + ASSERT_EQ(0, unlinkat(AT_FDCWD, file1_s1d3, 0)); +} + +static void test_make_file(struct __test_metadata *const _metadata, + const __u64 access, const mode_t mode, const dev_t dev) +{ + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = access, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, access, rules); + + ASSERT_LE(0, ruleset_fd); + + ASSERT_EQ(0, unlink(file1_s1d1)); + ASSERT_EQ(0, unlink(file2_s1d1)); + ASSERT_EQ(0, mknod(file2_s1d1, mode | 0400, dev)) { + TH_LOG("Failed to make file \"%s\": %s", + file2_s1d1, strerror(errno)); + }; + + ASSERT_EQ(0, unlink(file1_s1d2)); + ASSERT_EQ(0, unlink(file2_s1d2)); + + ASSERT_EQ(0, unlink(file1_s1d3)); + ASSERT_EQ(0, unlink(file2_s1d3)); + + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(-1, mknod(file1_s1d1, mode | 0400, dev)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(-1, link(file2_s1d1, file1_s1d1)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(-1, rename(file2_s1d1, file1_s1d1)); + ASSERT_EQ(EACCES, errno); + + ASSERT_EQ(0, mknod(file1_s1d2, mode | 0400, dev)) { + TH_LOG("Failed to make file \"%s\": %s", + file1_s1d2, strerror(errno)); + }; + ASSERT_EQ(0, link(file1_s1d2, file2_s1d2)); + ASSERT_EQ(0, unlink(file2_s1d2)); + ASSERT_EQ(0, rename(file1_s1d2, file2_s1d2)); + + ASSERT_EQ(0, mknod(file1_s1d3, mode | 0400, dev)); + ASSERT_EQ(0, link(file1_s1d3, file2_s1d3)); + ASSERT_EQ(0, unlink(file2_s1d3)); + ASSERT_EQ(0, rename(file1_s1d3, file2_s1d3)); +} + +TEST_F_FORK(layout1, make_char) +{ + /* Creates a /dev/null device. */ + set_cap(_metadata, CAP_MKNOD); + test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_CHAR, S_IFCHR, + makedev(1, 3)); +} + +TEST_F_FORK(layout1, make_block) +{ + /* Creates a /dev/loop0 device. */ + set_cap(_metadata, CAP_MKNOD); + test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_BLOCK, S_IFBLK, + makedev(7, 0)); +} + +TEST_F_FORK(layout1, make_reg_1) +{ + test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_REG, S_IFREG, 0); +} + +TEST_F_FORK(layout1, make_reg_2) +{ + test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_REG, 0, 0); +} + +TEST_F_FORK(layout1, make_sock) +{ + test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_SOCK, S_IFSOCK, 0); +} + +TEST_F_FORK(layout1, make_fifo) +{ + test_make_file(_metadata, LANDLOCK_ACCESS_FS_MAKE_FIFO, S_IFIFO, 0); +} + +TEST_F_FORK(layout1, make_sym) +{ + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_MAKE_SYM, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, rules[0].access, + rules); + + ASSERT_LE(0, ruleset_fd); + + ASSERT_EQ(0, unlink(file1_s1d1)); + ASSERT_EQ(0, unlink(file2_s1d1)); + ASSERT_EQ(0, symlink("none", file2_s1d1)); + + ASSERT_EQ(0, unlink(file1_s1d2)); + ASSERT_EQ(0, unlink(file2_s1d2)); + + ASSERT_EQ(0, unlink(file1_s1d3)); + ASSERT_EQ(0, unlink(file2_s1d3)); + + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(-1, symlink("none", file1_s1d1)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(-1, link(file2_s1d1, file1_s1d1)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(-1, rename(file2_s1d1, file1_s1d1)); + ASSERT_EQ(EACCES, errno); + + ASSERT_EQ(0, symlink("none", file1_s1d2)); + ASSERT_EQ(0, link(file1_s1d2, file2_s1d2)); + ASSERT_EQ(0, unlink(file2_s1d2)); + ASSERT_EQ(0, rename(file1_s1d2, file2_s1d2)); + + ASSERT_EQ(0, symlink("none", file1_s1d3)); + ASSERT_EQ(0, link(file1_s1d3, file2_s1d3)); + ASSERT_EQ(0, unlink(file2_s1d3)); + ASSERT_EQ(0, rename(file1_s1d3, file2_s1d3)); +} + +TEST_F_FORK(layout1, make_dir) +{ + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_MAKE_DIR, + }, + {} + }; + const int ruleset_fd = create_ruleset(_metadata, rules[0].access, + rules); + + ASSERT_LE(0, ruleset_fd); + + ASSERT_EQ(0, unlink(file1_s1d1)); + ASSERT_EQ(0, unlink(file1_s1d2)); + ASSERT_EQ(0, unlink(file1_s1d3)); + + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Uses file_* as directory names. */ + ASSERT_EQ(-1, mkdir(file1_s1d1, 0700)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(0, mkdir(file1_s1d2, 0700)); + ASSERT_EQ(0, mkdir(file1_s1d3, 0700)); +} + +static int open_proc_fd(struct __test_metadata *const _metadata, const int fd, + const int open_flags) +{ + static const char path_template[] = "/proc/self/fd/%d"; + char procfd_path[sizeof(path_template) + 10]; + const int procfd_path_size = snprintf(procfd_path, sizeof(procfd_path), + path_template, fd); + + ASSERT_LT(procfd_path_size, sizeof(procfd_path)); + return open(procfd_path, open_flags); +} + +TEST_F_FORK(layout1, proc_unlinked_file) +{ + const struct rule rules[] = { + { + .path = file1_s1d2, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + {} + }; + int reg_fd, proc_fd; + const int ruleset_fd = create_ruleset(_metadata, + LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(EACCES, test_open(file1_s1d2, O_RDWR)); + ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY)); + reg_fd = open(file1_s1d2, O_RDONLY | O_CLOEXEC); + ASSERT_LE(0, reg_fd); + ASSERT_EQ(0, unlink(file1_s1d2)); + + proc_fd = open_proc_fd(_metadata, reg_fd, O_RDONLY | O_CLOEXEC); + ASSERT_LE(0, proc_fd); + ASSERT_EQ(0, close(proc_fd)); + + proc_fd = open_proc_fd(_metadata, reg_fd, O_RDWR | O_CLOEXEC); + ASSERT_EQ(-1, proc_fd) { + TH_LOG("Successfully opened /proc/self/fd/%d: %s", + reg_fd, strerror(errno)); + } + ASSERT_EQ(EACCES, errno); + + ASSERT_EQ(0, close(reg_fd)); +} + +TEST_F_FORK(layout1, proc_pipe) +{ + int proc_fd; + int pipe_fds[2]; + char buf = '\0'; + const struct rule rules[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + {} + }; + /* Limits read and write access to files tied to the filesystem. */ + const int ruleset_fd = create_ruleset(_metadata, rules[0].access, + rules); + + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks enforcement for normal files. */ + ASSERT_EQ(0, test_open(file1_s1d2, O_RDWR)); + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDWR)); + + /* Checks access to pipes through FD. */ + ASSERT_EQ(0, pipe2(pipe_fds, O_CLOEXEC)); + ASSERT_EQ(1, write(pipe_fds[1], ".", 1)) { + TH_LOG("Failed to write in pipe: %s", strerror(errno)); + } + ASSERT_EQ(1, read(pipe_fds[0], &buf, 1)); + ASSERT_EQ('.', buf); + + /* Checks write access to pipe through /proc/self/fd . */ + proc_fd = open_proc_fd(_metadata, pipe_fds[1], O_WRONLY | O_CLOEXEC); + ASSERT_LE(0, proc_fd); + ASSERT_EQ(1, write(proc_fd, ".", 1)) { + TH_LOG("Failed to write through /proc/self/fd/%d: %s", + pipe_fds[1], strerror(errno)); + } + ASSERT_EQ(0, close(proc_fd)); + + /* Checks read access to pipe through /proc/self/fd . */ + proc_fd = open_proc_fd(_metadata, pipe_fds[0], O_RDONLY | O_CLOEXEC); + ASSERT_LE(0, proc_fd); + buf = '\0'; + ASSERT_EQ(1, read(proc_fd, &buf, 1)) { + TH_LOG("Failed to read through /proc/self/fd/%d: %s", + pipe_fds[1], strerror(errno)); + } + ASSERT_EQ(0, close(proc_fd)); + + ASSERT_EQ(0, close(pipe_fds[0])); + ASSERT_EQ(0, close(pipe_fds[1])); +} + +FIXTURE(layout1_bind) { +}; + +FIXTURE_SETUP(layout1_bind) +{ + prepare_layout(_metadata); + + create_layout1(_metadata); + + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(0, mount(dir_s1d2, dir_s2d2, NULL, MS_BIND, NULL)); + clear_cap(_metadata, CAP_SYS_ADMIN); +} + +FIXTURE_TEARDOWN(layout1_bind) +{ + set_cap(_metadata, CAP_SYS_ADMIN); + EXPECT_EQ(0, umount(dir_s2d2)); + clear_cap(_metadata, CAP_SYS_ADMIN); + + remove_layout1(_metadata); + + cleanup_layout(_metadata); +} + +static const char bind_dir_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3"; +static const char bind_file1_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3/f1"; + +/* + * layout1_bind hierarchy: + * + * tmp + * ├── s1d1 + * │   ├── f1 + * │   ├── f2 + * │   └── s1d2 + * │   ├── f1 + * │   ├── f2 + * │   └── s1d3 + * │   ├── f1 + * │   └── f2 + * ├── s2d1 + * │   ├── f1 + * │   └── s2d2 + * │   ├── f1 + * │   ├── f2 + * │   └── s1d3 + * │   ├── f1 + * │   └── f2 + * └── s3d1 + * └── s3d2 + * └── s3d3 + */ + +TEST_F_FORK(layout1_bind, no_restriction) +{ + ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d1, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s1d3, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); + + ASSERT_EQ(0, test_open(dir_s2d1, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s2d1, O_RDONLY)); + ASSERT_EQ(0, test_open(dir_s2d2, O_RDONLY)); + ASSERT_EQ(0, test_open(file1_s2d2, O_RDONLY)); + ASSERT_EQ(ENOENT, test_open(dir_s2d3, O_RDONLY)); + ASSERT_EQ(ENOENT, test_open(file1_s2d3, O_RDONLY)); + + ASSERT_EQ(0, test_open(bind_dir_s1d3, O_RDONLY)); + ASSERT_EQ(0, test_open(bind_file1_s1d3, O_RDONLY)); + + ASSERT_EQ(0, test_open(dir_s3d1, O_RDONLY)); +} + +TEST_F_FORK(layout1_bind, same_content_same_file) +{ + /* + * Sets access right on parent directories of both source and + * destination mount points. + */ + const struct rule layer1_parent[] = { + { + .path = dir_s1d1, + .access = ACCESS_RO, + }, + { + .path = dir_s2d1, + .access = ACCESS_RW, + }, + {} + }; + /* + * Sets access rights on the same bind-mounted directories. The result + * should be ACCESS_RW for both directories, but not both hierarchies + * because of the first layer. + */ + const struct rule layer2_mount_point[] = { + { + .path = dir_s1d2, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = dir_s2d2, + .access = ACCESS_RW, + }, + {} + }; + /* Only allow read-access to the s1d3 hierarchies. */ + const struct rule layer3_source[] = { + { + .path = dir_s1d3, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + {} + }; + /* Removes all access rights. */ + const struct rule layer4_destination[] = { + { + .path = bind_file1_s1d3, + .access = LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + {} + }; + int ruleset_fd; + + /* Sets rules for the parent directories. */ + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer1_parent); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks source hierarchy. */ + ASSERT_EQ(0, test_open(file1_s1d1, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY)); + ASSERT_EQ(0, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY)); + + ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY)); + ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY)); + + /* Checks destination hierarchy. */ + ASSERT_EQ(0, test_open(file1_s2d1, O_RDWR)); + ASSERT_EQ(0, test_open(dir_s2d1, O_RDONLY | O_DIRECTORY)); + + ASSERT_EQ(0, test_open(file1_s2d2, O_RDWR)); + ASSERT_EQ(0, test_open(dir_s2d2, O_RDONLY | O_DIRECTORY)); + + /* Sets rules for the mount points. */ + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer2_mount_point); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks source hierarchy. */ + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d1, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(dir_s1d1, O_RDONLY | O_DIRECTORY)); + + ASSERT_EQ(0, test_open(file1_s1d2, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY)); + ASSERT_EQ(0, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY)); + + /* Checks destination hierarchy. */ + ASSERT_EQ(EACCES, test_open(file1_s2d1, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s2d1, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(dir_s2d1, O_RDONLY | O_DIRECTORY)); + + ASSERT_EQ(0, test_open(file1_s2d2, O_RDWR)); + ASSERT_EQ(0, test_open(dir_s2d2, O_RDONLY | O_DIRECTORY)); + ASSERT_EQ(0, test_open(bind_dir_s1d3, O_RDONLY | O_DIRECTORY)); + + /* Sets a (shared) rule only on the source. */ + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer3_source); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks source hierarchy. */ + ASSERT_EQ(EACCES, test_open(file1_s1d2, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d2, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(dir_s1d2, O_RDONLY | O_DIRECTORY)); + + ASSERT_EQ(0, test_open(file1_s1d3, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(dir_s1d3, O_RDONLY | O_DIRECTORY)); + + /* Checks destination hierarchy. */ + ASSERT_EQ(EACCES, test_open(file1_s2d2, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s2d2, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(dir_s2d2, O_RDONLY | O_DIRECTORY)); + + ASSERT_EQ(0, test_open(bind_file1_s1d3, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(bind_file1_s1d3, O_WRONLY)); + ASSERT_EQ(EACCES, test_open(bind_dir_s1d3, O_RDONLY | O_DIRECTORY)); + + /* Sets a (shared) rule only on the destination. */ + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer4_destination); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks source hierarchy. */ + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(file1_s1d3, O_WRONLY)); + + /* Checks destination hierarchy. */ + ASSERT_EQ(EACCES, test_open(bind_file1_s1d3, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(bind_file1_s1d3, O_WRONLY)); +} + +#define LOWER_BASE TMP_DIR "/lower" +#define LOWER_DATA LOWER_BASE "/data" +static const char lower_fl1[] = LOWER_DATA "/fl1"; +static const char lower_dl1[] = LOWER_DATA "/dl1"; +static const char lower_dl1_fl2[] = LOWER_DATA "/dl1/fl2"; +static const char lower_fo1[] = LOWER_DATA "/fo1"; +static const char lower_do1[] = LOWER_DATA "/do1"; +static const char lower_do1_fo2[] = LOWER_DATA "/do1/fo2"; +static const char lower_do1_fl3[] = LOWER_DATA "/do1/fl3"; + +static const char (*lower_base_files[])[] = { + &lower_fl1, + &lower_fo1, + NULL +}; +static const char (*lower_base_directories[])[] = { + &lower_dl1, + &lower_do1, + NULL +}; +static const char (*lower_sub_files[])[] = { + &lower_dl1_fl2, + &lower_do1_fo2, + &lower_do1_fl3, + NULL +}; + +#define UPPER_BASE TMP_DIR "/upper" +#define UPPER_DATA UPPER_BASE "/data" +#define UPPER_WORK UPPER_BASE "/work" +static const char upper_fu1[] = UPPER_DATA "/fu1"; +static const char upper_du1[] = UPPER_DATA "/du1"; +static const char upper_du1_fu2[] = UPPER_DATA "/du1/fu2"; +static const char upper_fo1[] = UPPER_DATA "/fo1"; +static const char upper_do1[] = UPPER_DATA "/do1"; +static const char upper_do1_fo2[] = UPPER_DATA "/do1/fo2"; +static const char upper_do1_fu3[] = UPPER_DATA "/do1/fu3"; + +static const char (*upper_base_files[])[] = { + &upper_fu1, + &upper_fo1, + NULL +}; +static const char (*upper_base_directories[])[] = { + &upper_du1, + &upper_do1, + NULL +}; +static const char (*upper_sub_files[])[] = { + &upper_du1_fu2, + &upper_do1_fo2, + &upper_do1_fu3, + NULL +}; + +#define MERGE_BASE TMP_DIR "/merge" +#define MERGE_DATA MERGE_BASE "/data" +static const char merge_fl1[] = MERGE_DATA "/fl1"; +static const char merge_dl1[] = MERGE_DATA "/dl1"; +static const char merge_dl1_fl2[] = MERGE_DATA "/dl1/fl2"; +static const char merge_fu1[] = MERGE_DATA "/fu1"; +static const char merge_du1[] = MERGE_DATA "/du1"; +static const char merge_du1_fu2[] = MERGE_DATA "/du1/fu2"; +static const char merge_fo1[] = MERGE_DATA "/fo1"; +static const char merge_do1[] = MERGE_DATA "/do1"; +static const char merge_do1_fo2[] = MERGE_DATA "/do1/fo2"; +static const char merge_do1_fl3[] = MERGE_DATA "/do1/fl3"; +static const char merge_do1_fu3[] = MERGE_DATA "/do1/fu3"; + +static const char (*merge_base_files[])[] = { + &merge_fl1, + &merge_fu1, + &merge_fo1, + NULL +}; +static const char (*merge_base_directories[])[] = { + &merge_dl1, + &merge_du1, + &merge_do1, + NULL +}; +static const char (*merge_sub_files[])[] = { + &merge_dl1_fl2, + &merge_du1_fu2, + &merge_do1_fo2, + &merge_do1_fl3, + &merge_do1_fu3, + NULL +}; + +/* + * layout2_overlay hierarchy: + * + * tmp + * ├── lower + * │   └── data + * │   ├── dl1 + * │   │   └── fl2 + * │   ├── do1 + * │   │   ├── fl3 + * │   │   └── fo2 + * │   ├── fl1 + * │   └── fo1 + * ├── merge + * │   └── data + * │   ├── dl1 + * │   │   └── fl2 + * │   ├── do1 + * │   │   ├── fl3 + * │   │   ├── fo2 + * │   │   └── fu3 + * │   ├── du1 + * │   │   └── fu2 + * │   ├── fl1 + * │   ├── fo1 + * │   └── fu1 + * └── upper + * ├── data + * │   ├── do1 + * │   │   ├── fo2 + * │   │   └── fu3 + * │   ├── du1 + * │   │   └── fu2 + * │   ├── fo1 + * │   └── fu1 + * └── work + * └── work + */ + +FIXTURE(layout2_overlay) { +}; + +FIXTURE_SETUP(layout2_overlay) +{ + prepare_layout(_metadata); + + create_directory(_metadata, LOWER_BASE); + set_cap(_metadata, CAP_SYS_ADMIN); + /* Creates tmpfs mount points to get deterministic overlayfs. */ + ASSERT_EQ(0, mount("tmp", LOWER_BASE, "tmpfs", 0, "size=4m,mode=700")); + clear_cap(_metadata, CAP_SYS_ADMIN); + create_file(_metadata, lower_fl1); + create_file(_metadata, lower_dl1_fl2); + create_file(_metadata, lower_fo1); + create_file(_metadata, lower_do1_fo2); + create_file(_metadata, lower_do1_fl3); + + create_directory(_metadata, UPPER_BASE); + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(0, mount("tmp", UPPER_BASE, "tmpfs", 0, "size=4m,mode=700")); + clear_cap(_metadata, CAP_SYS_ADMIN); + create_file(_metadata, upper_fu1); + create_file(_metadata, upper_du1_fu2); + create_file(_metadata, upper_fo1); + create_file(_metadata, upper_do1_fo2); + create_file(_metadata, upper_do1_fu3); + ASSERT_EQ(0, mkdir(UPPER_WORK, 0700)); + + create_directory(_metadata, MERGE_DATA); + set_cap(_metadata, CAP_SYS_ADMIN); + set_cap(_metadata, CAP_DAC_OVERRIDE); + ASSERT_EQ(0, mount("overlay", MERGE_DATA, "overlay", 0, + "lowerdir=" LOWER_DATA + ",upperdir=" UPPER_DATA + ",workdir=" UPPER_WORK)); + clear_cap(_metadata, CAP_DAC_OVERRIDE); + clear_cap(_metadata, CAP_SYS_ADMIN); +} + +FIXTURE_TEARDOWN(layout2_overlay) +{ + EXPECT_EQ(0, remove_path(lower_do1_fl3)); + EXPECT_EQ(0, remove_path(lower_dl1_fl2)); + EXPECT_EQ(0, remove_path(lower_fl1)); + EXPECT_EQ(0, remove_path(lower_do1_fo2)); + EXPECT_EQ(0, remove_path(lower_fo1)); + set_cap(_metadata, CAP_SYS_ADMIN); + EXPECT_EQ(0, umount(LOWER_BASE)); + clear_cap(_metadata, CAP_SYS_ADMIN); + EXPECT_EQ(0, remove_path(LOWER_BASE)); + + EXPECT_EQ(0, remove_path(upper_do1_fu3)); + EXPECT_EQ(0, remove_path(upper_du1_fu2)); + EXPECT_EQ(0, remove_path(upper_fu1)); + EXPECT_EQ(0, remove_path(upper_do1_fo2)); + EXPECT_EQ(0, remove_path(upper_fo1)); + EXPECT_EQ(0, remove_path(UPPER_WORK "/work")); + set_cap(_metadata, CAP_SYS_ADMIN); + EXPECT_EQ(0, umount(UPPER_BASE)); + clear_cap(_metadata, CAP_SYS_ADMIN); + EXPECT_EQ(0, remove_path(UPPER_BASE)); + + set_cap(_metadata, CAP_SYS_ADMIN); + EXPECT_EQ(0, umount(MERGE_DATA)); + clear_cap(_metadata, CAP_SYS_ADMIN); + EXPECT_EQ(0, remove_path(MERGE_DATA)); + + cleanup_layout(_metadata); +} + +TEST_F_FORK(layout2_overlay, no_restriction) +{ + ASSERT_EQ(0, test_open(lower_fl1, O_RDONLY)); + ASSERT_EQ(0, test_open(lower_dl1, O_RDONLY)); + ASSERT_EQ(0, test_open(lower_dl1_fl2, O_RDONLY)); + ASSERT_EQ(0, test_open(lower_fo1, O_RDONLY)); + ASSERT_EQ(0, test_open(lower_do1, O_RDONLY)); + ASSERT_EQ(0, test_open(lower_do1_fo2, O_RDONLY)); + ASSERT_EQ(0, test_open(lower_do1_fl3, O_RDONLY)); + + ASSERT_EQ(0, test_open(upper_fu1, O_RDONLY)); + ASSERT_EQ(0, test_open(upper_du1, O_RDONLY)); + ASSERT_EQ(0, test_open(upper_du1_fu2, O_RDONLY)); + ASSERT_EQ(0, test_open(upper_fo1, O_RDONLY)); + ASSERT_EQ(0, test_open(upper_do1, O_RDONLY)); + ASSERT_EQ(0, test_open(upper_do1_fo2, O_RDONLY)); + ASSERT_EQ(0, test_open(upper_do1_fu3, O_RDONLY)); + + ASSERT_EQ(0, test_open(merge_fl1, O_RDONLY)); + ASSERT_EQ(0, test_open(merge_dl1, O_RDONLY)); + ASSERT_EQ(0, test_open(merge_dl1_fl2, O_RDONLY)); + ASSERT_EQ(0, test_open(merge_fu1, O_RDONLY)); + ASSERT_EQ(0, test_open(merge_du1, O_RDONLY)); + ASSERT_EQ(0, test_open(merge_du1_fu2, O_RDONLY)); + ASSERT_EQ(0, test_open(merge_fo1, O_RDONLY)); + ASSERT_EQ(0, test_open(merge_do1, O_RDONLY)); + ASSERT_EQ(0, test_open(merge_do1_fo2, O_RDONLY)); + ASSERT_EQ(0, test_open(merge_do1_fl3, O_RDONLY)); + ASSERT_EQ(0, test_open(merge_do1_fu3, O_RDONLY)); +} + +#define for_each_path(path_list, path_entry, i) \ + for (i = 0, path_entry = *path_list[i]; path_list[i]; \ + path_entry = *path_list[++i]) + +TEST_F_FORK(layout2_overlay, same_content_different_file) +{ + /* Sets access right on parent directories of both layers. */ + const struct rule layer1_base[] = { + { + .path = LOWER_BASE, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = UPPER_BASE, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = MERGE_BASE, + .access = ACCESS_RW, + }, + {} + }; + const struct rule layer2_data[] = { + { + .path = LOWER_DATA, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = UPPER_DATA, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = MERGE_DATA, + .access = ACCESS_RW, + }, + {} + }; + /* Sets access right on directories inside both layers. */ + const struct rule layer3_subdirs[] = { + { + .path = lower_dl1, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = lower_do1, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = upper_du1, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = upper_do1, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = merge_dl1, + .access = ACCESS_RW, + }, + { + .path = merge_du1, + .access = ACCESS_RW, + }, + { + .path = merge_do1, + .access = ACCESS_RW, + }, + {} + }; + /* Tighten access rights to the files. */ + const struct rule layer4_files[] = { + { + .path = lower_dl1_fl2, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = lower_do1_fo2, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = lower_do1_fl3, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = upper_du1_fu2, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = upper_do1_fo2, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = upper_do1_fu3, + .access = LANDLOCK_ACCESS_FS_READ_FILE, + }, + { + .path = merge_dl1_fl2, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + { + .path = merge_du1_fu2, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + { + .path = merge_do1_fo2, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + { + .path = merge_do1_fl3, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + { + .path = merge_do1_fu3, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + {} + }; + const struct rule layer5_merge_only[] = { + { + .path = MERGE_DATA, + .access = LANDLOCK_ACCESS_FS_READ_FILE | + LANDLOCK_ACCESS_FS_WRITE_FILE, + }, + {} + }; + int ruleset_fd; + size_t i; + const char *path_entry; + + /* Sets rules on base directories (i.e. outside overlay scope). */ + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer1_base); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks lower layer. */ + for_each_path(lower_base_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY)); + } + for_each_path(lower_base_directories, path_entry, i) { + ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY | O_DIRECTORY)); + } + for_each_path(lower_sub_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY)); + } + /* Checks upper layer. */ + for_each_path(upper_base_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY)); + } + for_each_path(upper_base_directories, path_entry, i) { + ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY | O_DIRECTORY)); + } + for_each_path(upper_sub_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY)); + } + /* + * Checks that access rights are independent from the lower and upper + * layers: write access to upper files viewed through the merge point + * is still allowed, and write access to lower file viewed (and copied) + * through the merge point is still allowed. + */ + for_each_path(merge_base_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDWR)); + } + for_each_path(merge_base_directories, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDONLY | O_DIRECTORY)); + } + for_each_path(merge_sub_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDWR)); + } + + /* Sets rules on data directories (i.e. inside overlay scope). */ + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer2_data); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks merge. */ + for_each_path(merge_base_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDWR)); + } + for_each_path(merge_base_directories, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDONLY | O_DIRECTORY)); + } + for_each_path(merge_sub_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDWR)); + } + + /* Same checks with tighter rules. */ + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer3_subdirs); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks changes for lower layer. */ + for_each_path(lower_base_files, path_entry, i) { + ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY)); + } + /* Checks changes for upper layer. */ + for_each_path(upper_base_files, path_entry, i) { + ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY)); + } + /* Checks all merge accesses. */ + for_each_path(merge_base_files, path_entry, i) { + ASSERT_EQ(EACCES, test_open(path_entry, O_RDWR)); + } + for_each_path(merge_base_directories, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDONLY | O_DIRECTORY)); + } + for_each_path(merge_sub_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDWR)); + } + + /* Sets rules directly on overlayed files. */ + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer4_files); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks unchanged accesses on lower layer. */ + for_each_path(lower_sub_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY)); + } + /* Checks unchanged accesses on upper layer. */ + for_each_path(upper_sub_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDONLY)); + ASSERT_EQ(EACCES, test_open(path_entry, O_WRONLY)); + } + /* Checks all merge accesses. */ + for_each_path(merge_base_files, path_entry, i) { + ASSERT_EQ(EACCES, test_open(path_entry, O_RDWR)); + } + for_each_path(merge_base_directories, path_entry, i) { + ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY | O_DIRECTORY)); + } + for_each_path(merge_sub_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDWR)); + } + + /* Only allowes access to the merge hierarchy. */ + ruleset_fd = create_ruleset(_metadata, ACCESS_RW, layer5_merge_only); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + /* Checks new accesses on lower layer. */ + for_each_path(lower_sub_files, path_entry, i) { + ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY)); + } + /* Checks new accesses on upper layer. */ + for_each_path(upper_sub_files, path_entry, i) { + ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY)); + } + /* Checks all merge accesses. */ + for_each_path(merge_base_files, path_entry, i) { + ASSERT_EQ(EACCES, test_open(path_entry, O_RDWR)); + } + for_each_path(merge_base_directories, path_entry, i) { + ASSERT_EQ(EACCES, test_open(path_entry, O_RDONLY | O_DIRECTORY)); + } + for_each_path(merge_sub_files, path_entry, i) { + ASSERT_EQ(0, test_open(path_entry, O_RDWR)); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/landlock/ptrace_test.c b/tools/testing/selftests/landlock/ptrace_test.c new file mode 100644 index 000000000000..15fbef9cc849 --- /dev/null +++ b/tools/testing/selftests/landlock/ptrace_test.c @@ -0,0 +1,337 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Landlock tests - Ptrace + * + * Copyright © 2017-2020 Mickaël Salaün + * Copyright © 2019-2020 ANSSI + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +static void create_domain(struct __test_metadata *const _metadata) +{ + int ruleset_fd; + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_MAKE_BLOCK, + }; + + ruleset_fd = landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), 0); + EXPECT_LE(0, ruleset_fd) { + TH_LOG("Failed to create a ruleset: %s", strerror(errno)); + } + EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + EXPECT_EQ(0, landlock_restrict_self(ruleset_fd, 0)); + EXPECT_EQ(0, close(ruleset_fd)); +} + +static int test_ptrace_read(const pid_t pid) +{ + static const char path_template[] = "/proc/%d/environ"; + char procenv_path[sizeof(path_template) + 10]; + int procenv_path_size, fd; + + procenv_path_size = snprintf(procenv_path, sizeof(procenv_path), + path_template, pid); + if (procenv_path_size >= sizeof(procenv_path)) + return E2BIG; + + fd = open(procenv_path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return errno; + /* + * Mixing error codes from close(2) and open(2) should not lead to any + * (access type) confusion for this test. + */ + if (close(fd) != 0) + return errno; + return 0; +} + +FIXTURE(hierarchy) { }; + +FIXTURE_VARIANT(hierarchy) { + const bool domain_both; + const bool domain_parent; + const bool domain_child; +}; + +/* + * Test multiple tracing combinations between a parent process P1 and a child + * process P2. + * + * Yama's scoped ptrace is presumed disabled. If enabled, this optional + * restriction is enforced in addition to any Landlock check, which means that + * all P2 requests to trace P1 would be denied. + */ + +/* + * No domain + * + * P1-. P1 -> P2 : allow + * \ P2 -> P1 : allow + * 'P2 + */ +FIXTURE_VARIANT_ADD(hierarchy, allow_without_domain) { + .domain_both = false, + .domain_parent = false, + .domain_child = false, +}; + +/* + * Child domain + * + * P1--. P1 -> P2 : allow + * \ P2 -> P1 : deny + * .'-----. + * | P2 | + * '------' + */ +FIXTURE_VARIANT_ADD(hierarchy, allow_with_one_domain) { + .domain_both = false, + .domain_parent = false, + .domain_child = true, +}; + +/* + * Parent domain + * .------. + * | P1 --. P1 -> P2 : deny + * '------' \ P2 -> P1 : allow + * ' + * P2 + */ +FIXTURE_VARIANT_ADD(hierarchy, deny_with_parent_domain) { + .domain_both = false, + .domain_parent = true, + .domain_child = false, +}; + +/* + * Parent + child domain (siblings) + * .------. + * | P1 ---. P1 -> P2 : deny + * '------' \ P2 -> P1 : deny + * .---'--. + * | P2 | + * '------' + */ +FIXTURE_VARIANT_ADD(hierarchy, deny_with_sibling_domain) { + .domain_both = false, + .domain_parent = true, + .domain_child = true, +}; + +/* + * Same domain (inherited) + * .-------------. + * | P1----. | P1 -> P2 : allow + * | \ | P2 -> P1 : allow + * | ' | + * | P2 | + * '-------------' + */ +FIXTURE_VARIANT_ADD(hierarchy, allow_sibling_domain) { + .domain_both = true, + .domain_parent = false, + .domain_child = false, +}; + +/* + * Inherited + child domain + * .-----------------. + * | P1----. | P1 -> P2 : allow + * | \ | P2 -> P1 : deny + * | .-'----. | + * | | P2 | | + * | '------' | + * '-----------------' + */ +FIXTURE_VARIANT_ADD(hierarchy, allow_with_nested_domain) { + .domain_both = true, + .domain_parent = false, + .domain_child = true, +}; + +/* + * Inherited + parent domain + * .-----------------. + * |.------. | P1 -> P2 : deny + * || P1 ----. | P2 -> P1 : allow + * |'------' \ | + * | ' | + * | P2 | + * '-----------------' + */ +FIXTURE_VARIANT_ADD(hierarchy, deny_with_nested_and_parent_domain) { + .domain_both = true, + .domain_parent = true, + .domain_child = false, +}; + +/* + * Inherited + parent and child domain (siblings) + * .-----------------. + * | .------. | P1 -> P2 : deny + * | | P1 . | P2 -> P1 : deny + * | '------'\ | + * | \ | + * | .--'---. | + * | | P2 | | + * | '------' | + * '-----------------' + */ +FIXTURE_VARIANT_ADD(hierarchy, deny_with_forked_domain) { + .domain_both = true, + .domain_parent = true, + .domain_child = true, +}; + +FIXTURE_SETUP(hierarchy) +{ } + +FIXTURE_TEARDOWN(hierarchy) +{ } + +/* Test PTRACE_TRACEME and PTRACE_ATTACH for parent and child. */ +TEST_F(hierarchy, trace) +{ + pid_t child, parent; + int status, err_proc_read; + int pipe_child[2], pipe_parent[2]; + char buf_parent; + long ret; + + /* + * Removes all effective and permitted capabilities to not interfere + * with cap_ptrace_access_check() in case of PTRACE_MODE_FSCREDS. + */ + drop_caps(_metadata); + + parent = getpid(); + ASSERT_EQ(0, pipe2(pipe_child, O_CLOEXEC)); + ASSERT_EQ(0, pipe2(pipe_parent, O_CLOEXEC)); + if (variant->domain_both) { + create_domain(_metadata); + if (!_metadata->passed) + /* Aborts before forking. */ + return; + } + + child = fork(); + ASSERT_LE(0, child); + if (child == 0) { + char buf_child; + + ASSERT_EQ(0, close(pipe_parent[1])); + ASSERT_EQ(0, close(pipe_child[0])); + if (variant->domain_child) + create_domain(_metadata); + + /* Waits for the parent to be in a domain, if any. */ + ASSERT_EQ(1, read(pipe_parent[0], &buf_child, 1)); + + /* Tests PTRACE_ATTACH and PTRACE_MODE_READ on the parent. */ + err_proc_read = test_ptrace_read(parent); + ret = ptrace(PTRACE_ATTACH, parent, NULL, 0); + if (variant->domain_child) { + EXPECT_EQ(-1, ret); + EXPECT_EQ(EPERM, errno); + EXPECT_EQ(EACCES, err_proc_read); + } else { + EXPECT_EQ(0, ret); + EXPECT_EQ(0, err_proc_read); + } + if (ret == 0) { + ASSERT_EQ(parent, waitpid(parent, &status, 0)); + ASSERT_EQ(1, WIFSTOPPED(status)); + ASSERT_EQ(0, ptrace(PTRACE_DETACH, parent, NULL, 0)); + } + + /* Tests child PTRACE_TRACEME. */ + ret = ptrace(PTRACE_TRACEME); + if (variant->domain_parent) { + EXPECT_EQ(-1, ret); + EXPECT_EQ(EPERM, errno); + } else { + EXPECT_EQ(0, ret); + } + + /* + * Signals that the PTRACE_ATTACH test is done and the + * PTRACE_TRACEME test is ongoing. + */ + ASSERT_EQ(1, write(pipe_child[1], ".", 1)); + + if (!variant->domain_parent) { + ASSERT_EQ(0, raise(SIGSTOP)); + } + + /* Waits for the parent PTRACE_ATTACH test. */ + ASSERT_EQ(1, read(pipe_parent[0], &buf_child, 1)); + _exit(_metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE); + return; + } + + ASSERT_EQ(0, close(pipe_child[1])); + ASSERT_EQ(0, close(pipe_parent[0])); + if (variant->domain_parent) + create_domain(_metadata); + + /* Signals that the parent is in a domain, if any. */ + ASSERT_EQ(1, write(pipe_parent[1], ".", 1)); + + /* + * Waits for the child to test PTRACE_ATTACH on the parent and start + * testing PTRACE_TRACEME. + */ + ASSERT_EQ(1, read(pipe_child[0], &buf_parent, 1)); + + /* Tests child PTRACE_TRACEME. */ + if (!variant->domain_parent) { + ASSERT_EQ(child, waitpid(child, &status, 0)); + ASSERT_EQ(1, WIFSTOPPED(status)); + ASSERT_EQ(0, ptrace(PTRACE_DETACH, child, NULL, 0)); + } else { + /* The child should not be traced by the parent. */ + EXPECT_EQ(-1, ptrace(PTRACE_DETACH, child, NULL, 0)); + EXPECT_EQ(ESRCH, errno); + } + + /* Tests PTRACE_ATTACH and PTRACE_MODE_READ on the child. */ + err_proc_read = test_ptrace_read(child); + ret = ptrace(PTRACE_ATTACH, child, NULL, 0); + if (variant->domain_parent) { + EXPECT_EQ(-1, ret); + EXPECT_EQ(EPERM, errno); + EXPECT_EQ(EACCES, err_proc_read); + } else { + EXPECT_EQ(0, ret); + EXPECT_EQ(0, err_proc_read); + } + if (ret == 0) { + ASSERT_EQ(child, waitpid(child, &status, 0)); + ASSERT_EQ(1, WIFSTOPPED(status)); + ASSERT_EQ(0, ptrace(PTRACE_DETACH, child, NULL, 0)); + } + + /* Signals that the parent PTRACE_ATTACH test is done. */ + ASSERT_EQ(1, write(pipe_parent[1], ".", 1)); + ASSERT_EQ(child, waitpid(child, &status, 0)); + if (WIFSIGNALED(status) || !WIFEXITED(status) || + WEXITSTATUS(status) != EXIT_SUCCESS) + _metadata->passed = 0; +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/landlock/true.c b/tools/testing/selftests/landlock/true.c new file mode 100644 index 000000000000..3f9ccbf52783 --- /dev/null +++ b/tools/testing/selftests/landlock/true.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +int main(void) +{ + return 0; +} -- cgit v1.2.3 From 3532b0b4352ce79400b0aa68414f1a0fc422b920 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Thu, 22 Apr 2021 17:41:23 +0200 Subject: landlock: Enable user space to infer supported features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new flag LANDLOCK_CREATE_RULESET_VERSION to landlock_create_ruleset(2). This enables to retreive a Landlock ABI version that is useful to efficiently follow a best-effort security approach. Indeed, it would be a missed opportunity to abort the whole sandbox building, because some features are unavailable, instead of protecting users as much as possible with the subset of features provided by the running kernel. This new flag enables user space to identify the minimum set of Landlock features supported by the running kernel without relying on a filesystem interface (e.g. /proc/version, which might be inaccessible) nor testing multiple syscall argument combinations (i.e. syscall bisection). New Landlock features will be documented and tied to a minimum version number (greater than 1). The current version will be incremented for each new kernel release supporting new Landlock features. User space libraries can leverage this information to seamlessly restrict processes as much as possible while being compatible with newer APIs. This is a much more lighter approach than the previous landlock_get_features(2): the complexity is pushed to user space libraries. This flag meets similar needs as securityfs versions: selinux/policyvers, apparmor/features/*/version* and tomoyo/version. Supporting this flag now will be convenient for backward compatibility. Cc: Arnd Bergmann Cc: James Morris Cc: Jann Horn Cc: Kees Cook Cc: Serge E. Hallyn Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20210422154123.13086-14-mic@digikod.net Signed-off-by: James Morris --- include/uapi/linux/landlock.h | 8 +++++ security/landlock/syscalls.c | 17 +++++++--- tools/testing/selftests/landlock/base_test.c | 47 ++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index ba946a1e40b2..b3d952067f59 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -27,6 +27,14 @@ struct landlock_ruleset_attr { __u64 handled_access_fs; }; +/* + * sys_landlock_create_ruleset() flags: + * + * - %LANDLOCK_CREATE_RULESET_VERSION: Get the highest supported Landlock ABI + * version. + */ +#define LANDLOCK_CREATE_RULESET_VERSION (1U << 0) + /** * enum landlock_rule_type - Landlock rule type * diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index 93620ad7593b..32396962f04d 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -128,6 +128,8 @@ static const struct file_operations ruleset_fops = { .write = fop_dummy_write, }; +#define LANDLOCK_ABI_VERSION 1 + /** * sys_landlock_create_ruleset - Create a new ruleset * @@ -135,15 +137,19 @@ static const struct file_operations ruleset_fops = { * the new ruleset. * @size: Size of the pointed &struct landlock_ruleset_attr (needed for * backward and forward compatibility). - * @flags: Must be 0. + * @flags: Supported value: %LANDLOCK_CREATE_RULESET_VERSION. * * This system call enables to create a new Landlock ruleset, and returns the * related file descriptor on success. * + * If @flags is %LANDLOCK_CREATE_RULESET_VERSION and @attr is NULL and @size is + * 0, then the returned value is the highest supported Landlock ABI version + * (starting at 1). + * * Possible returned errors are: * * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; - * - EINVAL: @flags is not 0, or unknown access, or too small @size; + * - EINVAL: unknown @flags, or unknown access, or too small @size; * - E2BIG or EFAULT: @attr or @size inconsistencies; * - ENOMSG: empty &landlock_ruleset_attr.handled_access_fs. */ @@ -161,9 +167,12 @@ SYSCALL_DEFINE3(landlock_create_ruleset, if (!landlock_initialized) return -EOPNOTSUPP; - /* No flag for now. */ - if (flags) + if (flags) { + if ((flags == LANDLOCK_CREATE_RULESET_VERSION) + && !attr && !size) + return LANDLOCK_ABI_VERSION; return -EINVAL; + } /* Copies raw user space buffer. */ err = copy_min_struct_from_user(&ruleset_attr, sizeof(ruleset_attr), diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index 262c3c8d953a..ca40abe9daa8 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -63,6 +63,53 @@ TEST(inconsistent_attr) { free(buf); } +TEST(abi_version) { + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, + }; + ASSERT_EQ(1, landlock_create_ruleset(NULL, 0, + LANDLOCK_CREATE_RULESET_VERSION)); + + ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, + LANDLOCK_CREATE_RULESET_VERSION)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(NULL, sizeof(ruleset_attr), + LANDLOCK_CREATE_RULESET_VERSION)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), + LANDLOCK_CREATE_RULESET_VERSION)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(NULL, 0, + LANDLOCK_CREATE_RULESET_VERSION | 1 << 31)); + ASSERT_EQ(EINVAL, errno); +} + +TEST(inval_create_ruleset_flags) { + const int last_flag = LANDLOCK_CREATE_RULESET_VERSION; + const int invalid_flag = last_flag << 1; + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, + }; + + ASSERT_EQ(-1, landlock_create_ruleset(NULL, 0, invalid_flag)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, invalid_flag)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(NULL, sizeof(ruleset_attr), + invalid_flag)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, + sizeof(ruleset_attr), invalid_flag)); + ASSERT_EQ(EINVAL, errno); +} + TEST(empty_path_beneath_attr) { const struct landlock_ruleset_attr ruleset_attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_EXECUTE, -- cgit v1.2.3 From 3ddb3fd8cdb0a6c11b7c8d91ba42d84c4ea3cc43 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 22 Apr 2021 21:18:22 +0200 Subject: signal, perf: Fix siginfo_t by avoiding u64 on 32-bit architectures The alignment of a structure is that of its largest member. On architectures like 32-bit Arm (but not e.g. 32-bit x86) 64-bit integers will require 64-bit alignment and not its natural word size. This means that there is no portable way to add 64-bit integers to siginfo_t on 32-bit architectures without breaking the ABI, because siginfo_t does not yet (and therefore likely never will) contain 64-bit fields on 32-bit architectures. Adding a 64-bit integer could change the alignment of the union after the 3 initial int si_signo, si_errno, si_code, thus introducing 4 bytes of padding shifting the entire union, which would break the ABI. One alternative would be to use the __packed attribute, however, it is non-standard C. Given siginfo_t has definitions outside the Linux kernel in various standard libraries that can be compiled with any number of different compilers (not just those we rely on), using non-standard attributes on siginfo_t should be avoided to ensure portability. In the case of the si_perf field, word size is sufficient since there is no exact requirement on size, given the data it contains is user-defined via perf_event_attr::sig_data. On 32-bit architectures, any excess bits of perf_event_attr::sig_data will therefore be truncated when copying into si_perf. Since si_perf is intended to disambiguate events (e.g. encoding relevant information if there are more events of the same type), 32 bits should provide enough entropy to do so on 32-bit architectures. For 64-bit architectures, no change is intended. Fixes: fb6cc127e0b6 ("signal: Introduce TRAP_PERF si_code and si_perf to siginfo") Reported-by: Marek Szyprowski Reported-by: Jon Hunter Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marek Szyprowski Tested-by: Jon Hunter Link: https://lkml.kernel.org/r/20210422191823.79012-1-elver@google.com --- include/linux/compat.h | 2 +- include/uapi/asm-generic/siginfo.h | 2 +- tools/testing/selftests/perf_events/sigtrap_threads.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/include/linux/compat.h b/include/linux/compat.h index c8821d966812..f0d2dd35d408 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -237,7 +237,7 @@ typedef struct compat_siginfo { u32 _pkey; } _addr_pkey; /* used when si_code=TRAP_PERF */ - compat_u64 _perf; + compat_ulong_t _perf; }; } _sigfault; diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index d0bb9125c853..03d6f6d2c1fe 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -92,7 +92,7 @@ union __sifields { __u32 _pkey; } _addr_pkey; /* used when si_code=TRAP_PERF */ - __u64 _perf; + unsigned long _perf; }; } _sigfault; diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c b/tools/testing/selftests/perf_events/sigtrap_threads.c index 9c0fd442da60..78ddf5e11625 100644 --- a/tools/testing/selftests/perf_events/sigtrap_threads.c +++ b/tools/testing/selftests/perf_events/sigtrap_threads.c @@ -44,7 +44,7 @@ static struct { } ctx; /* Unique value to check si_perf is correctly set from perf_event_attr::sig_data. */ -#define TEST_SIG_DATA(addr) (~(uint64_t)(addr)) +#define TEST_SIG_DATA(addr) (~(unsigned long)(addr)) static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr) { -- cgit v1.2.3 From c8d0260cdd96fdccdef0509c4160e28a1012a5d7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 23 Apr 2021 14:19:43 +0200 Subject: selftests: net: mirror_gre_vlan_bridge_1q: Make an FDB entry static The FDB roaming test installs a destination MAC address on the wrong interface of an FDB database and tests whether the mirroring fails, because packets are sent to the wrong port. The test by mistake installs the FDB entry as local. This worked previously, because drivers were notified of local FDB entries in the same way as of static entries. However that has been fixed in the commit 6ab4c3117aec ("net: bridge: don't notify switchdev for local FDB addresses"), and local entries are not notified anymore. As a result, the HW is not reconfigured for the FDB roam, and mirroring keeps working, failing the test. To fix the issue, mark the FDB entry as static. Fixes: 9c7c8a82442c ("selftests: forwarding: mirror_gre_vlan_bridge_1q: Add more tests") Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh index c02291e9841e..880e3ab9d088 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh @@ -271,7 +271,7 @@ test_span_gre_fdb_roaming() while ((RET == 0)); do bridge fdb del dev $swp3 $h3mac vlan 555 master 2>/dev/null - bridge fdb add dev $swp2 $h3mac vlan 555 master + bridge fdb add dev $swp2 $h3mac vlan 555 master static sleep 1 fail_test_span_gre_dir $tundev ingress -- cgit v1.2.3 From b6fc2f212108b3676f54d00a2c38e3bc36753980 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Fri, 23 Apr 2021 14:19:44 +0200 Subject: selftests: mlxsw: Remove a redundant if statement in port_scale test Currently, the error return code of the failure condition is lost after using an if statement, so the test doesn't fail when it should. Remove the if statement that separates the condition and the error code check, so the test won't always pass. Fixes: 5154b1b826d9b ("selftests: mlxsw: Add a scale test for physical ports") Reported-by: Ido Schimmel Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/mlxsw/port_scale.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh index f813ffefc07e..65f43a7ce9c9 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/port_scale.sh @@ -55,10 +55,6 @@ port_test() | jq '.[][][] | select(.name=="physical_ports") |.["occ"]') [[ $occ -eq $max_ports ]] - if [[ $should_fail -eq 0 ]]; then - check_err $? "Mismatch ports number: Expected $max_ports, got $occ." - else - check_err_fail $should_fail $? "Reached more ports than expected" - fi + check_err_fail $should_fail $? "Attempt to create $max_ports ports (actual result $occ)" } -- cgit v1.2.3 From 1f1c92139e36223b89d8140f2b72f75e79baf8bd Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Fri, 23 Apr 2021 14:19:45 +0200 Subject: selftests: mlxsw: Remove a redundant if statement in tc_flower_scale test Currently, the error return code of the failure condition is lost after using an if statement, so the test doesn't fail when it should. Remove the if statement that separates the condition and the error code check, so the test won't always pass. Fixes: abfce9e062021 ("selftests: mlxsw: Reduce running time using offload indication") Reported-by: Ido Schimmel Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh index cc0f07e72cf2..aa74be9f47c8 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh @@ -98,11 +98,7 @@ __tc_flower_test() jq -r '[ .[] | select(.kind == "flower") | .options | .in_hw ]' | jq .[] | wc -l) [[ $((offload_count - 1)) -eq $count ]] - if [[ $should_fail -eq 0 ]]; then - check_err $? "Offload mismatch" - else - check_err_fail $should_fail $? "Offload more than expacted" - fi + check_err_fail $should_fail $? "Attempt to offload $count rules (actual result $((offload_count - 1)))" } tc_flower_test() -- cgit v1.2.3 From 059b18e21c631b0eb1668831ae99f4764fb8e7eb Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Fri, 23 Apr 2021 14:19:46 +0200 Subject: selftests: mlxsw: Return correct error code in resource scale tests Currently, the resource scale test checks a few cases, when the error code resets between the cases. So for example, if one case fails and the consecutive case passes, the error code eventually will fit the last test and will be 0. Save a new return code that will hold the 'or' return codes of all the cases, so the final return code will consider all the cases. Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- .../testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh | 4 +++- tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh index 4a1c9328555f..50654f8a8c37 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh @@ -30,6 +30,7 @@ trap cleanup EXIT ALL_TESTS="router tc_flower mirror_gre tc_police port" for current_test in ${TESTS:-$ALL_TESTS}; do + RET_FIN=0 source ${current_test}_scale.sh num_netifs_var=${current_test^^}_NUM_NETIFS @@ -48,8 +49,9 @@ for current_test in ${TESTS:-$ALL_TESTS}; do else log_test "'$current_test' overflow $target" fi + RET_FIN=$(( RET_FIN || RET )) done done current_test="" -exit "$RET" +exit "$RET_FIN" diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh index 087a884f66cd..685dfb3478b3 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh @@ -24,6 +24,7 @@ trap cleanup EXIT ALL_TESTS="router tc_flower mirror_gre tc_police port" for current_test in ${TESTS:-$ALL_TESTS}; do + RET_FIN=0 source ${current_test}_scale.sh num_netifs_var=${current_test^^}_NUM_NETIFS @@ -50,8 +51,9 @@ for current_test in ${TESTS:-$ALL_TESTS}; do log_test "'$current_test' [$profile] overflow $target" fi done + RET_FIN=$(( RET_FIN || RET )) done done current_test="" -exit "$RET" +exit "$RET_FIN" -- cgit v1.2.3 From dda7f4fa55839baeb72ae040aeaf9ccf89d3e416 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 23 Apr 2021 14:19:47 +0200 Subject: selftests: mlxsw: Increase the tolerance of backlog buildup The intention behind this test is to make sure that qdisc limit is correctly projected to the HW. However, first, due to rounding in the qdisc, and then in the driver, the number cannot actually be accurate. And second, the approach to testing this is to oversubscribe the port with traffic generated on the same switch. The actual backlog size therefore fluctuates. In practice, this test proved to be noisier than the rest, and spuriously fails every now and then. Increase the tolerance to 10 % to avoid these issues. Signed-off-by: Petr Machata Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh index b0cb1aaffdda..33ddd01689be 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh @@ -507,8 +507,8 @@ do_red_test() check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0." local diff=$((limit - backlog)) pct=$((100 * diff / limit)) - ((0 <= pct && pct <= 5)) - check_err $? "backlog $backlog / $limit expected <= 5% distance" + ((0 <= pct && pct <= 10)) + check_err $? "backlog $backlog / $limit expected <= 10% distance" log_test "TC $((vlan - 10)): RED backlog > limit" stop_traffic -- cgit v1.2.3 From 1233898ab758cbcf5f6fea10b8dd16a0b2c24fab Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 23 Apr 2021 14:19:48 +0200 Subject: selftests: mlxsw: Fix mausezahn invocation in ERSPAN scale test The mirror_gre_scale test creates as many ERSPAN sessions as the underlying chip supports, and tests that they all work. In order to determine that it issues a stream of ICMP packets and checks if they are mirrored as expected. However, the mausezahn invocation missed the -6 flag to identify the use of IPv6 protocol, and was sending ICMP messages over IPv6, as opposed to ICMP6. It also didn't pass an explicit source IP address, which apparently worked at some point in the past, but does not anymore. To fix these issues, extend the function mirror_test() in mirror_lib by detecting the IPv6 protocol addresses, and using a different ICMP scheme. Fix __mirror_gre_test() in the selftest itself to pass a source IP address. Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- .../selftests/drivers/net/mlxsw/mirror_gre_scale.sh | 3 ++- tools/testing/selftests/net/forwarding/mirror_lib.sh | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh index 6f3a70df63bc..e00435753008 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh @@ -120,12 +120,13 @@ __mirror_gre_test() sleep 5 for ((i = 0; i < count; ++i)); do + local sip=$(mirror_gre_ipv6_addr 1 $i)::1 local dip=$(mirror_gre_ipv6_addr 1 $i)::2 local htun=h3-gt6-$i local message icmp6_capture_install $htun - mirror_test v$h1 "" $dip $htun 100 10 + mirror_test v$h1 $sip $dip $htun 100 10 icmp6_capture_uninstall $htun done } diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh index 13db1cb50e57..6406cd76a19d 100644 --- a/tools/testing/selftests/net/forwarding/mirror_lib.sh +++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh @@ -20,6 +20,13 @@ mirror_uninstall() tc filter del dev $swp1 $direction pref 1000 } +is_ipv6() +{ + local addr=$1; shift + + [[ -z ${addr//[0-9a-fA-F:]/} ]] +} + mirror_test() { local vrf_name=$1; shift @@ -29,9 +36,17 @@ mirror_test() local pref=$1; shift local expect=$1; shift + if is_ipv6 $dip; then + local proto=-6 + local type="icmp6 type=128" # Echo request. + else + local proto= + local type="icmp echoreq" + fi + local t0=$(tc_rule_stats_get $dev $pref) - $MZ $vrf_name ${sip:+-A $sip} -B $dip -a own -b bc -q \ - -c 10 -d 100msec -t icmp type=8 + $MZ $proto $vrf_name ${sip:+-A $sip} -B $dip -a own -b bc -q \ + -c 10 -d 100msec -t $type sleep 0.5 local t1=$(tc_rule_stats_get $dev $pref) local delta=$((t1 - t0)) -- cgit v1.2.3 From 0dd7e456bb049ec2b5a9e00250918b346c0d17d5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:31 -0700 Subject: bpftool: Support dumping BTF VAR's "extern" linkage Add dumping of "extern" linkage for BTF VAR kind. Also shorten "global-allocated" to "global" to be in line with FUNC's "global". Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-2-andrii@kernel.org --- tools/bpf/bpftool/btf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 62953bbf68b4..001749a34899 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -71,7 +71,9 @@ static const char *btf_var_linkage_str(__u32 linkage) case BTF_VAR_STATIC: return "static"; case BTF_VAR_GLOBAL_ALLOCATED: - return "global-alloc"; + return "global"; + case BTF_VAR_GLOBAL_EXTERN: + return "extern"; default: return "(unknown)"; } -- cgit v1.2.3 From 5b438f01d7eb2dc9bec7cd79de881b5f155d9a71 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:32 -0700 Subject: bpftool: Dump more info about DATASEC members Dump succinct information for each member of DATASEC: its kinds and name. This is extremely helpful to see at a quick glance what is inside each DATASEC of a given BTF. Without this, one has to jump around BTF data to just find out the name of a VAR or FUNC. DATASEC's var_secinfo member is special in that regard because it doesn't itself contain the name of the member, delegating that to the referenced VAR and FUNC kinds. Other kinds, like STRUCT/UNION/FUNC/ENUM, encode member names directly and thus are clearly identifiable in BTF dump. The new output looks like this: [35] DATASEC '.bss' size=0 vlen=6 type_id=8 offset=0 size=4 (VAR 'input_bss1') type_id=13 offset=0 size=4 (VAR 'input_bss_weak') type_id=16 offset=0 size=4 (VAR 'output_bss1') type_id=17 offset=0 size=4 (VAR 'output_data1') type_id=18 offset=0 size=4 (VAR 'output_rodata1') type_id=20 offset=0 size=8 (VAR 'output_sink1') [36] DATASEC '.data' size=0 vlen=2 type_id=9 offset=0 size=4 (VAR 'input_data1') type_id=14 offset=0 size=4 (VAR 'input_data_weak') [37] DATASEC '.kconfig' size=0 vlen=2 type_id=25 offset=0 size=4 (VAR 'LINUX_KERNEL_VERSION') type_id=28 offset=0 size=1 (VAR 'CONFIG_BPF_SYSCALL') [38] DATASEC '.ksyms' size=0 vlen=1 type_id=30 offset=0 size=1 (VAR 'bpf_link_fops') [39] DATASEC '.rodata' size=0 vlen=2 type_id=12 offset=0 size=4 (VAR 'input_rodata1') type_id=15 offset=0 size=4 (VAR 'input_rodata_weak') [40] DATASEC 'license' size=0 vlen=1 type_id=24 offset=0 size=4 (VAR 'LICENSE') Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-3-andrii@kernel.org --- tools/bpf/bpftool/btf.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 001749a34899..385d5c955cf3 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -100,26 +100,28 @@ static const char *btf_str(const struct btf *btf, __u32 off) return btf__name_by_offset(btf, off) ? : "(invalid)"; } +static int btf_kind_safe(int kind) +{ + return kind <= BTF_KIND_MAX ? kind : BTF_KIND_UNKN; +} + static int dump_btf_type(const struct btf *btf, __u32 id, const struct btf_type *t) { json_writer_t *w = json_wtr; - int kind, safe_kind; - - kind = BTF_INFO_KIND(t->info); - safe_kind = kind <= BTF_KIND_MAX ? kind : BTF_KIND_UNKN; + int kind = btf_kind(t); if (json_output) { jsonw_start_object(w); jsonw_uint_field(w, "id", id); - jsonw_string_field(w, "kind", btf_kind_str[safe_kind]); + jsonw_string_field(w, "kind", btf_kind_str[btf_kind_safe(kind)]); jsonw_string_field(w, "name", btf_str(btf, t->name_off)); } else { - printf("[%u] %s '%s'", id, btf_kind_str[safe_kind], + printf("[%u] %s '%s'", id, btf_kind_str[btf_kind_safe(kind)], btf_str(btf, t->name_off)); } - switch (BTF_INFO_KIND(t->info)) { + switch (kind) { case BTF_KIND_INT: { __u32 v = *(__u32 *)(t + 1); const char *enc; @@ -302,7 +304,8 @@ static int dump_btf_type(const struct btf *btf, __u32 id, break; } case BTF_KIND_DATASEC: { - const struct btf_var_secinfo *v = (const void *)(t+1); + const struct btf_var_secinfo *v = (const void *)(t + 1); + const struct btf_type *vt; __u16 vlen = BTF_INFO_VLEN(t->info); int i; @@ -324,6 +327,13 @@ static int dump_btf_type(const struct btf *btf, __u32 id, } else { printf("\n\ttype_id=%u offset=%u size=%u", v->type, v->offset, v->size); + + if (v->type <= btf__get_nr_types(btf)) { + vt = btf__type_by_id(btf, v->type); + printf(" (%s '%s')", + btf_kind_str[btf_kind_safe(btf_kind(vt))], + btf_str(btf, vt->name_off)); + } } } if (json_output) -- cgit v1.2.3 From 0fec7a3cee1cf8e4f86ff563d229408ccbdc2d66 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:33 -0700 Subject: libbpf: Suppress compiler warning when using SEC() macro with externs When used on externs SEC() macro will trigger compilation warning about inapplicable `__attribute__((used))`. That's expected for extern declarations, so suppress it with the corresponding _Pragma. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-4-andrii@kernel.org --- tools/lib/bpf/bpf_helpers.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index b904128626c2..75c7581b304c 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -25,9 +25,16 @@ /* * Helper macro to place programs, maps, license in * different sections in elf_bpf file. Section names - * are interpreted by elf_bpf loader + * are interpreted by libbpf depending on the context (BPF programs, BPF maps, + * extern variables, etc). + * To allow use of SEC() with externs (e.g., for extern .maps declarations), + * make sure __attribute__((unused)) doesn't trigger compilation warning. */ -#define SEC(NAME) __attribute__((section(NAME), used)) +#define SEC(name) \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wignored-attributes\"") \ + __attribute__((section(name), used)) \ + _Pragma("GCC diagnostic pop") \ /* Avoid 'linux/stddef.h' definition of '__always_inline'. */ #undef __always_inline -- cgit v1.2.3 From aea28a602fa19fb4fe66374030ab7e2c8ddf643e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:34 -0700 Subject: libbpf: Mark BPF subprogs with hidden visibility as static for BPF verifier Define __hidden helper macro in bpf_helpers.h, which is a short-hand for __attribute__((visibility("hidden"))). Add libbpf support to mark BPF subprograms marked with __hidden as static in BTF information to enforce BPF verifier's static function validation algorithm, which takes more information (caller's context) into account during a subprogram validation. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-5-andrii@kernel.org --- tools/lib/bpf/bpf_helpers.h | 8 ++++++++ tools/lib/bpf/btf.c | 5 ----- tools/lib/bpf/libbpf.c | 45 ++++++++++++++++++++++++++++++++++++++++- tools/lib/bpf/libbpf_internal.h | 6 ++++++ 4 files changed, 58 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 75c7581b304c..9720dc0b4605 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -47,6 +47,14 @@ #define __weak __attribute__((weak)) #endif +/* + * Use __hidden attribute to mark a non-static BPF subprogram effectively + * static for BPF verifier's verification algorithm purposes, allowing more + * extensive and permissive BPF verification process, taking into account + * subprogram's caller context. + */ +#define __hidden __attribute__((visibility("hidden"))) + /* When utilizing vmlinux.h with BPF CO-RE, user BPF programs can't include * any system-level headers (such as stddef.h, linux/version.h, etc), and * commonly-used macros like NULL and KERNEL_VERSION aren't available through diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index d30e67e7e1e5..d57e13a13798 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1605,11 +1605,6 @@ static void *btf_add_type_mem(struct btf *btf, size_t add_sz) btf->hdr->type_len, UINT_MAX, add_sz); } -static __u32 btf_type_info(int kind, int vlen, int kflag) -{ - return (kflag << 31) | (kind << 24) | vlen; -} - static void btf_type_inc_vlen(struct btf_type *t) { t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, btf_kflag(t)); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9cc2d45b0080..8bac9c7627aa 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -71,6 +71,7 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); static const struct btf_type * skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); static int __base_pr(enum libbpf_print_level level, const char *format, va_list args) @@ -274,6 +275,7 @@ struct bpf_program { bpf_program_clear_priv_t clear_priv; bool load; + bool mark_btf_static; enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; int prog_ifindex; @@ -698,6 +700,15 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, if (err) return err; + /* if function is a global/weak symbol, but has hidden + * visibility (STV_HIDDEN), mark its BTF FUNC as static to + * enable more permissive BPF verification mode with more + * outside context available to BPF verifier + */ + if (GELF_ST_BIND(sym.st_info) != STB_LOCAL + && GELF_ST_VISIBILITY(sym.st_other) == STV_HIDDEN) + prog->mark_btf_static = true; + nr_progs++; obj->nr_programs = nr_progs; @@ -2618,7 +2629,7 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) { struct btf *kern_btf = obj->btf; bool btf_mandatory, sanitize; - int err = 0; + int i, err = 0; if (!obj->btf) return 0; @@ -2632,6 +2643,38 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) return 0; } + /* Even though some subprogs are global/weak, user might prefer more + * permissive BPF verification process that BPF verifier performs for + * static functions, taking into account more context from the caller + * functions. In such case, they need to mark such subprogs with + * __attribute__((visibility("hidden"))) and libbpf will adjust + * corresponding FUNC BTF type to be marked as static and trigger more + * involved BPF verification process. + */ + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *prog = &obj->programs[i]; + struct btf_type *t; + const char *name; + int j, n; + + if (!prog->mark_btf_static || !prog_is_subprog(obj, prog)) + continue; + + n = btf__get_nr_types(obj->btf); + for (j = 1; j <= n; j++) { + t = btf_type_by_id(obj->btf, j); + if (!btf_is_func(t) || btf_func_linkage(t) != BTF_FUNC_GLOBAL) + continue; + + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, prog->name) != 0) + continue; + + t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_STATIC, 0); + break; + } + } + sanitize = btf_needs_sanitization(obj); if (sanitize) { const void *raw_data; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 6017902c687e..92b7eae10c6d 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -19,6 +19,7 @@ #pragma GCC poison reallocarray #include "libbpf.h" +#include "btf.h" #ifndef EM_BPF #define EM_BPF 247 @@ -132,6 +133,11 @@ struct btf_type; struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id); +static inline __u32 btf_type_info(int kind, int vlen, int kflag) +{ + return (kflag << 31) | (kind << 24) | vlen; +} + void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t cur_cnt, size_t max_cnt, size_t add_cnt); int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); -- cgit v1.2.3 From 6245947c1b3c6783976e3af113bac30250d0a93e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:35 -0700 Subject: libbpf: Allow gaps in BPF program sections to support overriden weak functions Currently libbpf is very strict about parsing BPF program instruction sections. No gaps are allowed between sequential BPF programs within a given ELF section. Libbpf enforced that by keeping track of the next section offset that should start a new BPF (sub)program and cross-checks that by searching for a corresponding STT_FUNC ELF symbol. But this is too restrictive once we allow to have weak BPF programs and link together two or more BPF object files. In such case, some weak BPF programs might be "overridden" by either non-weak BPF program with the same name and signature, or even by another weak BPF program that just happened to be linked first. That, in turn, leaves BPF instructions of the "lost" BPF (sub)program intact, but there is no corresponding ELF symbol, because no one is going to be referencing it. Libbpf already correctly handles such cases in the sense that it won't append such dead code to actual BPF programs loaded into kernel. So the only change that needs to be done is to relax the logic of parsing BPF instruction sections. Instead of assuming next BPF (sub)program section offset, iterate available STT_FUNC ELF symbols to discover all available BPF subprograms and programs. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-6-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 58 +++++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 36 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8bac9c7627aa..198d6e149beb 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -502,8 +502,6 @@ static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name); static int elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn, GElf_Shdr *hdr); static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn); static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn); -static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, - size_t off, __u32 sym_type, GElf_Sym *sym); void bpf_program__unload(struct bpf_program *prog) { @@ -644,25 +642,29 @@ static int bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, const char *sec_name, int sec_idx) { + Elf_Data *symbols = obj->efile.symbols; struct bpf_program *prog, *progs; void *data = sec_data->d_buf; - size_t sec_sz = sec_data->d_size, sec_off, prog_sz; - int nr_progs, err; + size_t sec_sz = sec_data->d_size, sec_off, prog_sz, nr_syms; + int nr_progs, err, i; const char *name; GElf_Sym sym; progs = obj->programs; nr_progs = obj->nr_programs; + nr_syms = symbols->d_size / sizeof(GElf_Sym); sec_off = 0; - while (sec_off < sec_sz) { - if (elf_sym_by_sec_off(obj, sec_idx, sec_off, STT_FUNC, &sym)) { - pr_warn("sec '%s': failed to find program symbol at offset %zu\n", - sec_name, sec_off); - return -LIBBPF_ERRNO__FORMAT; - } + for (i = 0; i < nr_syms; i++) { + if (!gelf_getsym(symbols, i, &sym)) + continue; + if (sym.st_shndx != sec_idx) + continue; + if (GELF_ST_TYPE(sym.st_info) != STT_FUNC) + continue; prog_sz = sym.st_size; + sec_off = sym.st_value; name = elf_sym_str(obj, sym.st_name); if (!name) { @@ -711,8 +713,6 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, nr_progs++; obj->nr_programs = nr_progs; - - sec_off += prog_sz; } return 0; @@ -2825,26 +2825,6 @@ static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn) return data; } -static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, - size_t off, __u32 sym_type, GElf_Sym *sym) -{ - Elf_Data *symbols = obj->efile.symbols; - size_t n = symbols->d_size / sizeof(GElf_Sym); - int i; - - for (i = 0; i < n; i++) { - if (!gelf_getsym(symbols, i, sym)) - continue; - if (sym->st_shndx != sec_idx || sym->st_value != off) - continue; - if (GELF_ST_TYPE(sym->st_info) != sym_type) - continue; - return 0; - } - - return -ENOENT; -} - static bool is_sec_name_dwarf(const char *name) { /* approximation, but the actual list is too long */ @@ -3723,11 +3703,16 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data int err, i, nrels; const char *sym_name; __u32 insn_idx; + Elf_Scn *scn; + Elf_Data *scn_data; GElf_Sym sym; GElf_Rel rel; + scn = elf_sec_by_idx(obj, sec_idx); + scn_data = elf_sec_data(obj, scn); + relo_sec_name = elf_sec_str(obj, shdr->sh_name); - sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + sec_name = elf_sec_name(obj, scn); if (!relo_sec_name || !sec_name) return -EINVAL; @@ -3745,7 +3730,8 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data relo_sec_name, (size_t)GELF_R_SYM(rel.r_info), i); return -LIBBPF_ERRNO__FORMAT; } - if (rel.r_offset % BPF_INSN_SZ) { + + if (rel.r_offset % BPF_INSN_SZ || rel.r_offset >= scn_data->d_size) { pr_warn("sec '%s': invalid offset 0x%zx for relo #%d\n", relo_sec_name, (size_t)GELF_R_SYM(rel.r_info), i); return -LIBBPF_ERRNO__FORMAT; @@ -3769,9 +3755,9 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); if (!prog) { - pr_warn("sec '%s': relo #%d: program not found in section '%s' for insn #%u\n", + pr_debug("sec '%s': relo #%d: couldn't find program in section '%s' for insn #%u, probably overridden weak function, skipping...\n", relo_sec_name, i, sec_name, insn_idx); - return -LIBBPF_ERRNO__RELOC; + continue; } relos = libbpf_reallocarray(prog->reloc_desc, -- cgit v1.2.3 From c7ef5ec9573f05535370d8716576263681cabec7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:36 -0700 Subject: libbpf: Refactor BTF map definition parsing Refactor BTF-defined maps parsing logic to allow it to be nicely reused by BPF static linker. Further, at least for BPF static linker, it's important to know which attributes of a BPF map were defined explicitly, so provide a bit set for each known portion of BTF map definition. This allows BPF static linker to do a simple check when dealing with extern map declarations. The same capabilities allow to distinguish attributes explicitly set to zero (e.g., __uint(max_entries, 0)) vs the case of not specifying it at all (no max_entries attribute at all). Libbpf is currently not utilizing that, but it could be useful for backwards compatibility reasons later. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-7-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 257 +++++++++++++++++++++++----------------- tools/lib/bpf/libbpf_internal.h | 32 +++++ 2 files changed, 178 insertions(+), 111 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 198d6e149beb..4f8335fc0bc9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2025,255 +2025,262 @@ static int build_map_pin_path(struct bpf_map *map, const char *path) return bpf_map__set_pin_path(map, buf); } - -static int parse_btf_map_def(struct bpf_object *obj, - struct bpf_map *map, - const struct btf_type *def, - bool strict, bool is_inner, - const char *pin_root_path) +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def) { const struct btf_type *t; const struct btf_member *m; + bool is_inner = inner_def == NULL; int vlen, i; - vlen = btf_vlen(def); - m = btf_members(def); + vlen = btf_vlen(def_t); + m = btf_members(def_t); for (i = 0; i < vlen; i++, m++) { - const char *name = btf__name_by_offset(obj->btf, m->name_off); + const char *name = btf__name_by_offset(btf, m->name_off); if (!name) { - pr_warn("map '%s': invalid field #%d.\n", map->name, i); + pr_warn("map '%s': invalid field #%d.\n", map_name, i); return -EINVAL; } if (strcmp(name, "type") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.type)) + if (!get_map_field_int(map_name, btf, m, &map_def->map_type)) return -EINVAL; - pr_debug("map '%s': found type = %u.\n", - map->name, map->def.type); + map_def->parts |= MAP_DEF_MAP_TYPE; } else if (strcmp(name, "max_entries") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.max_entries)) + if (!get_map_field_int(map_name, btf, m, &map_def->max_entries)) return -EINVAL; - pr_debug("map '%s': found max_entries = %u.\n", - map->name, map->def.max_entries); + map_def->parts |= MAP_DEF_MAX_ENTRIES; } else if (strcmp(name, "map_flags") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, - &map->def.map_flags)) + if (!get_map_field_int(map_name, btf, m, &map_def->map_flags)) return -EINVAL; - pr_debug("map '%s': found map_flags = %u.\n", - map->name, map->def.map_flags); + map_def->parts |= MAP_DEF_MAP_FLAGS; } else if (strcmp(name, "numa_node") == 0) { - if (!get_map_field_int(map->name, obj->btf, m, &map->numa_node)) + if (!get_map_field_int(map_name, btf, m, &map_def->numa_node)) return -EINVAL; - pr_debug("map '%s': found numa_node = %u.\n", map->name, map->numa_node); + map_def->parts |= MAP_DEF_NUMA_NODE; } else if (strcmp(name, "key_size") == 0) { __u32 sz; - if (!get_map_field_int(map->name, obj->btf, m, &sz)) + if (!get_map_field_int(map_name, btf, m, &sz)) return -EINVAL; - pr_debug("map '%s': found key_size = %u.\n", - map->name, sz); - if (map->def.key_size && map->def.key_size != sz) { + if (map_def->key_size && map_def->key_size != sz) { pr_warn("map '%s': conflicting key size %u != %u.\n", - map->name, map->def.key_size, sz); + map_name, map_def->key_size, sz); return -EINVAL; } - map->def.key_size = sz; + map_def->key_size = sz; + map_def->parts |= MAP_DEF_KEY_SIZE; } else if (strcmp(name, "key") == 0) { __s64 sz; - t = btf__type_by_id(obj->btf, m->type); + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': key type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': key spec is not PTR: %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - sz = btf__resolve_size(obj->btf, t->type); + sz = btf__resolve_size(btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", - map->name, t->type, (ssize_t)sz); + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found key [%u], sz = %zd.\n", - map->name, t->type, (ssize_t)sz); - if (map->def.key_size && map->def.key_size != sz) { + if (map_def->key_size && map_def->key_size != sz) { pr_warn("map '%s': conflicting key size %u != %zd.\n", - map->name, map->def.key_size, (ssize_t)sz); + map_name, map_def->key_size, (ssize_t)sz); return -EINVAL; } - map->def.key_size = sz; - map->btf_key_type_id = t->type; + map_def->key_size = sz; + map_def->key_type_id = t->type; + map_def->parts |= MAP_DEF_KEY_SIZE | MAP_DEF_KEY_TYPE; } else if (strcmp(name, "value_size") == 0) { __u32 sz; - if (!get_map_field_int(map->name, obj->btf, m, &sz)) + if (!get_map_field_int(map_name, btf, m, &sz)) return -EINVAL; - pr_debug("map '%s': found value_size = %u.\n", - map->name, sz); - if (map->def.value_size && map->def.value_size != sz) { + if (map_def->value_size && map_def->value_size != sz) { pr_warn("map '%s': conflicting value size %u != %u.\n", - map->name, map->def.value_size, sz); + map_name, map_def->value_size, sz); return -EINVAL; } - map->def.value_size = sz; + map_def->value_size = sz; + map_def->parts |= MAP_DEF_VALUE_SIZE; } else if (strcmp(name, "value") == 0) { __s64 sz; - t = btf__type_by_id(obj->btf, m->type); + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': value type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_ptr(t)) { pr_warn("map '%s': value spec is not PTR: %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - sz = btf__resolve_size(obj->btf, t->type); + sz = btf__resolve_size(btf, t->type); if (sz < 0) { pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", - map->name, t->type, (ssize_t)sz); + map_name, t->type, (ssize_t)sz); return sz; } - pr_debug("map '%s': found value [%u], sz = %zd.\n", - map->name, t->type, (ssize_t)sz); - if (map->def.value_size && map->def.value_size != sz) { + if (map_def->value_size && map_def->value_size != sz) { pr_warn("map '%s': conflicting value size %u != %zd.\n", - map->name, map->def.value_size, (ssize_t)sz); + map_name, map_def->value_size, (ssize_t)sz); return -EINVAL; } - map->def.value_size = sz; - map->btf_value_type_id = t->type; + map_def->value_size = sz; + map_def->value_type_id = t->type; + map_def->parts |= MAP_DEF_VALUE_SIZE | MAP_DEF_VALUE_TYPE; } else if (strcmp(name, "values") == 0) { + char inner_map_name[128]; int err; if (is_inner) { pr_warn("map '%s': multi-level inner maps not supported.\n", - map->name); + map_name); return -ENOTSUP; } if (i != vlen - 1) { pr_warn("map '%s': '%s' member should be last.\n", - map->name, name); + map_name, name); return -EINVAL; } - if (!bpf_map_type__is_map_in_map(map->def.type)) { + if (!bpf_map_type__is_map_in_map(map_def->map_type)) { pr_warn("map '%s': should be map-in-map.\n", - map->name); + map_name); return -ENOTSUP; } - if (map->def.value_size && map->def.value_size != 4) { + if (map_def->value_size && map_def->value_size != 4) { pr_warn("map '%s': conflicting value size %u != 4.\n", - map->name, map->def.value_size); + map_name, map_def->value_size); return -EINVAL; } - map->def.value_size = 4; - t = btf__type_by_id(obj->btf, m->type); + map_def->value_size = 4; + t = btf__type_by_id(btf, m->type); if (!t) { pr_warn("map '%s': map-in-map inner type [%d] not found.\n", - map->name, m->type); + map_name, m->type); return -EINVAL; } if (!btf_is_array(t) || btf_array(t)->nelems) { pr_warn("map '%s': map-in-map inner spec is not a zero-sized array.\n", - map->name); + map_name); return -EINVAL; } - t = skip_mods_and_typedefs(obj->btf, btf_array(t)->type, - NULL); + t = skip_mods_and_typedefs(btf, btf_array(t)->type, NULL); if (!btf_is_ptr(t)) { pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - t = skip_mods_and_typedefs(obj->btf, t->type, NULL); + t = skip_mods_and_typedefs(btf, t->type, NULL); if (!btf_is_struct(t)) { pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", - map->name, btf_kind_str(t)); + map_name, btf_kind_str(t)); return -EINVAL; } - map->inner_map = calloc(1, sizeof(*map->inner_map)); - if (!map->inner_map) - return -ENOMEM; - map->inner_map->fd = -1; - map->inner_map->sec_idx = obj->efile.btf_maps_shndx; - map->inner_map->name = malloc(strlen(map->name) + - sizeof(".inner") + 1); - if (!map->inner_map->name) - return -ENOMEM; - sprintf(map->inner_map->name, "%s.inner", map->name); - - err = parse_btf_map_def(obj, map->inner_map, t, strict, - true /* is_inner */, NULL); + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", map_name); + err = parse_btf_map_def(inner_map_name, btf, t, strict, inner_def, NULL); if (err) return err; + + map_def->parts |= MAP_DEF_INNER_MAP; } else if (strcmp(name, "pinning") == 0) { __u32 val; - int err; if (is_inner) { - pr_debug("map '%s': inner def can't be pinned.\n", - map->name); + pr_warn("map '%s': inner def can't be pinned.\n", map_name); return -EINVAL; } - if (!get_map_field_int(map->name, obj->btf, m, &val)) + if (!get_map_field_int(map_name, btf, m, &val)) return -EINVAL; - pr_debug("map '%s': found pinning = %u.\n", - map->name, val); - - if (val != LIBBPF_PIN_NONE && - val != LIBBPF_PIN_BY_NAME) { + if (val != LIBBPF_PIN_NONE && val != LIBBPF_PIN_BY_NAME) { pr_warn("map '%s': invalid pinning value %u.\n", - map->name, val); + map_name, val); return -EINVAL; } - if (val == LIBBPF_PIN_BY_NAME) { - err = build_map_pin_path(map, pin_root_path); - if (err) { - pr_warn("map '%s': couldn't build pin path.\n", - map->name); - return err; - } - } + map_def->pinning = val; + map_def->parts |= MAP_DEF_PINNING; } else { if (strict) { - pr_warn("map '%s': unknown field '%s'.\n", - map->name, name); + pr_warn("map '%s': unknown field '%s'.\n", map_name, name); return -ENOTSUP; } - pr_debug("map '%s': ignoring unknown field '%s'.\n", - map->name, name); + pr_debug("map '%s': ignoring unknown field '%s'.\n", map_name, name); } } - if (map->def.type == BPF_MAP_TYPE_UNSPEC) { - pr_warn("map '%s': map type isn't specified.\n", map->name); + if (map_def->map_type == BPF_MAP_TYPE_UNSPEC) { + pr_warn("map '%s': map type isn't specified.\n", map_name); return -EINVAL; } return 0; } +static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) +{ + map->def.type = def->map_type; + map->def.key_size = def->key_size; + map->def.value_size = def->value_size; + map->def.max_entries = def->max_entries; + map->def.map_flags = def->map_flags; + + map->numa_node = def->numa_node; + map->btf_key_type_id = def->key_type_id; + map->btf_value_type_id = def->value_type_id; + + if (def->parts & MAP_DEF_MAP_TYPE) + pr_debug("map '%s': found type = %u.\n", map->name, def->map_type); + + if (def->parts & MAP_DEF_KEY_TYPE) + pr_debug("map '%s': found key [%u], sz = %u.\n", + map->name, def->key_type_id, def->key_size); + else if (def->parts & MAP_DEF_KEY_SIZE) + pr_debug("map '%s': found key_size = %u.\n", map->name, def->key_size); + + if (def->parts & MAP_DEF_VALUE_TYPE) + pr_debug("map '%s': found value [%u], sz = %u.\n", + map->name, def->value_type_id, def->value_size); + else if (def->parts & MAP_DEF_VALUE_SIZE) + pr_debug("map '%s': found value_size = %u.\n", map->name, def->value_size); + + if (def->parts & MAP_DEF_MAX_ENTRIES) + pr_debug("map '%s': found max_entries = %u.\n", map->name, def->max_entries); + if (def->parts & MAP_DEF_MAP_FLAGS) + pr_debug("map '%s': found map_flags = %u.\n", map->name, def->map_flags); + if (def->parts & MAP_DEF_PINNING) + pr_debug("map '%s': found pinning = %u.\n", map->name, def->pinning); + if (def->parts & MAP_DEF_NUMA_NODE) + pr_debug("map '%s': found numa_node = %u.\n", map->name, def->numa_node); + + if (def->parts & MAP_DEF_INNER_MAP) + pr_debug("map '%s': found inner map definition.\n", map->name); +} + static int bpf_object__init_user_btf_map(struct bpf_object *obj, const struct btf_type *sec, int var_idx, int sec_idx, const Elf_Data *data, bool strict, const char *pin_root_path) { + struct btf_map_def map_def = {}, inner_def = {}; const struct btf_type *var, *def; const struct btf_var_secinfo *vi; const struct btf_var *var_extra; const char *map_name; struct bpf_map *map; + int err; vi = btf_var_secinfos(sec) + var_idx; var = btf__type_by_id(obj->btf, vi->type); @@ -2327,7 +2334,35 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, pr_debug("map '%s': at sec_idx %d, offset %zu.\n", map_name, map->sec_idx, map->sec_offset); - return parse_btf_map_def(obj, map, def, strict, false, pin_root_path); + err = parse_btf_map_def(map->name, obj->btf, def, strict, &map_def, &inner_def); + if (err) + return err; + + fill_map_from_def(map, &map_def); + + if (map_def.pinning == LIBBPF_PIN_BY_NAME) { + err = build_map_pin_path(map, pin_root_path); + if (err) { + pr_warn("map '%s': couldn't build pin path.\n", map->name); + return err; + } + } + + if (map_def.parts & MAP_DEF_INNER_MAP) { + map->inner_map = calloc(1, sizeof(*map->inner_map)); + if (!map->inner_map) + return -ENOMEM; + map->inner_map->fd = -1; + map->inner_map->sec_idx = sec_idx; + map->inner_map->name = malloc(strlen(map_name) + sizeof(".inner") + 1); + if (!map->inner_map->name) + return -ENOMEM; + sprintf(map->inner_map->name, "%s.inner", map_name); + + fill_map_from_def(map->inner_map, &inner_def); + } + + return 0; } static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 92b7eae10c6d..17883073710c 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -138,6 +138,38 @@ static inline __u32 btf_type_info(int kind, int vlen, int kflag) return (kflag << 31) | (kind << 24) | vlen; } +enum map_def_parts { + MAP_DEF_MAP_TYPE = 0x001, + MAP_DEF_KEY_TYPE = 0x002, + MAP_DEF_KEY_SIZE = 0x004, + MAP_DEF_VALUE_TYPE = 0x008, + MAP_DEF_VALUE_SIZE = 0x010, + MAP_DEF_MAX_ENTRIES = 0x020, + MAP_DEF_MAP_FLAGS = 0x040, + MAP_DEF_NUMA_NODE = 0x080, + MAP_DEF_PINNING = 0x100, + MAP_DEF_INNER_MAP = 0x200, + + MAP_DEF_ALL = 0x3ff, /* combination of all above */ +}; + +struct btf_map_def { + enum map_def_parts parts; + __u32 map_type; + __u32 key_type_id; + __u32 key_size; + __u32 value_type_id; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + __u32 numa_node; + __u32 pinning; +}; + +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def); + void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t cur_cnt, size_t max_cnt, size_t add_cnt); int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); -- cgit v1.2.3 From beaa3711ada4e4a0c8e03a78fec72330185213bf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:37 -0700 Subject: libbpf: Factor out symtab and relos sanity checks Factor out logic for sanity checking SHT_SYMTAB and SHT_REL sections into separate sections. They are already quite extensive and are suffering from too deep indentation. Subsequent changes will extend SYMTAB sanity checking further, so it's better to factor each into a separate function. No functional changes are intended. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-8-andrii@kernel.org --- tools/lib/bpf/linker.c | 233 +++++++++++++++++++++++++++---------------------- 1 file changed, 127 insertions(+), 106 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 4e08bc07e635..0bb927226370 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -131,6 +131,8 @@ static int init_output_elf(struct bpf_linker *linker, const char *file); static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, struct src_obj *obj); static int linker_sanity_check_elf(struct src_obj *obj); +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec); +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec); static int linker_sanity_check_btf(struct src_obj *obj); static int linker_sanity_check_btf_ext(struct src_obj *obj); static int linker_fixup_btf(struct src_obj *obj); @@ -663,8 +665,8 @@ static bool is_pow_of_2(size_t x) static int linker_sanity_check_elf(struct src_obj *obj) { - struct src_sec *sec, *link_sec; - int i, j, n; + struct src_sec *sec; + int i, err; if (!obj->symtab_sec_idx) { pr_warn("ELF is missing SYMTAB section in %s\n", obj->filename); @@ -692,43 +694,11 @@ static int linker_sanity_check_elf(struct src_obj *obj) return -EINVAL; switch (sec->shdr->sh_type) { - case SHT_SYMTAB: { - Elf64_Sym *sym; - - if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) - return -EINVAL; - if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) - return -EINVAL; - - if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { - pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); - return -EINVAL; - } - link_sec = &obj->secs[sec->shdr->sh_link]; - if (link_sec->shdr->sh_type != SHT_STRTAB) { - pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); - return -EINVAL; - } - - n = sec->shdr->sh_size / sec->shdr->sh_entsize; - sym = sec->data->d_buf; - for (j = 0; j < n; j++, sym++) { - if (sym->st_shndx - && sym->st_shndx < SHN_LORESERVE - && sym->st_shndx >= obj->sec_cnt) { - pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", - j, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); - return -EINVAL; - } - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) { - if (sym->st_value != 0) - return -EINVAL; - } - } + case SHT_SYMTAB: + err = linker_sanity_check_elf_symtab(obj, sec); + if (err) + return err; break; - } case SHT_STRTAB: break; case SHT_PROGBITS: @@ -739,87 +709,138 @@ static int linker_sanity_check_elf(struct src_obj *obj) break; case SHT_NOBITS: break; - case SHT_REL: { - Elf64_Rel *relo; - struct src_sec *sym_sec; + case SHT_REL: + err = linker_sanity_check_elf_relos(obj, sec); + if (err) + return err; + break; + case SHT_LLVM_ADDRSIG: + break; + default: + pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", + sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + return -EINVAL; + } + } - if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) - return -EINVAL; - if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) - return -EINVAL; + return 0; +} - /* SHT_REL's sh_link should point to SYMTAB */ - if (sec->shdr->sh_link != obj->symtab_sec_idx) { - pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); - return -EINVAL; - } +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec; + Elf64_Sym *sym; + int i, n; - /* SHT_REL's sh_info points to relocated section */ - if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { - pr_warn("ELF relo section #%zu points to missing section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); - return -EINVAL; - } - link_sec = &obj->secs[sec->shdr->sh_info]; + if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { + pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_link]; + if (link_sec->shdr->sh_type != SHT_STRTAB) { + pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } - /* .rel -> pattern is followed */ - if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 - || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { - pr_warn("ELF relo section #%zu name has invalid name in %s\n", - sec->sec_idx, obj->filename); + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + sym = sec->data->d_buf; + for (i = 0; i < n; i++, sym++) { + if (sym->st_shndx + && sym->st_shndx < SHN_LORESERVE + && sym->st_shndx >= obj->sec_cnt) { + pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", + i, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); + return -EINVAL; + } + if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) { + if (sym->st_value != 0) return -EINVAL; - } + continue; + } + } - /* don't further validate relocations for ignored sections */ - if (link_sec->skipped) - break; + return 0; +} - /* relocatable section is data or instructions */ - if (link_sec->shdr->sh_type != SHT_PROGBITS - && link_sec->shdr->sh_type != SHT_NOBITS) { - pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", - sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); - return -EINVAL; - } +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec, *sym_sec; + Elf64_Rel *relo; + int i, n; - /* check sanity of each relocation */ - n = sec->shdr->sh_size / sec->shdr->sh_entsize; - relo = sec->data->d_buf; - sym_sec = &obj->secs[obj->symtab_sec_idx]; - for (j = 0; j < n; j++, relo++) { - size_t sym_idx = ELF64_R_SYM(relo->r_info); - size_t sym_type = ELF64_R_TYPE(relo->r_info); - - if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32) { - pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", - j, sec->sec_idx, sym_type, obj->filename); - return -EINVAL; - } + if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; - if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { - pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", - j, sec->sec_idx, sym_idx, obj->filename); - return -EINVAL; - } + /* SHT_REL's sh_link should point to SYMTAB */ + if (sec->shdr->sh_link != obj->symtab_sec_idx) { + pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } - if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { - if (relo->r_offset % sizeof(struct bpf_insn) != 0) { - pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", - j, sec->sec_idx, sym_idx, obj->filename); - return -EINVAL; - } - } - } - break; + /* SHT_REL's sh_info points to relocated section */ + if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { + pr_warn("ELF relo section #%zu points to missing section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_info]; + + /* .rel -> pattern is followed */ + if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 + || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { + pr_warn("ELF relo section #%zu name has invalid name in %s\n", + sec->sec_idx, obj->filename); + return -EINVAL; + } + + /* don't further validate relocations for ignored sections */ + if (link_sec->skipped) + return 0; + + /* relocatable section is data or instructions */ + if (link_sec->shdr->sh_type != SHT_PROGBITS && link_sec->shdr->sh_type != SHT_NOBITS) { + pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + + /* check sanity of each relocation */ + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + relo = sec->data->d_buf; + sym_sec = &obj->secs[obj->symtab_sec_idx]; + for (i = 0; i < n; i++, relo++) { + size_t sym_idx = ELF64_R_SYM(relo->r_info); + size_t sym_type = ELF64_R_TYPE(relo->r_info); + + if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32) { + pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", + i, sec->sec_idx, sym_type, obj->filename); + return -EINVAL; } - case SHT_LLVM_ADDRSIG: - break; - default: - pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", - sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + + if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { + pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); return -EINVAL; } + + if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { + if (relo->r_offset % sizeof(struct bpf_insn) != 0) { + pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + } } return 0; -- cgit v1.2.3 From 42869d28527695a75346c988ceeedbba7e3880b7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:38 -0700 Subject: libbpf: Make few internal helpers available outside of libbpf.c Make skip_mods_and_typedefs(), btf_kind_str(), and btf_func_linkage() helpers available outside of libbpf.c, to be used by static linker code. Also do few cleanups (error code fixes, comment clean up, etc) that don't deserve their own commit. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-9-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 13 +++---------- tools/lib/bpf/libbpf_internal.h | 7 +++++++ tools/lib/bpf/linker.c | 14 ++++++-------- 3 files changed, 16 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4f8335fc0bc9..a1cddd17af7d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -69,8 +69,6 @@ #define __printf(a, b) __attribute__((format(printf, a, b))) static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); -static const struct btf_type * -skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); static int __base_pr(enum libbpf_print_level level, const char *format, @@ -1906,7 +1904,7 @@ static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) return 0; } -static const struct btf_type * +const struct btf_type * skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) { const struct btf_type *t = btf__type_by_id(btf, id); @@ -1961,16 +1959,11 @@ static const char *__btf_kind_str(__u16 kind) } } -static const char *btf_kind_str(const struct btf_type *t) +const char *btf_kind_str(const struct btf_type *t) { return __btf_kind_str(btf_kind(t)); } -static enum btf_func_linkage btf_func_linkage(const struct btf_type *t) -{ - return (enum btf_func_linkage)BTF_INFO_VLEN(t->info); -} - /* * Fetch integer attribute of BTF map definition. Such attributes are * represented using a pointer to an array, in which dimensionality of array @@ -7036,7 +7029,7 @@ static bool insn_is_helper_call(struct bpf_insn *insn, enum bpf_func_id *func_id return false; } -static int bpf_object__sanitize_prog(struct bpf_object* obj, struct bpf_program *prog) +static int bpf_object__sanitize_prog(struct bpf_object *obj, struct bpf_program *prog) { struct bpf_insn *insn = prog->insns; enum bpf_func_id func_id; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 17883073710c..ee426226928f 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -132,6 +132,13 @@ struct btf; struct btf_type; struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id); +const char *btf_kind_str(const struct btf_type *t); +const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); + +static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t) +{ + return (enum btf_func_linkage)(int)btf_vlen(t); +} static inline __u32 btf_type_info(int kind, int vlen, int kflag) { diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 0bb927226370..1263641e8b97 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -123,9 +123,7 @@ struct bpf_linker { }; #define pr_warn_elf(fmt, ...) \ -do { \ - libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)); \ -} while (0) + libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)) static int init_output_elf(struct bpf_linker *linker, const char *file); @@ -284,7 +282,7 @@ static int init_output_elf(struct bpf_linker *linker, const char *file) /* ELF header */ linker->elf_hdr = elf64_newehdr(linker->elf); - if (!linker->elf_hdr){ + if (!linker->elf_hdr) { pr_warn_elf("failed to create ELF header"); return -EINVAL; } @@ -925,13 +923,13 @@ static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct s scn = elf_newscn(linker->elf); if (!scn) - return -1; + return -ENOMEM; data = elf_newdata(scn); if (!data) - return -1; + return -ENOMEM; shdr = elf64_getshdr(scn); if (!shdr) - return -1; + return -ENOMEM; dst_sec->scn = scn; dst_sec->shdr = shdr; @@ -1221,7 +1219,7 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob return err; } } else if (!secs_match(dst_sec, src_sec)) { - pr_warn("Secs %s are not compatible\n", src_sec->sec_name); + pr_warn("sections %s are not compatible\n", src_sec->sec_name); return -1; } -- cgit v1.2.3 From 386b1d241e1b975a239d33be836bc183a52ab18c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:39 -0700 Subject: libbpf: Extend sanity checking ELF symbols with externs validation Add logic to validate extern symbols, plus some other minor extra checks, like ELF symbol #0 validation, general symbol visibility and binding validations. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-10-andrii@kernel.org --- tools/lib/bpf/linker.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 1263641e8b97..b0e038480300 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -750,14 +750,45 @@ static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *s n = sec->shdr->sh_size / sec->shdr->sh_entsize; sym = sec->data->d_buf; for (i = 0; i < n; i++, sym++) { - if (sym->st_shndx - && sym->st_shndx < SHN_LORESERVE - && sym->st_shndx >= obj->sec_cnt) { + int sym_type = ELF64_ST_TYPE(sym->st_info); + int sym_bind = ELF64_ST_BIND(sym->st_info); + int sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + + if (i == 0) { + if (sym->st_name != 0 || sym->st_info != 0 + || sym->st_other != 0 || sym->st_shndx != 0 + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #0 is invalid in %s\n", obj->filename); + return -EINVAL; + } + continue; + } + if (sym_bind != STB_LOCAL && sym_bind != STB_GLOBAL && sym_bind != STB_WEAK) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol binding %d\n", + i, sec->sec_idx, sym_bind); + return -EINVAL; + } + if (sym_vis != STV_DEFAULT && sym_vis != STV_HIDDEN) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol visibility %d\n", + i, sec->sec_idx, sym_vis); + return -EINVAL; + } + if (sym->st_shndx == 0) { + if (sym_type != STT_NOTYPE || sym_bind == STB_LOCAL + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #%d is invalid extern symbol in %s\n", + i, obj->filename); + + return -EINVAL; + } + continue; + } + if (sym->st_shndx < SHN_LORESERVE && sym->st_shndx >= obj->sec_cnt) { pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", i, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); return -EINVAL; } - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) { + if (sym_type == STT_SECTION) { if (sym->st_value != 0) return -EINVAL; continue; @@ -1135,16 +1166,16 @@ static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj size_t dst_sym_idx; int name_off; - /* we already have all-zero initial symbol */ - if (sym->st_name == 0 && sym->st_info == 0 && - sym->st_other == 0 && sym->st_shndx == SHN_UNDEF && - sym->st_value == 0 && sym->st_size ==0) + /* We already validated all-zero symbol #0 and we already + * appended it preventively to the final SYMTAB, so skip it. + */ + if (i == 0) continue; sym_name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); if (!sym_name) { pr_warn("can't fetch symbol name for symbol #%d in '%s'\n", i, obj->filename); - return -1; + return -EINVAL; } if (sym->st_shndx && sym->st_shndx < SHN_LORESERVE) { -- cgit v1.2.3 From 83a157279f2125ce1c4d6d93750440853746dff0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:40 -0700 Subject: libbpf: Tighten BTF type ID rewriting with error checking It should never fail, but if it does, it's better to know about this rather than end up with nonsensical type IDs. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-11-andrii@kernel.org --- tools/lib/bpf/linker.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index b0e038480300..5505c85e8b7b 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -1429,6 +1429,13 @@ static int linker_fixup_btf(struct src_obj *obj) static int remap_type_id(__u32 *type_id, void *ctx) { int *id_map = ctx; + int new_id = id_map[*type_id]; + + /* Error out if the type wasn't remapped. Ignore VOID which stays VOID. */ + if (new_id == 0 && *type_id != 0) { + pr_warn("failed to find new ID mapping for original BTF type ID %u\n", *type_id); + return -EINVAL; + } *type_id = id_map[*type_id]; -- cgit v1.2.3 From a46349227cd832b33c12f74b85712ea67de3c6c4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:41 -0700 Subject: libbpf: Add linker extern resolution support for functions and global variables Add BPF static linker logic to resolve extern variables and functions across multiple linked together BPF object files. For that, linker maintains a separate list of struct glob_sym structures, which keeps track of few pieces of metadata (is it extern or resolved global, is it a weak symbol, which ELF section it belongs to, etc) and ties together BTF type info and ELF symbol information and keeps them in sync. With adding support for extern variables/funcs, it's now possible for some sections to contain both extern and non-extern definitions. This means that some sections may start out as ephemeral (if only externs are present and thus there is not corresponding ELF section), but will be "upgraded" to actual ELF section as symbols are resolved or new non-extern definitions are appended. Additional care is taken to not duplicate extern entries in sections like .kconfig and .ksyms. Given libbpf requires BTF type to always be present for .kconfig/.ksym externs, linker extends this requirement to all the externs, even those that are supposed to be resolved during static linking and which won't be visible to libbpf. With BTF information always present, static linker will check not just ELF symbol matches, but entire BTF type signature match as well. That logic is stricter that BPF CO-RE checks. It probably should be re-used by .ksym resolution logic in libbpf as well, but that's left for follow up patches. To make it unnecessary to rewrite ELF symbols and minimize BTF type rewriting/removal, ELF symbols that correspond to externs initially will be updated in place once they are resolved. Similarly for BTF type info, VAR/FUNC and var_secinfo's (sec_vars in struct bpf_linker) are staying stable, but types they point to might get replaced when extern is resolved. This might leave some left-over types (even though we try to minimize this for common cases of having extern funcs with not argument names vs concrete function with names properly specified). That can be addresses later with a generic BTF garbage collection. That's left for a follow up as well. Given BTF type appending phase is separate from ELF symbol appending/resolution, special struct glob_sym->underlying_btf_id variable is used to communicate resolution and rewrite decisions. 0 means underlying_btf_id needs to be appended (it's not yet in final linker->btf), <0 values are used for temporary storage of source BTF type ID (not yet rewritten), so -glob_sym->underlying_btf_id is BTF type id in obj-btf. But by the end of linker_append_btf() phase, that underlying_btf_id will be remapped and will always be > 0. This is the uglies part of the whole process, but keeps the other parts much simpler due to stability of sec_var and VAR/FUNC types, as well as ELF symbol, so please keep that in mind while reviewing. BTF-defined maps require some extra custom logic and is addressed separate in the next patch, so that to keep this one smaller and easier to review. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-12-andrii@kernel.org --- tools/lib/bpf/linker.c | 849 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 790 insertions(+), 59 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 5505c85e8b7b..23f49945dc7a 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -22,6 +22,8 @@ #include "libbpf_internal.h" #include "strset.h" +#define BTF_EXTERN_SEC ".extern" + struct src_sec { const char *sec_name; /* positional (not necessarily ELF) index in an array of sections */ @@ -74,11 +76,36 @@ struct btf_ext_sec_data { void *recs; }; +struct glob_sym { + /* ELF symbol index */ + int sym_idx; + /* associated section id for .ksyms, .kconfig, etc, but not .extern */ + int sec_id; + /* extern name offset in STRTAB */ + int name_off; + /* optional associated BTF type ID */ + int btf_id; + /* BTF type ID to which VAR/FUNC type is pointing to; used for + * rewriting types when extern VAR/FUNC is resolved to a concrete + * definition + */ + int underlying_btf_id; + /* sec_var index in the corresponding dst_sec, if exists */ + int var_idx; + + /* extern or resolved/global symbol */ + bool is_extern; + /* weak or strong symbol, never goes back from strong to weak */ + bool is_weak; +}; + struct dst_sec { char *sec_name; /* positional (not necessarily ELF) index in an array of sections */ int id; + bool ephemeral; + /* ELF info */ size_t sec_idx; Elf_Scn *scn; @@ -120,6 +147,10 @@ struct bpf_linker { struct btf *btf; struct btf_ext *btf_ext; + + /* global (including extern) ELF symbols */ + int glob_sym_cnt; + struct glob_sym *glob_syms; }; #define pr_warn_elf(fmt, ...) \ @@ -136,6 +167,8 @@ static int linker_sanity_check_btf_ext(struct src_obj *obj); static int linker_fixup_btf(struct src_obj *obj); static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj); static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx); static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj); static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj); static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj); @@ -947,6 +980,7 @@ static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct s dst_sec->sec_sz = 0; dst_sec->sec_idx = 0; + dst_sec->ephemeral = src_sec->ephemeral; /* ephemeral sections are just thin section shells lacking most parts */ if (src_sec->ephemeral) @@ -1010,6 +1044,9 @@ static struct dst_sec *find_dst_sec_by_name(struct bpf_linker *linker, const cha static bool secs_match(struct dst_sec *dst, struct src_sec *src) { + if (dst->ephemeral || src->ephemeral) + return true; + if (dst->shdr->sh_type != src->shdr->sh_type) { pr_warn("sec %s types mismatch\n", dst->sec_name); return false; @@ -1035,13 +1072,33 @@ static bool sec_content_is_same(struct dst_sec *dst_sec, struct src_sec *src_sec return true; } -static int extend_sec(struct dst_sec *dst, struct src_sec *src) +static int extend_sec(struct bpf_linker *linker, struct dst_sec *dst, struct src_sec *src) { void *tmp; - size_t dst_align = dst->shdr->sh_addralign; - size_t src_align = src->shdr->sh_addralign; + size_t dst_align, src_align; size_t dst_align_sz, dst_final_sz; + int err; + + /* Ephemeral source section doesn't contribute anything to ELF + * section data. + */ + if (src->ephemeral) + return 0; + + /* Some sections (like .maps) can contain both externs (and thus be + * ephemeral) and non-externs (map definitions). So it's possible that + * it has to be "upgraded" from ephemeral to non-ephemeral when the + * first non-ephemeral entity appears. In such case, we add ELF + * section, data, etc. + */ + if (dst->ephemeral) { + err = init_sec(linker, dst, src); + if (err) + return err; + } + dst_align = dst->shdr->sh_addralign; + src_align = src->shdr->sh_addralign; if (dst_align == 0) dst_align = 1; if (dst_align < src_align) @@ -1137,10 +1194,7 @@ static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj /* record mapped section index */ src_sec->dst_id = dst_sec->id; - if (src_sec->ephemeral) - continue; - - err = extend_sec(dst_sec, src_sec); + err = extend_sec(linker, dst_sec, src_sec); if (err) return err; } @@ -1151,21 +1205,16 @@ static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj) { struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; - Elf64_Sym *sym = symtab->data->d_buf, *dst_sym; - int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize, err; int str_sec_idx = symtab->shdr->sh_link; + const char *sym_name; obj->sym_map = calloc(n + 1, sizeof(*obj->sym_map)); if (!obj->sym_map) return -ENOMEM; for (i = 0; i < n; i++, sym++) { - struct src_sec *src_sec = NULL; - struct dst_sec *dst_sec = NULL; - const char *sym_name; - size_t dst_sym_idx; - int name_off; - /* We already validated all-zero symbol #0 and we already * appended it preventively to the final SYMTAB, so skip it. */ @@ -1178,41 +1227,624 @@ static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj return -EINVAL; } - if (sym->st_shndx && sym->st_shndx < SHN_LORESERVE) { - src_sec = &obj->secs[sym->st_shndx]; - if (src_sec->skipped) + err = linker_append_elf_sym(linker, obj, sym, sym_name, i); + if (err) + return err; + } + + return 0; +} + +static Elf64_Sym *get_sym_by_idx(struct bpf_linker *linker, size_t sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms = symtab->raw_data; + + return &syms[sym_idx]; +} + +static struct glob_sym *find_glob_sym(struct bpf_linker *linker, const char *sym_name) +{ + struct glob_sym *glob_sym; + const char *name; + int i; + + for (i = 0; i < linker->glob_sym_cnt; i++) { + glob_sym = &linker->glob_syms[i]; + name = strset__data(linker->strtab_strs) + glob_sym->name_off; + + if (strcmp(name, sym_name) == 0) + return glob_sym; + } + + return NULL; +} + +static struct glob_sym *add_glob_sym(struct bpf_linker *linker) +{ + struct glob_sym *syms, *sym; + + syms = libbpf_reallocarray(linker->glob_syms, linker->glob_sym_cnt + 1, + sizeof(*linker->glob_syms)); + if (!syms) + return NULL; + + sym = &syms[linker->glob_sym_cnt]; + memset(sym, 0, sizeof(*sym)); + sym->var_idx = -1; + + linker->glob_syms = syms; + linker->glob_sym_cnt++; + + return sym; +} + +static bool glob_sym_btf_matches(const char *sym_name, bool exact, + const struct btf *btf1, __u32 id1, + const struct btf *btf2, __u32 id2) +{ + const struct btf_type *t1, *t2; + bool is_static1, is_static2; + const char *n1, *n2; + int i, n; + +recur: + n1 = n2 = NULL; + t1 = skip_mods_and_typedefs(btf1, id1, &id1); + t2 = skip_mods_and_typedefs(btf2, id2, &id2); + + /* check if only one side is FWD, otherwise handle with common logic */ + if (!exact && btf_is_fwd(t1) != btf_is_fwd(t2)) { + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible forward declaration names '%s' and '%s'\n", + sym_name, n1, n2); + return false; + } + /* validate if FWD kind matches concrete kind */ + if (btf_is_fwd(t1)) { + if (btf_kflag(t1) && btf_is_union(t2)) + return true; + if (!btf_kflag(t1) && btf_is_struct(t2)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t1) ? "union" : "struct", btf_kind_str(t2)); + } else { + if (btf_kflag(t2) && btf_is_union(t1)) + return true; + if (!btf_kflag(t2) && btf_is_struct(t1)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t2) ? "union" : "struct", btf_kind_str(t1)); + } + return false; + } + + if (btf_kind(t1) != btf_kind(t2)) { + pr_warn("global '%s': incompatible BTF kinds %s and %s\n", + sym_name, btf_kind_str(t1), btf_kind_str(t2)); + return false; + } + + switch (btf_kind(t1)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible %s names '%s' and '%s'\n", + sym_name, btf_kind_str(t1), n1, n2); + return false; + } + break; + default: + break; + } + + switch (btf_kind(t1)) { + case BTF_KIND_UNKN: /* void */ + case BTF_KIND_FWD: + return true; + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + /* ignore encoding for int and enum values for enum */ + if (t1->size != t2->size) { + pr_warn("global '%s': incompatible %s '%s' size %u and %u\n", + sym_name, btf_kind_str(t1), n1, t1->size, t2->size); + return false; + } + return true; + case BTF_KIND_PTR: + /* just validate overall shape of the referenced type, so no + * contents comparison for struct/union, and allowd fwd vs + * struct/union + */ + exact = false; + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_ARRAY: + /* ignore index type and array size */ + id1 = btf_array(t1)->type; + id2 = btf_array(t2)->type; + goto recur; + case BTF_KIND_FUNC: + /* extern and global linkages are compatible */ + is_static1 = btf_func_linkage(t1) == BTF_FUNC_STATIC; + is_static2 = btf_func_linkage(t2) == BTF_FUNC_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible func '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_VAR: + /* extern and global linkages are compatible */ + is_static1 = btf_var(t1)->linkage == BTF_VAR_STATIC; + is_static2 = btf_var(t2)->linkage == BTF_VAR_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible var '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m1, *m2; + + if (!exact) + return true; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s fields %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0; i < n; i++, m1++, m2++) { + n1 = btf__str_by_offset(btf1, m1->name_off); + n2 = btf__str_by_offset(btf2, m2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible field #%d names '%s' and '%s'\n", + sym_name, i, n1, n2); + return false; + } + if (m1->offset != m2->offset) { + pr_warn("global '%s': incompatible field #%d ('%s') offsets\n", + sym_name, i, n1); + return false; + } + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + return true; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *m1, *m2; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s params %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < n; i++, m1++, m2++) { + /* ignore func arg names */ + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + /* now check return type as well */ + id1 = t1->type; + id2 = t2->type; + goto recur; + } + + /* skip_mods_and_typedefs() make this impossible */ + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + /* DATASECs are never compared with each other */ + case BTF_KIND_DATASEC: + default: + pr_warn("global '%s': unsupported BTF kind %s\n", + sym_name, btf_kind_str(t1)); + return false; + } +} + +static bool glob_syms_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, size_t sym_idx, int btf_id) +{ + const struct btf_type *src_t; + + /* if we are dealing with externs, BTF types describing both global + * and extern VARs/FUNCs should be completely present in all files + */ + if (!glob_sym->btf_id || !btf_id) { + pr_warn("BTF info is missing for global symbol '%s'\n", sym_name); + return false; + } + + src_t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(src_t) && !btf_is_func(src_t)) { + pr_warn("only extern variables and functions are supported, but got '%s' for '%s'\n", + btf_kind_str(src_t), sym_name); + return false; + } + + if (!glob_sym_btf_matches(sym_name, true /*exact*/, + linker->btf, glob_sym->btf_id, obj->btf, btf_id)) + return false; + + return true; +} + +static bool btf_is_non_static(const struct btf_type *t) +{ + return (btf_is_var(t) && btf_var(t)->linkage != BTF_VAR_STATIC) + || (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_STATIC); +} + +static int find_glob_sym_btf(struct src_obj *obj, Elf64_Sym *sym, const char *sym_name, + int *out_btf_sec_id, int *out_btf_id) +{ + int i, j, n = btf__get_nr_types(obj->btf), m, btf_id = 0; + const struct btf_type *t; + const struct btf_var_secinfo *vi; + const char *name; + + for (i = 1; i <= n; i++) { + t = btf__type_by_id(obj->btf, i); + + /* some global and extern FUNCs and VARs might not be associated with any + * DATASEC, so try to detect them in the same pass + */ + if (btf_is_non_static(t)) { + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, sym_name) != 0) continue; - dst_sec = &linker->secs[src_sec->dst_id]; - /* allow only one STT_SECTION symbol per section */ - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && dst_sec->sec_sym_idx) { - obj->sym_map[i] = dst_sec->sec_sym_idx; + /* remember and still try to find DATASEC */ + btf_id = i; + continue; + } + + if (!btf_is_datasec(t)) + continue; + + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + t = btf__type_by_id(obj->btf, vi->type); + name = btf__str_by_offset(obj->btf, t->name_off); + + if (strcmp(name, sym_name) != 0) + continue; + if (btf_is_var(t) && btf_var(t)->linkage == BTF_VAR_STATIC) + continue; + if (btf_is_func(t) && btf_func_linkage(t) == BTF_FUNC_STATIC) continue; + + if (btf_id && btf_id != vi->type) { + pr_warn("global/extern '%s' BTF is ambiguous: both types #%d and #%u match\n", + sym_name, btf_id, vi->type); + return -EINVAL; } + + *out_btf_sec_id = i; + *out_btf_id = vi->type; + + return 0; } + } - name_off = strset__add_str(linker->strtab_strs, sym_name); - if (name_off < 0) - return name_off; + /* free-floating extern or global FUNC */ + if (btf_id) { + *out_btf_sec_id = 0; + *out_btf_id = btf_id; + return 0; + } - dst_sym = add_new_sym(linker, &dst_sym_idx); - if (!dst_sym) - return -ENOMEM; + pr_warn("failed to find BTF info for global/extern symbol '%s'\n", sym_name); + return -ENOENT; +} - dst_sym->st_name = name_off; - dst_sym->st_info = sym->st_info; - dst_sym->st_other = sym->st_other; - dst_sym->st_shndx = src_sec ? dst_sec->sec_idx : sym->st_shndx; - dst_sym->st_value = (src_sec ? src_sec->dst_off : 0) + sym->st_value; - dst_sym->st_size = sym->st_size; +static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *sec; + int i; - obj->sym_map[i] = dst_sym_idx; + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && dst_sym) { - dst_sec->sec_sym_idx = dst_sym_idx; - dst_sym->st_value = 0; + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static int complete_extern_btf_info(struct btf *dst_btf, int dst_id, + struct btf *src_btf, int src_id) +{ + struct btf_type *dst_t = btf_type_by_id(dst_btf, dst_id); + struct btf_type *src_t = btf_type_by_id(src_btf, src_id); + struct btf_param *src_p, *dst_p; + const char *s; + int i, n, off; + + /* We already made sure that source and destination types (FUNC or + * VAR) match in terms of types and argument names. + */ + if (btf_is_var(dst_t)) { + btf_var(dst_t)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + return 0; + } + + dst_t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_GLOBAL, 0); + + /* now onto FUNC_PROTO types */ + src_t = btf_type_by_id(src_btf, src_t->type); + dst_t = btf_type_by_id(dst_btf, dst_t->type); + + /* Fill in all the argument names, which for extern FUNCs are missing. + * We'll end up with two copies of FUNCs/VARs for externs, but that + * will be taken care of by BTF dedup at the very end. + * It might be that BTF types for extern in one file has less/more BTF + * information (e.g., FWD instead of full STRUCT/UNION information), + * but that should be (in most cases, subject to BTF dedup rules) + * handled and resolved by BTF dedup algorithm as well, so we won't + * worry about it. Our only job is to make sure that argument names + * are populated on both sides, otherwise BTF dedup will pedantically + * consider them different. + */ + src_p = btf_params(src_t); + dst_p = btf_params(dst_t); + for (i = 0, n = btf_vlen(dst_t); i < n; i++, src_p++, dst_p++) { + if (!src_p->name_off) + continue; + + /* src_btf has more complete info, so add name to dst_btf */ + s = btf__str_by_offset(src_btf, src_p->name_off); + off = btf__add_str(dst_btf, s); + if (off < 0) + return off; + dst_p->name_off = off; + } + return 0; +} + +static void sym_update_bind(Elf64_Sym *sym, int sym_bind) +{ + sym->st_info = ELF64_ST_INFO(sym_bind, ELF64_ST_TYPE(sym->st_info)); +} + +static void sym_update_type(Elf64_Sym *sym, int sym_type) +{ + sym->st_info = ELF64_ST_INFO(ELF64_ST_BIND(sym->st_info), sym_type); +} + +static void sym_update_visibility(Elf64_Sym *sym, int sym_vis) +{ + /* libelf doesn't provide setters for ST_VISIBILITY, + * but it is stored in the lower 2 bits of st_other + */ + sym->st_other &= 0x03; + sym->st_other |= sym_vis; +} + +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx) +{ + struct src_sec *src_sec = NULL; + struct dst_sec *dst_sec = NULL; + struct glob_sym *glob_sym = NULL; + int name_off, sym_type, sym_bind, sym_vis, err; + int btf_sec_id = 0, btf_id = 0; + size_t dst_sym_idx; + Elf64_Sym *dst_sym; + bool sym_is_extern; + + sym_type = ELF64_ST_TYPE(sym->st_info); + sym_bind = ELF64_ST_BIND(sym->st_info); + sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + sym_is_extern = sym->st_shndx == SHN_UNDEF; + + if (sym_is_extern) { + if (!obj->btf) { + pr_warn("externs without BTF info are not supported\n"); + return -ENOTSUP; + } + } else if (sym->st_shndx < SHN_LORESERVE) { + src_sec = &obj->secs[sym->st_shndx]; + if (src_sec->skipped) + return 0; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* allow only one STT_SECTION symbol per section */ + if (sym_type == STT_SECTION && dst_sec->sec_sym_idx) { + obj->sym_map[src_sym_idx] = dst_sec->sec_sym_idx; + return 0; + } + } + + if (sym_bind == STB_LOCAL) + goto add_sym; + + /* find matching BTF info */ + err = find_glob_sym_btf(obj, sym, sym_name, &btf_sec_id, &btf_id); + if (err) + return err; + + if (sym_is_extern && btf_sec_id) { + const char *sec_name = NULL; + const struct btf_type *t; + + t = btf__type_by_id(obj->btf, btf_sec_id); + sec_name = btf__str_by_offset(obj->btf, t->name_off); + + /* Clang puts unannotated extern vars into + * '.extern' BTF DATASEC. Treat them the same + * as unannotated extern funcs (which are + * currently not put into any DATASECs). + * Those don't have associated src_sec/dst_sec. + */ + if (strcmp(sec_name, BTF_EXTERN_SEC) != 0) { + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("failed to find matching ELF sec '%s'\n", sec_name); + return -ENOENT; + } + dst_sec = &linker->secs[src_sec->dst_id]; + } + } + + glob_sym = find_glob_sym(linker, sym_name); + if (glob_sym) { + /* Preventively resolve to existing symbol. This is + * needed for further relocation symbol remapping in + * the next step of linking. + */ + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + + /* If both symbols are non-externs, at least one of + * them has to be STB_WEAK, otherwise they are in + * a conflict with each other. + */ + if (!sym_is_extern && !glob_sym->is_extern + && !glob_sym->is_weak && sym_bind != STB_WEAK) { + pr_warn("conflicting non-weak symbol #%d (%s) definition in '%s'\n", + src_sym_idx, sym_name, obj->filename); + return -EINVAL; + } + + if (!glob_syms_match(sym_name, linker, glob_sym, obj, sym, src_sym_idx, btf_id)) + return -EINVAL; + + dst_sym = get_sym_by_idx(linker, glob_sym->sym_idx); + + /* If new symbol is strong, then force dst_sym to be strong as + * well; this way a mix of weak and non-weak extern + * definitions will end up being strong. + */ + if (sym_bind == STB_GLOBAL) { + /* We still need to preserve type (NOTYPE or + * OBJECT/FUNC, depending on whether the symbol is + * extern or not) + */ + sym_update_bind(dst_sym, STB_GLOBAL); + glob_sym->is_weak = false; } + /* Non-default visibility is "contaminating", with stricter + * visibility overwriting more permissive ones, even if more + * permissive visibility comes from just an extern definition. + * Currently only STV_DEFAULT and STV_HIDDEN are allowed and + * ensured by ELF symbol sanity checks above. + */ + if (sym_vis > ELF64_ST_VISIBILITY(dst_sym->st_other)) + sym_update_visibility(dst_sym, sym_vis); + + /* If the new symbol is extern, then regardless if + * existing symbol is extern or resolved global, just + * keep the existing one untouched. + */ + if (sym_is_extern) + return 0; + + /* If existing symbol is a strong resolved symbol, bail out, + * because we lost resolution battle have nothing to + * contribute. We already checked abover that there is no + * strong-strong conflict. We also already tightened binding + * and visibility, so nothing else to contribute at that point. + */ + if (!glob_sym->is_extern && sym_bind == STB_WEAK) + return 0; + + /* At this point, new symbol is strong non-extern, + * so overwrite glob_sym with new symbol information. + * Preserve binding and visibility. + */ + sym_update_type(dst_sym, sym_type); + dst_sym->st_shndx = dst_sec->sec_idx; + dst_sym->st_value = src_sec->dst_off + sym->st_value; + dst_sym->st_size = sym->st_size; + + /* see comment below about dst_sec->id vs dst_sec->sec_idx */ + glob_sym->sec_id = dst_sec->id; + glob_sym->is_extern = false; + + if (complete_extern_btf_info(linker->btf, glob_sym->btf_id, + obj->btf, btf_id)) + return -EINVAL; + + /* request updating VAR's/FUNC's underlying BTF type when appending BTF type */ + glob_sym->underlying_btf_id = 0; + + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + return 0; + } + +add_sym: + name_off = strset__add_str(linker->strtab_strs, sym_name); + if (name_off < 0) + return name_off; + + dst_sym = add_new_sym(linker, &dst_sym_idx); + if (!dst_sym) + return -ENOMEM; + + dst_sym->st_name = name_off; + dst_sym->st_info = sym->st_info; + dst_sym->st_other = sym->st_other; + dst_sym->st_shndx = dst_sec ? dst_sec->sec_idx : sym->st_shndx; + dst_sym->st_value = (src_sec ? src_sec->dst_off : 0) + sym->st_value; + dst_sym->st_size = sym->st_size; + + obj->sym_map[src_sym_idx] = dst_sym_idx; + + if (sym_type == STT_SECTION && dst_sym) { + dst_sec->sec_sym_idx = dst_sym_idx; + dst_sym->st_value = 0; + } + + if (sym_bind != STB_LOCAL) { + glob_sym = add_glob_sym(linker); + if (!glob_sym) + return -ENOMEM; + + glob_sym->sym_idx = dst_sym_idx; + /* we use dst_sec->id (and not dst_sec->sec_idx), because + * ephemeral sections (.kconfig, .ksyms, etc) don't have + * sec_idx (as they don't have corresponding ELF section), but + * still have id. .extern doesn't have even ephemeral section + * associated with it, so dst_sec->id == dst_sec->sec_idx == 0. + */ + glob_sym->sec_id = dst_sec ? dst_sec->id : 0; + glob_sym->name_off = name_off; + /* we will fill btf_id in during BTF merging step */ + glob_sym->btf_id = 0; + glob_sym->is_extern = sym_is_extern; + glob_sym->is_weak = sym_bind == STB_WEAK; } return 0; @@ -1262,7 +1894,7 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob dst_sec->shdr->sh_info = dst_linked_sec->sec_idx; src_sec->dst_id = dst_sec->id; - err = extend_sec(dst_sec, src_sec); + err = extend_sec(linker, dst_sec, src_sec); if (err) return err; @@ -1315,21 +1947,6 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob return 0; } -static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) -{ - struct src_sec *sec; - int i; - - for (i = 1; i < obj->sec_cnt; i++) { - sec = &obj->secs[i]; - - if (strcmp(sec->sec_name, sec_name) == 0) - return sec; - } - - return NULL; -} - static Elf64_Sym *find_sym_by_name(struct src_obj *obj, size_t sec_idx, int sym_type, const char *sym_name) { @@ -1384,12 +2001,32 @@ static int linker_fixup_btf(struct src_obj *obj) t->size = sec->shdr->sh_size; } else { /* BTF can have some sections that are not represented - * in ELF, e.g., .kconfig and .ksyms, which are used - * for special extern variables. Here we'll - * pre-create "section shells" for them to be able to - * keep track of extra per-section metadata later - * (e.g., BTF variables). + * in ELF, e.g., .kconfig, .ksyms, .extern, which are used + * for special extern variables. + * + * For all but one such special (ephemeral) + * sections, we pre-create "section shells" to be able + * to keep track of extra per-section metadata later + * (e.g., those BTF extern variables). + * + * .extern is even more special, though, because it + * contains extern variables that need to be resolved + * by static linker, not libbpf and kernel. When such + * externs are resolved, we are going to remove them + * from .extern BTF section and might end up not + * needing it at all. Each resolved extern should have + * matching non-extern VAR/FUNC in other sections. + * + * We do support leaving some of the externs + * unresolved, though, to support cases of building + * libraries, which will later be linked against final + * BPF applications. So if at finalization we still + * see unresolved externs, we'll create .extern + * section on our own. */ + if (strcmp(sec_name, BTF_EXTERN_SEC) == 0) + continue; + sec = add_src_sec(obj, sec_name); if (!sec) return -ENOMEM; @@ -1446,6 +2083,7 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) { const struct btf_type *t; int i, j, n, start_id, id; + const char *name; if (!obj->btf) return 0; @@ -1458,12 +2096,44 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) return -ENOMEM; for (i = 1; i <= n; i++) { + struct glob_sym *glob_sym = NULL; + t = btf__type_by_id(obj->btf, i); /* DATASECs are handled specially below */ if (btf_kind(t) == BTF_KIND_DATASEC) continue; + if (btf_is_non_static(t)) { + /* there should be glob_sym already */ + name = btf__str_by_offset(obj->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + + /* VARs without corresponding glob_sym are those that + * belong to skipped/deduplicated sections (i.e., + * license and version), so just skip them + */ + if (!glob_sym) + continue; + + /* linker_append_elf_sym() might have requested + * updating underlying type ID, if extern was resolved + * to strong symbol or weak got upgraded to non-weak + */ + if (glob_sym->underlying_btf_id == 0) + glob_sym->underlying_btf_id = -t->type; + + /* globals from previous object files that match our + * VAR/FUNC already have a corresponding associated + * BTF type, so just make sure to use it + */ + if (glob_sym->btf_id) { + /* reuse existing BTF type for global var/func */ + obj->btf_type_map[i] = glob_sym->btf_id; + continue; + } + } + id = btf__add_type(linker->btf, obj->btf, t); if (id < 0) { pr_warn("failed to append BTF type #%d from file '%s'\n", i, obj->filename); @@ -1471,6 +2141,12 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) } obj->btf_type_map[i] = id; + + /* record just appended BTF type for var/func */ + if (glob_sym) { + glob_sym->btf_id = id; + glob_sym->underlying_btf_id = -t->type; + } } /* remap all the types except DATASECs */ @@ -1482,6 +2158,22 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) return -EINVAL; } + /* Rewrite VAR/FUNC underlying types (i.e., FUNC's FUNC_PROTO and VAR's + * actual type), if necessary + */ + for (i = 0; i < linker->glob_sym_cnt; i++) { + struct glob_sym *glob_sym = &linker->glob_syms[i]; + struct btf_type *glob_t; + + if (glob_sym->underlying_btf_id >= 0) + continue; + + glob_sym->underlying_btf_id = obj->btf_type_map[-glob_sym->underlying_btf_id]; + + glob_t = btf_type_by_id(linker->btf, glob_sym->btf_id); + glob_t->type = glob_sym->underlying_btf_id; + } + /* append DATASEC info */ for (i = 1; i < obj->sec_cnt; i++) { struct src_sec *src_sec; @@ -1509,6 +2201,42 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) n = btf_vlen(t); for (j = 0; j < n; j++, src_var++) { void *sec_vars = dst_sec->sec_vars; + int new_id = obj->btf_type_map[src_var->type]; + struct glob_sym *glob_sym = NULL; + + t = btf_type_by_id(linker->btf, new_id); + if (btf_is_non_static(t)) { + name = btf__str_by_offset(linker->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + if (glob_sym->sec_id != dst_sec->id) { + pr_warn("global '%s': section mismatch %d vs %d\n", + name, glob_sym->sec_id, dst_sec->id); + return -EINVAL; + } + } + + /* If there is already a member (VAR or FUNC) mapped + * to the same type, don't add a duplicate entry. + * This will happen when multiple object files define + * the same extern VARs/FUNCs. + */ + if (glob_sym && glob_sym->var_idx >= 0) { + __s64 sz; + + dst_var = &dst_sec->sec_vars[glob_sym->var_idx]; + /* Because underlying BTF type might have + * changed, so might its size have changed, so + * re-calculate and update it in sec_var. + */ + sz = btf__resolve_size(linker->btf, glob_sym->underlying_btf_id); + if (sz < 0) { + pr_warn("global '%s': failed to resolve size of underlying type: %d\n", + name, (int)sz); + return -EINVAL; + } + dst_var->size = sz; + continue; + } sec_vars = libbpf_reallocarray(sec_vars, dst_sec->sec_var_cnt + 1, @@ -1523,6 +2251,9 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) dst_var->type = obj->btf_type_map[src_var->type]; dst_var->size = src_var->size; dst_var->offset = src_sec->dst_off + src_var->offset; + + if (glob_sym) + glob_sym->var_idx = dst_sec->sec_var_cnt - 1; } } -- cgit v1.2.3 From 0a342457b3bd36e6f9b558da3ff520dee35c5363 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:42 -0700 Subject: libbpf: Support extern resolution for BTF-defined maps in .maps section Add extra logic to handle map externs (only BTF-defined maps are supported for linking). Re-use the map parsing logic used during bpf_object__open(). Map externs are currently restricted to always match complete map definition. So all the specified attributes will be compared (down to pining, map_flags, numa_node, etc). In the future this restriction might be relaxed with no backwards compatibility issues. If any attribute is mismatched between extern and actual map definition, linker will report an error, pointing out which one mismatches. The original intent was to allow for extern to specify attributes that matters (to user) to enforce. E.g., if you specify just key information and omit value, then any value fits. Similarly, it should have been possible to enforce map_flags, pinning, and any other possible map attribute. Unfortunately, that means that multiple externs can be only partially overlapping with each other, which means linker would need to combine their type definitions to end up with the most restrictive and fullest map definition. This requires an extra amount of BTF manipulation which at this time was deemed unnecessary and would require further extending generic BTF writer APIs. So that is left for future follow ups, if there will be demand for that. But the idea seems intresting and useful, so I want to document it here. Weak definitions are also supported, but are pretty strict as well, just like externs: all weak map definitions have to match exactly. In the follow up patches this most probably will be relaxed, with __weak map definitions being able to differ between each other (with non-weak definition always winning, of course). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-13-andrii@kernel.org --- tools/lib/bpf/linker.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 23f49945dc7a..9de084b1c699 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -1471,6 +1471,134 @@ recur: } } +static bool map_defs_match(const char *sym_name, + const struct btf *main_btf, + const struct btf_map_def *main_def, + const struct btf_map_def *main_inner_def, + const struct btf *extra_btf, + const struct btf_map_def *extra_def, + const struct btf_map_def *extra_inner_def) +{ + const char *reason; + + if (main_def->map_type != extra_def->map_type) { + reason = "type"; + goto mismatch; + } + + /* check key type/size match */ + if (main_def->key_size != extra_def->key_size) { + reason = "key_size"; + goto mismatch; + } + if (!!main_def->key_type_id != !!extra_def->key_type_id) { + reason = "key type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_KEY_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->key_type_id, + extra_btf, extra_def->key_type_id)) { + reason = "key type"; + goto mismatch; + } + + /* validate value type/size match */ + if (main_def->value_size != extra_def->value_size) { + reason = "value_size"; + goto mismatch; + } + if (!!main_def->value_type_id != !!extra_def->value_type_id) { + reason = "value type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_VALUE_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->value_type_id, + extra_btf, extra_def->value_type_id)) { + reason = "key type"; + goto mismatch; + } + + if (main_def->max_entries != extra_def->max_entries) { + reason = "max_entries"; + goto mismatch; + } + if (main_def->map_flags != extra_def->map_flags) { + reason = "map_flags"; + goto mismatch; + } + if (main_def->numa_node != extra_def->numa_node) { + reason = "numa_node"; + goto mismatch; + } + if (main_def->pinning != extra_def->pinning) { + reason = "pinning"; + goto mismatch; + } + + if ((main_def->parts & MAP_DEF_INNER_MAP) != (extra_def->parts & MAP_DEF_INNER_MAP)) { + reason = "inner map"; + goto mismatch; + } + + if (main_def->parts & MAP_DEF_INNER_MAP) { + char inner_map_name[128]; + + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", sym_name); + + return map_defs_match(inner_map_name, + main_btf, main_inner_def, NULL, + extra_btf, extra_inner_def, NULL); + } + + return true; + +mismatch: + pr_warn("global '%s': map %s mismatch\n", sym_name, reason); + return false; +} + +static bool glob_map_defs_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, int btf_id) +{ + struct btf_map_def dst_def = {}, dst_inner_def = {}; + struct btf_map_def src_def = {}, src_inner_def = {}; + const struct btf_type *t; + int err; + + t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(t)) { + pr_warn("global '%s': invalid map definition type [%d]\n", sym_name, btf_id); + return false; + } + t = skip_mods_and_typedefs(obj->btf, t->type, NULL); + + err = parse_btf_map_def(sym_name, obj->btf, t, true /*strict*/, &src_def, &src_inner_def); + if (err) { + pr_warn("global '%s': invalid map definition\n", sym_name); + return false; + } + + /* re-parse existing map definition */ + t = btf__type_by_id(linker->btf, glob_sym->btf_id); + t = skip_mods_and_typedefs(linker->btf, t->type, NULL); + err = parse_btf_map_def(sym_name, linker->btf, t, true /*strict*/, &dst_def, &dst_inner_def); + if (err) { + /* this should not happen, because we already validated it */ + pr_warn("global '%s': invalid dst map definition\n", sym_name); + return false; + } + + /* Currently extern map definition has to be complete and match + * concrete map definition exactly. This restriction might be lifted + * in the future. + */ + return map_defs_match(sym_name, linker->btf, &dst_def, &dst_inner_def, + obj->btf, &src_def, &src_inner_def); +} + static bool glob_syms_match(const char *sym_name, struct bpf_linker *linker, struct glob_sym *glob_sym, struct src_obj *obj, Elf64_Sym *sym, size_t sym_idx, int btf_id) @@ -1492,6 +1620,10 @@ static bool glob_syms_match(const char *sym_name, return false; } + /* deal with .maps definitions specially */ + if (glob_sym->sec_id && strcmp(linker->secs[glob_sym->sec_id].sec_name, MAPS_ELF_SEC) == 0) + return glob_map_defs_match(sym_name, linker, glob_sym, obj, sym, btf_id); + if (!glob_sym_btf_matches(sym_name, true /*exact*/, linker->btf, glob_sym->btf_id, obj->btf, btf_id)) return false; -- cgit v1.2.3 From 41c472e85b531a228067bee9be59a508900bcd9f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:43 -0700 Subject: selftests/bpf: Use -O0 instead of -Og in selftests builds While -Og is designed to work well with debugger, it's still inferior to -O0 in terms of debuggability experience. It will cause some variables to still be inlined, it will also prevent single-stepping some statements and otherwise interfere with debugging experience. So switch to -O0 which turns off any optimization and provides the best debugging experience. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-14-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c5bcdb3d4b12..dd8061069b08 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -21,7 +21,7 @@ endif BPF_GCC ?= $(shell command -v bpf-gcc;) SAN_CFLAGS ?= -CFLAGS += -g -Og -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS) \ +CFLAGS += -g -O0 -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT) \ -Dbpf_prog_load=bpf_prog_test_load \ @@ -205,7 +205,7 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ CC=$(HOSTCC) LD=$(HOSTLD) \ - EXTRA_CFLAGS='-g -Og' \ + EXTRA_CFLAGS='-g -O0' \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install @@ -225,7 +225,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ - EXTRA_CFLAGS='-g -Og' \ + EXTRA_CFLAGS='-g -O0' \ DESTDIR=$(SCRATCH_DIR) prefix= all install_headers ifneq ($(BPFOBJ),$(HOST_BPFOBJ)) @@ -233,7 +233,7 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(HOST_BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) \ - EXTRA_CFLAGS='-g -Og' \ + EXTRA_CFLAGS='-g -O0' \ OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif -- cgit v1.2.3 From b131aed910097a2eeac8180bf3cf214eea475d9a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:44 -0700 Subject: selftests/bpf: Omit skeleton generation for multi-linked BPF object files Skip generating individual BPF skeletons for files that are supposed to be linked together to form the final BPF object file. Very often such files are "incomplete" BPF object files, which will fail libbpf bpf_object__open() step, if used individually, thus failing BPF skeleton generation. This is by design, so skip individual BPF skeletons and only validate them as part of their linked final BPF object file and skeleton. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-15-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dd8061069b08..dd3692ec56d3 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -313,6 +313,8 @@ LINKED_SKELS := test_static_linked.skel.h test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o +LINKED_BPF_SRCS := $(patsubst %.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) + # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. # Parameters: @@ -331,7 +333,7 @@ TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c)) TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS)) TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ - $$(filter-out $(SKEL_BLACKLIST), \ + $$(filter-out $(SKEL_BLACKLIST) $(LINKED_BPF_SRCS),\ $$(TRUNNER_BPF_SRCS))) TRUNNER_BPF_SKELS_LINKED := $$(addprefix $$(TRUNNER_OUTPUT)/,$(LINKED_SKELS)) TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) -- cgit v1.2.3 From f2644fb44de9abd54e57b55f584c7c67526f7c02 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:45 -0700 Subject: selftests/bpf: Add function linking selftest Add selftest validating various aspects of statically linking functions: - no conflicts and correct resolution for name-conflicting static funcs; - correct resolution of extern functions; - correct handling of weak functions, both resolution itself and libbpf's handling of unused weak function that "lost" (it leaves gaps in code with no ELF symbols); - correct handling of hidden visibility to turn global function into "static" for the purpose of BPF verification. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-16-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/prog_tests/linked_funcs.c | 42 +++++++++++++ tools/testing/selftests/bpf/progs/linked_funcs1.c | 73 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/linked_funcs2.c | 73 ++++++++++++++++++++++ 4 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/linked_funcs.c create mode 100644 tools/testing/selftests/bpf/progs/linked_funcs1.c create mode 100644 tools/testing/selftests/bpf/progs/linked_funcs2.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dd3692ec56d3..ab7b129a8408 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -309,9 +309,10 @@ endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c -LINKED_SKELS := test_static_linked.skel.h +LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o +linked_funcs.skel.h-deps := linked_funcs1.o linked_funcs2.o LINKED_BPF_SRCS := $(patsubst %.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) diff --git a/tools/testing/selftests/bpf/prog_tests/linked_funcs.c b/tools/testing/selftests/bpf/prog_tests/linked_funcs.c new file mode 100644 index 000000000000..e9916f2817ec --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_funcs.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include +#include "linked_funcs.skel.h" + +void test_linked_funcs(void) +{ + int err; + struct linked_funcs *skel; + + skel = linked_funcs__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->rodata->my_tid = syscall(SYS_gettid); + skel->bss->syscall_id = SYS_getpgid; + + err = linked_funcs__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = linked_funcs__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_val1, 2000 + 2000, "output_val1"); + ASSERT_EQ(skel->bss->output_ctx1, SYS_getpgid, "output_ctx1"); + ASSERT_EQ(skel->bss->output_weak1, 42, "output_weak1"); + + ASSERT_EQ(skel->bss->output_val2, 2 * 1000 + 2 * (2 * 1000), "output_val2"); + ASSERT_EQ(skel->bss->output_ctx2, SYS_getpgid, "output_ctx2"); + /* output_weak2 should never be updated */ + ASSERT_EQ(skel->bss->output_weak2, 0, "output_weak2"); + +cleanup: + linked_funcs__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/linked_funcs1.c b/tools/testing/selftests/bpf/progs/linked_funcs1.c new file mode 100644 index 000000000000..b964ec1390c2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_funcs1.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +/* weak and shared between two files */ +const volatile int my_tid __weak; +long syscall_id __weak; + +int output_val1; +int output_ctx1; +int output_weak1; + +/* same "subprog" name in all files, but it's ok because they all are static */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 1; +} + +/* Global functions can't be void */ +int set_output_val1(int x) +{ + output_val1 = x + subprog(x); + return x; +} + +/* This function can't be verified as global, as it assumes raw_tp/sys_enter + * context and accesses syscall id (second argument). So we mark it as + * __hidden, so that libbpf will mark it as static in the final object file, + * right before verifying it in the kernel. + * + * But we don't mark it as __hidden here, rather at extern site. __hidden is + * "contaminating" visibility, so it will get propagated from either extern or + * actual definition (including from the losing __weak definition). + */ +void set_output_ctx1(__u64 *ctx) +{ + output_ctx1 = ctx[1]; /* long id, same as in BPF_PROG below */ +} + +/* this weak instance should win because it's the first one */ +__weak int set_output_weak(int x) +{ + output_weak1 = x; + return x; +} + +extern int set_output_val2(int x); + +/* here we'll force set_output_ctx2() to be __hidden in the final obj file */ +__hidden extern void set_output_ctx2(__u64 *ctx); + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler1, struct pt_regs *regs, long id) +{ + if (my_tid != (u32)bpf_get_current_pid_tgid() || id != syscall_id) + return 0; + + set_output_val2(1000); + set_output_ctx2(ctx); /* ctx definition is hidden in BPF_PROG macro */ + + /* keep input value the same across both files to avoid dependency on + * handler call order; differentiate by output_weak1 vs output_weak2. + */ + set_output_weak(42); + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_funcs2.c b/tools/testing/selftests/bpf/progs/linked_funcs2.c new file mode 100644 index 000000000000..575e958e60b7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_funcs2.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +/* weak and shared between both files */ +const volatile int my_tid __weak; +long syscall_id __weak; + +int output_val2; +int output_ctx2; +int output_weak2; /* should stay zero */ + +/* same "subprog" name in all files, but it's ok because they all are static */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 2; +} + +/* Global functions can't be void */ +int set_output_val2(int x) +{ + output_val2 = 2 * x + 2 * subprog(x); + return 2 * x; +} + +/* This function can't be verified as global, as it assumes raw_tp/sys_enter + * context and accesses syscall id (second argument). So we mark it as + * __hidden, so that libbpf will mark it as static in the final object file, + * right before verifying it in the kernel. + * + * But we don't mark it as __hidden here, rather at extern site. __hidden is + * "contaminating" visibility, so it will get propagated from either extern or + * actual definition (including from the losing __weak definition). + */ +void set_output_ctx2(__u64 *ctx) +{ + output_ctx2 = ctx[1]; /* long id, same as in BPF_PROG below */ +} + +/* this weak instance should lose, because it will be processed second */ +__weak int set_output_weak(int x) +{ + output_weak2 = x; + return 2 * x; +} + +extern int set_output_val1(int x); + +/* here we'll force set_output_ctx1() to be __hidden in the final obj file */ +__hidden extern void set_output_ctx1(__u64 *ctx); + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler2, struct pt_regs *regs, long id) +{ + if (my_tid != (u32)bpf_get_current_pid_tgid() || id != syscall_id) + return 0; + + set_output_val1(2000); + set_output_ctx1(ctx); /* ctx definition is hidden in BPF_PROG macro */ + + /* keep input value the same across both files to avoid dependency on + * handler call order; differentiate by output_weak1 vs output_weak2. + */ + set_output_weak(42); + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; -- cgit v1.2.3 From 14f1aae17ee13d08315873d4b68d573e91df892f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:46 -0700 Subject: selftests/bpf: Add global variables linking selftest Add selftest validating various aspects of statically linking global variables: - correct resolution of extern variables across .bss, .data, and .rodata sections; - correct handling of weak definitions; - correct de-duplication of repeating special externs (.kconfig, .ksyms). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-17-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 3 +- .../testing/selftests/bpf/prog_tests/linked_vars.c | 43 +++++++++++++++++ tools/testing/selftests/bpf/progs/linked_vars1.c | 54 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/linked_vars2.c | 55 ++++++++++++++++++++++ 4 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/linked_vars.c create mode 100644 tools/testing/selftests/bpf/progs/linked_vars1.c create mode 100644 tools/testing/selftests/bpf/progs/linked_vars2.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ab7b129a8408..411e2cf07ba5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -309,10 +309,11 @@ endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c -LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h +LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h linked_vars.skel.h test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o linked_funcs.skel.h-deps := linked_funcs1.o linked_funcs2.o +linked_vars.skel.h-deps := linked_vars1.o linked_vars2.o LINKED_BPF_SRCS := $(patsubst %.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) diff --git a/tools/testing/selftests/bpf/prog_tests/linked_vars.c b/tools/testing/selftests/bpf/prog_tests/linked_vars.c new file mode 100644 index 000000000000..267166abe4c1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_vars.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include +#include "linked_vars.skel.h" + +void test_linked_vars(void) +{ + int err; + struct linked_vars *skel; + + skel = linked_vars__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->input_bss1 = 1000; + skel->bss->input_bss2 = 2000; + skel->bss->input_bss_weak = 3000; + + err = linked_vars__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = linked_vars__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_bss1, 1000 + 2000 + 3000, "output_bss1"); + ASSERT_EQ(skel->bss->output_bss2, 1000 + 2000 + 3000, "output_bss2"); + /* 10 comes from "winner" input_data_weak in first obj file */ + ASSERT_EQ(skel->bss->output_data1, 1 + 2 + 10, "output_bss1"); + ASSERT_EQ(skel->bss->output_data2, 1 + 2 + 10, "output_bss2"); + /* 100 comes from "winner" input_rodata_weak in first obj file */ + ASSERT_EQ(skel->bss->output_rodata1, 11 + 22 + 100, "output_weak1"); + ASSERT_EQ(skel->bss->output_rodata2, 11 + 22 + 100, "output_weak2"); + +cleanup: + linked_vars__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/linked_vars1.c b/tools/testing/selftests/bpf/progs/linked_vars1.c new file mode 100644 index 000000000000..ef9e9d0bb0ca --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_vars1.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +extern int LINUX_KERNEL_VERSION __kconfig; +/* this weak extern will be strict due to the other file's strong extern */ +extern bool CONFIG_BPF_SYSCALL __kconfig __weak; +extern const void bpf_link_fops __ksym __weak; + +int input_bss1; +int input_data1 = 1; +const volatile int input_rodata1 = 11; + +int input_bss_weak __weak; +/* these two definitions should win */ +int input_data_weak __weak = 10; +const volatile int input_rodata_weak __weak = 100; + +extern int input_bss2; +extern int input_data2; +extern const int input_rodata2; + +int output_bss1; +int output_data1; +int output_rodata1; + +long output_sink1; + +static __noinline int get_bss_res(void) +{ + /* just make sure all the relocations work against .text as well */ + return input_bss1 + input_bss2 + input_bss_weak; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler1) +{ + output_bss1 = get_bss_res(); + output_data1 = input_data1 + input_data2 + input_data_weak; + output_rodata1 = input_rodata1 + input_rodata2 + input_rodata_weak; + + /* make sure we actually use above special externs, otherwise compiler + * will optimize them out + */ + output_sink1 = LINUX_KERNEL_VERSION + + CONFIG_BPF_SYSCALL + + (long)&bpf_link_fops; + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_vars2.c b/tools/testing/selftests/bpf/progs/linked_vars2.c new file mode 100644 index 000000000000..e4f5bd388a3c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_vars2.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +extern int LINUX_KERNEL_VERSION __kconfig; +/* when an extern is defined as both strong and weak, resulting symbol will be strong */ +extern bool CONFIG_BPF_SYSCALL __kconfig; +extern const void __start_BTF __ksym; + +int input_bss2; +int input_data2 = 2; +const volatile int input_rodata2 = 22; + +int input_bss_weak __weak; +/* these two weak variables should lose */ +int input_data_weak __weak = 20; +const volatile int input_rodata_weak __weak = 200; + +extern int input_bss1; +extern int input_data1; +extern const int input_rodata1; + +int output_bss2; +int output_data2; +int output_rodata2; + +int output_sink2; + +static __noinline int get_data_res(void) +{ + /* just make sure all the relocations work against .text as well */ + return input_data1 + input_data2 + input_data_weak; +} + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler2) +{ + output_bss2 = input_bss1 + input_bss2 + input_bss_weak; + output_data2 = get_data_res(); + output_rodata2 = input_rodata1 + input_rodata2 + input_rodata_weak; + + /* make sure we actually use above special externs, otherwise compiler + * will optimize them out + */ + output_sink2 = LINUX_KERNEL_VERSION + + CONFIG_BPF_SYSCALL + + (long)&__start_BTF; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; -- cgit v1.2.3 From 3b2ad502256b7f0f9415978fd7f158656d11401e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:47 -0700 Subject: selftests/bpf: Add map linking selftest Add selftest validating various aspects of statically linking BTF-defined map definitions. Legacy map definitions do not support extern resolution between object files. Some of the aspects validated: - correct resolution of extern maps against concrete map definitions; - extern maps can currently only specify map type and key/value size and/or type information; - weak concrete map definitions are resolved properly. Static map definitions are not yet supported by libbpf, so they are not explicitly tested, though manual testing showes that BPF linker handles them properly. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-18-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 4 +- .../testing/selftests/bpf/prog_tests/linked_maps.c | 30 ++++++++ tools/testing/selftests/bpf/progs/linked_maps1.c | 82 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/linked_maps2.c | 76 ++++++++++++++++++++ 4 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/linked_maps.c create mode 100644 tools/testing/selftests/bpf/progs/linked_maps1.c create mode 100644 tools/testing/selftests/bpf/progs/linked_maps2.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 411e2cf07ba5..283e5ad8385e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -309,11 +309,13 @@ endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c -LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h linked_vars.skel.h +LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ + linked_vars.skel.h linked_maps.skel.h test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o linked_funcs.skel.h-deps := linked_funcs1.o linked_funcs2.o linked_vars.skel.h-deps := linked_vars1.o linked_vars2.o +linked_maps.skel.h-deps := linked_maps1.o linked_maps2.o LINKED_BPF_SRCS := $(patsubst %.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) diff --git a/tools/testing/selftests/bpf/prog_tests/linked_maps.c b/tools/testing/selftests/bpf/prog_tests/linked_maps.c new file mode 100644 index 000000000000..85dcaaaf2775 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/linked_maps.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include +#include "linked_maps.skel.h" + +void test_linked_maps(void) +{ + int err; + struct linked_maps *skel; + + skel = linked_maps__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + err = linked_maps__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + syscall(SYS_getpgid); + + ASSERT_EQ(skel->bss->output_first1, 2000, "output_first1"); + ASSERT_EQ(skel->bss->output_second1, 2, "output_second1"); + ASSERT_EQ(skel->bss->output_weak1, 2, "output_weak1"); + +cleanup: + linked_maps__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/linked_maps1.c b/tools/testing/selftests/bpf/progs/linked_maps1.c new file mode 100644 index 000000000000..52291515cc72 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_maps1.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +struct my_key { long x; }; +struct my_value { long x; }; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct my_key); + __type(value, struct my_value); + __uint(max_entries, 16); +} map1 SEC(".maps"); + + /* Matches map2 definition in linked_maps2.c. Order of the attributes doesn't + * matter. + */ +typedef struct { + __uint(max_entries, 8); + __type(key, int); + __type(value, int); + __uint(type, BPF_MAP_TYPE_ARRAY); +} map2_t; + +extern map2_t map2 SEC(".maps"); + +/* This should be the winning map definition, but we have no way of verifying, + * so we just make sure that it links and works without errors + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 16); +} map_weak __weak SEC(".maps"); + +int output_first1; +int output_second1; +int output_weak1; + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler_enter1) +{ + /* update values with key = 1 */ + int key = 1, val = 1; + struct my_key key_struct = { .x = 1 }; + struct my_value val_struct = { .x = 1000 }; + + bpf_map_update_elem(&map1, &key_struct, &val_struct, 0); + bpf_map_update_elem(&map2, &key, &val, 0); + bpf_map_update_elem(&map_weak, &key, &val, 0); + + return 0; +} + +SEC("raw_tp/sys_exit") +int BPF_PROG(handler_exit1) +{ + /* lookup values with key = 2, set in another file */ + int key = 2, *val; + struct my_key key_struct = { .x = 2 }; + struct my_value *value_struct; + + value_struct = bpf_map_lookup_elem(&map1, &key_struct); + if (value_struct) + output_first1 = value_struct->x; + + val = bpf_map_lookup_elem(&map2, &key); + if (val) + output_second1 = *val; + + val = bpf_map_lookup_elem(&map_weak, &key); + if (val) + output_weak1 = *val; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_maps2.c b/tools/testing/selftests/bpf/progs/linked_maps2.c new file mode 100644 index 000000000000..0693687474ed --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_maps2.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +/* modifiers and typedefs are ignored when comparing key/value types */ +typedef struct my_key { long x; } key_type; +typedef struct my_value { long x; } value_type; + +extern struct { + __uint(max_entries, 16); + __type(key, key_type); + __type(value, value_type); + __uint(type, BPF_MAP_TYPE_HASH); +} map1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 8); +} map2 SEC(".maps"); + +/* this definition will lose, but it has to exactly match the winner */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 16); +} map_weak __weak SEC(".maps"); + +int output_first2; +int output_second2; +int output_weak2; + +SEC("raw_tp/sys_enter") +int BPF_PROG(handler_enter2) +{ + /* update values with key = 2 */ + int key = 2, val = 2; + key_type key_struct = { .x = 2 }; + value_type val_struct = { .x = 2000 }; + + bpf_map_update_elem(&map1, &key_struct, &val_struct, 0); + bpf_map_update_elem(&map2, &key, &val, 0); + bpf_map_update_elem(&map_weak, &key, &val, 0); + + return 0; +} + +SEC("raw_tp/sys_exit") +int BPF_PROG(handler_exit2) +{ + /* lookup values with key = 1, set in another file */ + int key = 1, *val; + key_type key_struct = { .x = 1 }; + value_type *value_struct; + + value_struct = bpf_map_lookup_elem(&map1, &key_struct); + if (value_struct) + output_first2 = value_struct->x; + + val = bpf_map_lookup_elem(&map2, &key); + if (val) + output_second2 = *val; + + val = bpf_map_lookup_elem(&map_weak, &key); + if (val) + output_weak2 = *val; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; -- cgit v1.2.3 From a9dab4e4569425e26cd9c2d8bdcc74bd12fcb8bf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 23 Apr 2021 11:13:48 -0700 Subject: selftests/bpf: Document latest Clang fix expectations for linking tests Document which fixes are required to generate correct static linking selftests. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210423181348.1801389-19-andrii@kernel.org --- tools/testing/selftests/bpf/README.rst | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 65fe318d1e71..3353778c30f8 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -193,3 +193,12 @@ Without it, the error from compiling bpf selftests looks like: libbpf: failed to find BTF for extern 'tcp_slow_start' [25] section: -2 __ https://reviews.llvm.org/D93563 + +Clang dependencies for static linking tests +=========================================== + +linked_vars, linked_maps, and linked_funcs tests depend on `Clang fix`__ to +generate valid BTF information for weak variables. Please make sure you use +Clang that contains the fix. + +__ https://reviews.llvm.org/D100362 -- cgit v1.2.3 From df8aee6d6fa520ff77f48d46ebd2034249669033 Mon Sep 17 00:00:00 2001 From: Yonglong Li Date: Fri, 23 Apr 2021 11:17:09 -0700 Subject: selftests: mptcp: add a test case for MSG_PEEK Extend mptcp_connect tool with MSG_PEEK support and add a test case in mptcp_connect.sh that checks the data received from/after recv() with MSG_PEEK. Acked-by: Paolo Abeni Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Yonglong Li Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 48 +++++++++++++++++++++- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 29 ++++++++++--- 2 files changed, 69 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 2f207cf33661..d88e1fdfb147 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -45,7 +45,14 @@ enum cfg_mode { CFG_MODE_SENDFILE, }; +enum cfg_peek { + CFG_NONE_PEEK, + CFG_WITH_PEEK, + CFG_AFTER_PEEK, +}; + static enum cfg_mode cfg_mode = CFG_MODE_POLL; +static enum cfg_peek cfg_peek = CFG_NONE_PEEK; static const char *cfg_host; static const char *cfg_port = "12000"; static int cfg_sock_proto = IPPROTO_MPTCP; @@ -73,6 +80,8 @@ static void die_usage(void) fprintf(stderr, "\t-M mark -- set socket packet mark\n"); fprintf(stderr, "\t-u -- check mptcp ulp\n"); fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n"); + fprintf(stderr, + "\t-P [saveWithPeek|saveAfterPeek] -- save data with/after MSG_PEEK form tcp socket\n"); exit(1); } @@ -331,6 +340,8 @@ static size_t do_write(const int fd, char *buf, const size_t len) static ssize_t do_rnd_read(const int fd, char *buf, const size_t len) { + int ret = 0; + char tmp[16384]; size_t cap = rand(); cap &= 0xffff; @@ -340,7 +351,17 @@ static ssize_t do_rnd_read(const int fd, char *buf, const size_t len) else if (cap > len) cap = len; - return read(fd, buf, cap); + if (cfg_peek == CFG_WITH_PEEK) { + ret = recv(fd, buf, cap, MSG_PEEK); + ret = (ret < 0) ? ret : read(fd, tmp, ret); + } else if (cfg_peek == CFG_AFTER_PEEK) { + ret = recv(fd, buf, cap, MSG_PEEK); + ret = (ret < 0) ? ret : read(fd, buf, cap); + } else { + ret = read(fd, buf, cap); + } + + return ret; } static void set_nonblock(int fd) @@ -819,6 +840,26 @@ int parse_mode(const char *mode) return 0; } +int parse_peek(const char *mode) +{ + if (!strcasecmp(mode, "saveWithPeek")) + return CFG_WITH_PEEK; + if (!strcasecmp(mode, "saveAfterPeek")) + return CFG_AFTER_PEEK; + + fprintf(stderr, "Unknown: %s\n", mode); + fprintf(stderr, "Supported MSG_PEEK mode are:\n"); + fprintf(stderr, + "\t\t\"saveWithPeek\" - recv data with flags 'MSG_PEEK' and save the peek data into file\n"); + fprintf(stderr, + "\t\t\"saveAfterPeek\" - read and save data into file after recv with flags 'MSG_PEEK'\n"); + + die_usage(); + + /* silence compiler warning */ + return 0; +} + static int parse_int(const char *size) { unsigned long s; @@ -846,7 +887,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "6jr:lp:s:hut:m:S:R:w:M:")) != -1) { + while ((c = getopt(argc, argv, "6jr:lp:s:hut:m:S:R:w:M:P:")) != -1) { switch (c) { case 'j': cfg_join = true; @@ -899,6 +940,9 @@ static void parse_opts(int argc, char **argv) case 'M': cfg_mark = strtol(optarg, NULL, 0); break; + case 'P': + cfg_peek = parse_peek(optarg); + break; } } diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 385cdc98aed8..9236609731b1 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -375,7 +375,7 @@ do_transfer() local srv_proto="$4" local connect_addr="$5" local local_addr="$6" - local extra_args="" + local extra_args="$7" local port port=$((10000+$TEST_COUNT)) @@ -394,9 +394,9 @@ do_transfer() fi if [ -n "$extra_args" ] && $options_log; then - options_log=false echo "INFO: extra options: $extra_args" fi + options_log=false :> "$cout" :> "$sout" @@ -589,6 +589,7 @@ run_tests_lo() local connector_ns="$2" local connect_addr="$3" local loopback="$4" + local extra_args="$5" local lret=0 # skip if test programs are running inside same netns for subsequent runs. @@ -608,7 +609,8 @@ run_tests_lo() local_addr="0.0.0.0" fi - do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${local_addr} + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP \ + ${connect_addr} ${local_addr} "${extra_args}" lret=$? if [ $lret -ne 0 ]; then ret=$lret @@ -622,14 +624,16 @@ run_tests_lo() fi fi - do_transfer ${listener_ns} ${connector_ns} MPTCP TCP ${connect_addr} ${local_addr} + do_transfer ${listener_ns} ${connector_ns} MPTCP TCP \ + ${connect_addr} ${local_addr} "${extra_args}" lret=$? if [ $lret -ne 0 ]; then ret=$lret return 1 fi - do_transfer ${listener_ns} ${connector_ns} TCP MPTCP ${connect_addr} ${local_addr} + do_transfer ${listener_ns} ${connector_ns} TCP MPTCP \ + ${connect_addr} ${local_addr} "${extra_args}" lret=$? if [ $lret -ne 0 ]; then ret=$lret @@ -637,7 +641,8 @@ run_tests_lo() fi if [ $do_tcp -gt 1 ] ;then - do_transfer ${listener_ns} ${connector_ns} TCP TCP ${connect_addr} ${local_addr} + do_transfer ${listener_ns} ${connector_ns} TCP TCP \ + ${connect_addr} ${local_addr} "${extra_args}" lret=$? if [ $lret -ne 0 ]; then ret=$lret @@ -653,6 +658,15 @@ run_tests() run_tests_lo $1 $2 $3 0 } +run_tests_peekmode() +{ + local peekmode="$1" + + echo "INFO: with peek mode: ${peekmode}" + run_tests_lo "$ns1" "$ns1" 10.0.1.1 1 "-P ${peekmode}" + run_tests_lo "$ns1" "$ns1" dead:beef:1::1 1 "-P ${peekmode}" +} + make_file "$cin" "client" make_file "$sin" "server" @@ -732,6 +746,9 @@ for sender in $ns1 $ns2 $ns3 $ns4;do run_tests "$ns4" $sender dead:beef:3::1 done +run_tests_peekmode "saveWithPeek" +run_tests_peekmode "saveAfterPeek" + time_end=$(date +%s) time_run=$((time_end-time_start)) -- cgit v1.2.3 From b881d089c7c9c7032da812cda1b4b0818f477780 Mon Sep 17 00:00:00 2001 From: Po-Hsu Lin Date: Fri, 23 Apr 2021 19:15:38 +0800 Subject: selftests/net: bump timeout to 5 minutes We found that with the latest mainline kernel (5.12.0-051200rc8) on some KVM instances / bare-metal systems, the following tests will take longer than the kselftest framework default timeout (45 seconds) to run and thus got terminated with TIMEOUT error: * xfrm_policy.sh - took about 2m20s * pmtu.sh - took about 3m5s * udpgso_bench.sh - took about 60s Bump the timeout setting to 5 minutes to allow them have a chance to finish. https://bugs.launchpad.net/bugs/1856010 Signed-off-by: Po-Hsu Lin Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 2 ++ tools/testing/selftests/net/settings | 1 + 2 files changed, 3 insertions(+) create mode 100644 tools/testing/selftests/net/settings (limited to 'tools') diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index f4242a961088..3915bb7bfc39 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -39,6 +39,8 @@ TEST_GEN_FILES += ipsec TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls +TEST_FILES := settings + KSFT_KHDR_INSTALL := 1 include ../lib.mk diff --git a/tools/testing/selftests/net/settings b/tools/testing/selftests/net/settings new file mode 100644 index 000000000000..694d70710ff0 --- /dev/null +++ b/tools/testing/selftests/net/settings @@ -0,0 +1 @@ +timeout=300 -- cgit v1.2.3 From b61442df748f06e98085fb604093a6215ce730eb Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 16 Apr 2021 22:00:51 +0900 Subject: tools: do not include scripts/Kbuild.include Since commit 57fd251c7896 ("kbuild: split cc-option and friends to scripts/Makefile.compiler"), some kselftests fail to build. The tools/ directory opted out Kbuild, and went in a different direction. People copied scripts and Makefiles to the tools/ directory to create their own build system. tools/build/Build.include mimics scripts/Kbuild.include, but some tool Makefiles include the Kbuild one to import a feature that is missing in tools/build/Build.include: - Commit ec04aa3ae87b ("tools/thermal: tmon: use "-fstack-protector" only if supported") included scripts/Kbuild.include from tools/thermal/tmon/Makefile to import the cc-option macro. - Commit c2390f16fc5b ("selftests: kvm: fix for compilers that do not support -no-pie") included scripts/Kbuild.include from tools/testing/selftests/kvm/Makefile to import the try-run macro. - Commit 9cae4ace80ef ("selftests/bpf: do not ignore clang failures") included scripts/Kbuild.include from tools/testing/selftests/bpf/Makefile to import the .DELETE_ON_ERROR target. - Commit 0695f8bca93e ("selftests/powerpc: Handle Makefile for unrecognized option") included scripts/Kbuild.include from tools/testing/selftests/powerpc/pmu/ebb/Makefile to import the try-run macro. Copy what they need into tools/build/Build.include, and make them include it instead of scripts/Kbuild.include. Link: https://lore.kernel.org/lkml/86dadf33-70f7-a5ac-cb8c-64966d2f45a1@linux.ibm.com/ Fixes: 57fd251c7896 ("kbuild: split cc-option and friends to scripts/Makefile.compiler") Reported-by: Janosch Frank Reported-by: Christian Borntraeger Signed-off-by: Masahiro Yamada Tested-by: Christian Borntraeger Acked-by: Yonghong Song --- tools/build/Build.include | 24 ++++++++++++++++++++++++ tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/kvm/Makefile | 2 +- tools/testing/selftests/powerpc/pmu/ebb/Makefile | 2 +- tools/thermal/tmon/Makefile | 2 +- 5 files changed, 28 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/build/Build.include b/tools/build/Build.include index 585486e40995..2cf3b1bde86e 100644 --- a/tools/build/Build.include +++ b/tools/build/Build.include @@ -100,3 +100,27 @@ cxx_flags = -Wp,-MD,$(depfile) -Wp,-MT,$@ $(CXXFLAGS) -D"BUILD_STR(s)=\#s" $(CXX ## HOSTCC C flags host_c_flags = -Wp,-MD,$(depfile) -Wp,-MT,$@ $(KBUILD_HOSTCFLAGS) -D"BUILD_STR(s)=\#s" $(HOSTCFLAGS_$(basetarget).o) $(HOSTCFLAGS_$(obj)) + +# output directory for tests below +TMPOUT = .tmp_$$$$ + +# try-run +# Usage: option = $(call try-run, $(CC)...-o "$$TMP",option-ok,otherwise) +# Exit code chooses option. "$$TMP" serves as a temporary file and is +# automatically cleaned up. +try-run = $(shell set -e; \ + TMP=$(TMPOUT)/tmp; \ + mkdir -p $(TMPOUT); \ + trap "rm -rf $(TMPOUT)" EXIT; \ + if ($(1)) >/dev/null 2>&1; \ + then echo "$(2)"; \ + else echo "$(3)"; \ + fi) + +# cc-option +# Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) +cc-option = $(call try-run, \ + $(CC) -Werror $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) + +# delete partially updated (i.e. corrupted) files on error +.DELETE_ON_ERROR: diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 044bfdcf5b74..17a5cdf48d37 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -include ../../../../scripts/Kbuild.include +include ../../../build/Build.include include ../../../scripts/Makefile.arch include ../../../scripts/Makefile.include diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a6d61f451f88..5ef141f265bd 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -include ../../../../scripts/Kbuild.include +include ../../../build/Build.include all: diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index af3df79d8163..c5ecb4634094 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -include ../../../../../../scripts/Kbuild.include +include ../../../../../build/Build.include noarg: $(MAKE) -C ../../ diff --git a/tools/thermal/tmon/Makefile b/tools/thermal/tmon/Makefile index 59e417ec3e13..9db867df7679 100644 --- a/tools/thermal/tmon/Makefile +++ b/tools/thermal/tmon/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # We need this for the "cc-option" macro. -include ../../../scripts/Kbuild.include +include ../../build/Build.include VERSION = 1.0 -- cgit v1.2.3 From 464c62f6f6e1c836d7aae68dbf46101de84fdcb7 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Mon, 29 Mar 2021 15:09:03 +0800 Subject: perf vendor events intel: Add missing skylake & icelake model numbers Kernel has supported COMETLAKE/COMETLAKE_L to use the SKYLAKE events and supported TIGERLAKE_L/TIGERLAKE/ROCKETLAKE to use the ICELAKE events. But pmu-events mapfile.csv is missing these model numbers. Now add the missing model numbers to mapfile.csv. Signed-off-by: Jin Yao Reviewed-by: Andi Kleen Cc: Alexander Shishkin Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210329070903.8894-1-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/mapfile.csv | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 9f97b32533a0..0a6a8c7f937f 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -24,6 +24,7 @@ GenuineIntel-6-1F,v2,nehalemep,core GenuineIntel-6-1A,v2,nehalemep,core GenuineIntel-6-2E,v2,nehalemex,core GenuineIntel-6-[4589]E,v24,skylake,core +GenuineIntel-6-A[56],v24,skylake,core GenuineIntel-6-37,v13,silvermont,core GenuineIntel-6-4D,v13,silvermont,core GenuineIntel-6-4C,v13,silvermont,core @@ -35,6 +36,8 @@ GenuineIntel-6-55-[01234],v1,skylakex,core GenuineIntel-6-55-[56789ABCDEF],v1,cascadelakex,core GenuineIntel-6-7D,v1,icelake,core GenuineIntel-6-7E,v1,icelake,core +GenuineIntel-6-8[CD],v1,icelake,core +GenuineIntel-6-A7,v1,icelake,core GenuineIntel-6-86,v1,tremontx,core AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),v2,amdzen1,core AuthenticAMD-23-[[:xdigit:]]+,v1,amdzen2,core -- cgit v1.2.3 From 56dda5a48f4f610ac9a0487c6fb64d31950e4a3e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Apr 2021 21:51:43 +0200 Subject: selftests/bpf: Add re-attach test to fentry_test Adding the test to re-attach (detach/attach again) tracing fentry programs, plus check that already linked program can't be attached again. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210414195147.1624932-4-jolsa@kernel.org --- .../testing/selftests/bpf/prog_tests/fentry_test.c | 52 +++++++++++++++------- 1 file changed, 37 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index 04ebbf1cb390..7cb111b11995 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -3,35 +3,57 @@ #include #include "fentry_test.skel.h" -void test_fentry_test(void) +static int fentry_test(struct fentry_test *fentry_skel) { - struct fentry_test *fentry_skel = NULL; int err, prog_fd, i; __u32 duration = 0, retval; + struct bpf_link *link; __u64 *result; - fentry_skel = fentry_test__open_and_load(); - if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) - goto cleanup; - err = fentry_test__attach(fentry_skel); - if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err)) - goto cleanup; + if (!ASSERT_OK(err, "fentry_attach")) + return err; + + /* Check that already linked program can't be attached again. */ + link = bpf_program__attach(fentry_skel->progs.test1); + if (!ASSERT_ERR_PTR(link, "fentry_attach_link")) + return -1; prog_fd = bpf_program__fd(fentry_skel->progs.test1); err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "test_run", - "err %d errno %d retval %d duration %d\n", - err, errno, retval, duration); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(retval, 0, "test_run"); result = (__u64 *)fentry_skel->bss; - for (i = 0; i < 6; i++) { - if (CHECK(result[i] != 1, "result", - "fentry_test%d failed err %lld\n", i + 1, result[i])) - goto cleanup; + for (i = 0; i < sizeof(*fentry_skel->bss) / sizeof(__u64); i++) { + if (!ASSERT_EQ(result[i], 1, "fentry_result")) + return -1; } + fentry_test__detach(fentry_skel); + + /* zero results for re-attach test */ + memset(fentry_skel->bss, 0, sizeof(*fentry_skel->bss)); + return 0; +} + +void test_fentry_test(void) +{ + struct fentry_test *fentry_skel = NULL; + int err; + + fentry_skel = fentry_test__open_and_load(); + if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load")) + goto cleanup; + + err = fentry_test(fentry_skel); + if (!ASSERT_OK(err, "fentry_first_attach")) + goto cleanup; + + err = fentry_test(fentry_skel); + ASSERT_OK(err, "fentry_second_attach"); + cleanup: fentry_test__destroy(fentry_skel); } -- cgit v1.2.3 From 8caadc43f2019caebbf314f7a6ae2faed791e783 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Apr 2021 21:51:44 +0200 Subject: selftests/bpf: Add re-attach test to fexit_test Adding the test to re-attach (detach/attach again) tracing fexit programs, plus check that already linked program can't be attached again. Also switching to ASSERT* macros. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210414195147.1624932-5-jolsa@kernel.org --- .../testing/selftests/bpf/prog_tests/fexit_test.c | 52 +++++++++++++++------- 1 file changed, 37 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c index 78d7a2765c27..6792e41f7f69 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c @@ -3,35 +3,57 @@ #include #include "fexit_test.skel.h" -void test_fexit_test(void) +static int fexit_test(struct fexit_test *fexit_skel) { - struct fexit_test *fexit_skel = NULL; int err, prog_fd, i; __u32 duration = 0, retval; + struct bpf_link *link; __u64 *result; - fexit_skel = fexit_test__open_and_load(); - if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) - goto cleanup; - err = fexit_test__attach(fexit_skel); - if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) - goto cleanup; + if (!ASSERT_OK(err, "fexit_attach")) + return err; + + /* Check that already linked program can't be attached again. */ + link = bpf_program__attach(fexit_skel->progs.test1); + if (!ASSERT_ERR_PTR(link, "fexit_attach_link")) + return -1; prog_fd = bpf_program__fd(fexit_skel->progs.test1); err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "test_run", - "err %d errno %d retval %d duration %d\n", - err, errno, retval, duration); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(retval, 0, "test_run"); result = (__u64 *)fexit_skel->bss; - for (i = 0; i < 6; i++) { - if (CHECK(result[i] != 1, "result", - "fexit_test%d failed err %lld\n", i + 1, result[i])) - goto cleanup; + for (i = 0; i < sizeof(*fexit_skel->bss) / sizeof(__u64); i++) { + if (!ASSERT_EQ(result[i], 1, "fexit_result")) + return -1; } + fexit_test__detach(fexit_skel); + + /* zero results for re-attach test */ + memset(fexit_skel->bss, 0, sizeof(*fexit_skel->bss)); + return 0; +} + +void test_fexit_test(void) +{ + struct fexit_test *fexit_skel = NULL; + int err; + + fexit_skel = fexit_test__open_and_load(); + if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load")) + goto cleanup; + + err = fexit_test(fexit_skel); + if (!ASSERT_OK(err, "fexit_first_attach")) + goto cleanup; + + err = fexit_test(fexit_skel); + ASSERT_OK(err, "fexit_second_attach"); + cleanup: fexit_test__destroy(fexit_skel); } -- cgit v1.2.3 From cede72ad367a105852e814ef91717aac4383b853 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Apr 2021 21:51:45 +0200 Subject: selftests/bpf: Add re-attach test to lsm test Adding the test to re-attach (detach/attach again) lsm programs, plus check that already linked program can't be attached again. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210414195147.1624932-6-jolsa@kernel.org --- tools/testing/selftests/bpf/prog_tests/test_lsm.c | 48 ++++++++++++++++++----- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index 2755e4f81499..d492e76e01cf 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -18,6 +18,8 @@ char *CMD_ARGS[] = {"true", NULL}; #define GET_PAGE_ADDR(ADDR, PAGE_SIZE) \ (char *)(((unsigned long) (ADDR + PAGE_SIZE)) & ~(PAGE_SIZE-1)) +static int duration = 0; + int stack_mprotect(void) { void *buf; @@ -51,23 +53,25 @@ int exec_cmd(int *monitored_pid) return -EINVAL; } -void test_test_lsm(void) +static int test_lsm(struct lsm *skel) { - struct lsm *skel = NULL; - int err, duration = 0; + struct bpf_link *link; int buf = 1234; - - skel = lsm__open_and_load(); - if (CHECK(!skel, "skel_load", "lsm skeleton failed\n")) - goto close_prog; + int err; err = lsm__attach(skel); if (CHECK(err, "attach", "lsm attach failed: %d\n", err)) - goto close_prog; + return err; + + /* Check that already linked program can't be attached again. */ + link = bpf_program__attach(skel->progs.test_int_hook); + if (CHECK(!IS_ERR(link), "attach_link", + "re-attach without detach should not succeed")) + return -1; err = exec_cmd(&skel->bss->monitored_pid); if (CHECK(err < 0, "exec_cmd", "err %d errno %d\n", err, errno)) - goto close_prog; + return err; CHECK(skel->bss->bprm_count != 1, "bprm_count", "bprm_count = %d\n", skel->bss->bprm_count); @@ -77,7 +81,7 @@ void test_test_lsm(void) err = stack_mprotect(); if (CHECK(errno != EPERM, "stack_mprotect", "want err=EPERM, got %d\n", errno)) - goto close_prog; + return err; CHECK(skel->bss->mprotect_count != 1, "mprotect_count", "mprotect_count = %d\n", skel->bss->mprotect_count); @@ -89,6 +93,30 @@ void test_test_lsm(void) CHECK(skel->bss->copy_test != 3, "copy_test", "copy_test = %d\n", skel->bss->copy_test); + lsm__detach(skel); + + skel->bss->copy_test = 0; + skel->bss->bprm_count = 0; + skel->bss->mprotect_count = 0; + return 0; +} + +void test_test_lsm(void) +{ + struct lsm *skel = NULL; + int err; + + skel = lsm__open_and_load(); + if (CHECK(!skel, "lsm_skel_load", "lsm skeleton failed\n")) + goto close_prog; + + err = test_lsm(skel); + if (CHECK(err, "test_lsm", "first attach failed\n")) + goto close_prog; + + err = test_lsm(skel); + CHECK(err, "test_lsm", "second attach failed\n"); + close_prog: lsm__destroy(skel); } -- cgit v1.2.3 From a1c05c3b09e0a92b26b94650837bf06c664beb1b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Apr 2021 21:51:46 +0200 Subject: selftests/bpf: Test that module can't be unloaded with attached trampoline Adding test to verify that once we attach module's trampoline, the module can't be unloaded. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210414195147.1624932-7-jolsa@kernel.org --- .../selftests/bpf/prog_tests/module_attach.c | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index 5bc53d53d86e..d85a69b7ce44 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -45,12 +45,18 @@ static int trigger_module_test_write(int write_sz) return 0; } +static int delete_module(const char *name, int flags) +{ + return syscall(__NR_delete_module, name, flags); +} + void test_module_attach(void) { const int READ_SZ = 456; const int WRITE_SZ = 457; struct test_module_attach* skel; struct test_module_attach__bss *bss; + struct bpf_link *link; int err; skel = test_module_attach__open(); @@ -84,6 +90,23 @@ void test_module_attach(void) ASSERT_EQ(bss->fexit_ret, -EIO, "fexit_tet"); ASSERT_EQ(bss->fmod_ret_read_sz, READ_SZ, "fmod_ret"); + test_module_attach__detach(skel); + + /* attach fentry/fexit and make sure it get's module reference */ + link = bpf_program__attach(skel->progs.handle_fentry); + if (!ASSERT_OK_PTR(link, "attach_fentry")) + goto cleanup; + + ASSERT_ERR(delete_module("bpf_testmod", 0), "delete_module"); + bpf_link__destroy(link); + + link = bpf_program__attach(skel->progs.handle_fexit); + if (!ASSERT_OK_PTR(link, "attach_fexit")) + goto cleanup; + + ASSERT_ERR(delete_module("bpf_testmod", 0), "delete_module"); + bpf_link__destroy(link); + cleanup: test_module_attach__destroy(skel); } -- cgit v1.2.3 From 7bb2cc19aee8f7150851bb8668c9ff655a5e7678 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Apr 2021 21:51:47 +0200 Subject: selftests/bpf: Use ASSERT macros in lsm test Replacing CHECK with ASSERT macros. Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210414195147.1624932-8-jolsa@kernel.org --- tools/testing/selftests/bpf/prog_tests/test_lsm.c | 27 +++++++++-------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index d492e76e01cf..244c01125126 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -18,8 +18,6 @@ char *CMD_ARGS[] = {"true", NULL}; #define GET_PAGE_ADDR(ADDR, PAGE_SIZE) \ (char *)(((unsigned long) (ADDR + PAGE_SIZE)) & ~(PAGE_SIZE-1)) -static int duration = 0; - int stack_mprotect(void) { void *buf; @@ -60,38 +58,33 @@ static int test_lsm(struct lsm *skel) int err; err = lsm__attach(skel); - if (CHECK(err, "attach", "lsm attach failed: %d\n", err)) + if (!ASSERT_OK(err, "attach")) return err; /* Check that already linked program can't be attached again. */ link = bpf_program__attach(skel->progs.test_int_hook); - if (CHECK(!IS_ERR(link), "attach_link", - "re-attach without detach should not succeed")) + if (!ASSERT_ERR_PTR(link, "attach_link")) return -1; err = exec_cmd(&skel->bss->monitored_pid); - if (CHECK(err < 0, "exec_cmd", "err %d errno %d\n", err, errno)) + if (!ASSERT_OK(err, "exec_cmd")) return err; - CHECK(skel->bss->bprm_count != 1, "bprm_count", "bprm_count = %d\n", - skel->bss->bprm_count); + ASSERT_EQ(skel->bss->bprm_count, 1, "bprm_count"); skel->bss->monitored_pid = getpid(); err = stack_mprotect(); - if (CHECK(errno != EPERM, "stack_mprotect", "want err=EPERM, got %d\n", - errno)) + if (!ASSERT_EQ(errno, EPERM, "stack_mprotect")) return err; - CHECK(skel->bss->mprotect_count != 1, "mprotect_count", - "mprotect_count = %d\n", skel->bss->mprotect_count); + ASSERT_EQ(skel->bss->mprotect_count, 1, "mprotect_count"); syscall(__NR_setdomainname, &buf, -2L); syscall(__NR_setdomainname, 0, -3L); syscall(__NR_setdomainname, ~0L, -4L); - CHECK(skel->bss->copy_test != 3, "copy_test", - "copy_test = %d\n", skel->bss->copy_test); + ASSERT_EQ(skel->bss->copy_test, 3, "copy_test"); lsm__detach(skel); @@ -107,15 +100,15 @@ void test_test_lsm(void) int err; skel = lsm__open_and_load(); - if (CHECK(!skel, "lsm_skel_load", "lsm skeleton failed\n")) + if (!ASSERT_OK_PTR(skel, "lsm_skel_load")) goto close_prog; err = test_lsm(skel); - if (CHECK(err, "test_lsm", "first attach failed\n")) + if (!ASSERT_OK(err, "test_lsm_first_attach")) goto close_prog; err = test_lsm(skel); - CHECK(err, "test_lsm", "second attach failed\n"); + ASSERT_OK(err, "test_lsm_second_attach"); close_prog: lsm__destroy(skel); -- cgit v1.2.3 From d4787579d2133370ab47963c6527e79731df5b2a Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 27 Apr 2021 03:31:38 +0800 Subject: selftests: kvm: Fix the check of return value In vm_vcpu_rm() and kvm_vm_release(), a stale return value is checked in TEST_ASSERT macro. Fix it by assigning variable ret with correct return value. Signed-off-by: Zhenzhong Duan Message-Id: <20210426193138.118276-1-zhenzhong.duan@intel.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 8926f91d1a98..fc83f6c5902d 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -520,7 +520,7 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct vcpu *vcpu) ret = munmap(vcpu->state, vcpu_mmap_sz()); TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i " "errno: %i", ret, errno); - close(vcpu->fd); + ret = close(vcpu->fd); TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i " "errno: %i", ret, errno); @@ -540,7 +540,7 @@ void kvm_vm_release(struct kvm_vm *vmp) TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); - close(vmp->kvm_fd); + ret = close(vmp->kvm_fd); TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n" " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); } -- cgit v1.2.3 From 3bf0fcd754345d7ea63e1446015ba65ece6788ca Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 26 Apr 2021 15:01:21 +0200 Subject: KVM: selftests: Speed up set_memory_region_test After commit 4fc096a99e01 ("KVM: Raise the maximum number of user memslots") set_memory_region_test may take too long, reports are that the default timeout value we have (120s) may not be enough even on a physical host. Speed things up a bit by throwing away vm_userspace_mem_region_add() usage from test_add_max_memory_regions(), we don't really need to do the majority of the stuff it does for the sake of this test. On my AMD EPYC 7401P, # time ./set_memory_region_test pre-patch: Testing KVM_RUN with zero added memory regions Allowed number of memory slots: 32764 Adding slots 0..32763, each memory region with 2048K size Testing MOVE of in-use region, 10 loops Testing DELETE of in-use region, 10 loops real 0m44.917s user 0m7.416s sys 0m34.601s post-patch: Testing KVM_RUN with zero added memory regions Allowed number of memory slots: 32764 Adding slots 0..32763, each memory region with 2048K size Testing MOVE of in-use region, 10 loops Testing DELETE of in-use region, 10 loops real 0m20.714s user 0m0.109s sys 0m18.359s Reported-by: kernel test robot Signed-off-by: Vitaly Kuznetsov Message-Id: <20210426130121.758229-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/set_memory_region_test.c | 61 ++++++++++++++++------ 1 file changed, 45 insertions(+), 16 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index f127ed31dba7..978f5b5f4dc0 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -329,6 +329,22 @@ static void test_zero_memory_regions(void) } #endif /* __x86_64__ */ +static int test_memory_region_add(struct kvm_vm *vm, void *mem, uint32_t slot, + uint32_t size, uint64_t guest_addr) +{ + struct kvm_userspace_memory_region region; + int ret; + + region.slot = slot; + region.flags = 0; + region.guest_phys_addr = guest_addr; + region.memory_size = size; + region.userspace_addr = (uintptr_t) mem; + ret = ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION, ®ion); + + return ret; +} + /* * Test it can be added memory slots up to KVM_CAP_NR_MEMSLOTS, then any * tentative to add further slots should fail. @@ -339,9 +355,15 @@ static void test_add_max_memory_regions(void) struct kvm_vm *vm; uint32_t max_mem_slots; uint32_t slot; - uint64_t guest_addr = 0x0; - uint64_t mem_reg_npages; - void *mem; + void *mem, *mem_aligned, *mem_extra; + size_t alignment; + +#ifdef __s390x__ + /* On s390x, the host address must be aligned to 1M (due to PGSTEs) */ + alignment = 0x100000; +#else + alignment = 1; +#endif max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); TEST_ASSERT(max_mem_slots > 0, @@ -350,30 +372,37 @@ static void test_add_max_memory_regions(void) vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); - mem_reg_npages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, MEM_REGION_SIZE); - /* Check it can be added memory slots up to the maximum allowed */ pr_info("Adding slots 0..%i, each memory region with %dK size\n", (max_mem_slots - 1), MEM_REGION_SIZE >> 10); + + mem = mmap(NULL, MEM_REGION_SIZE * max_mem_slots + alignment, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host"); + mem_aligned = (void *)(((size_t) mem + alignment - 1) & ~(alignment - 1)); + for (slot = 0; slot < max_mem_slots; slot++) { - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - guest_addr, slot, mem_reg_npages, - 0); - guest_addr += MEM_REGION_SIZE; + ret = test_memory_region_add(vm, mem_aligned + + ((uint64_t)slot * MEM_REGION_SIZE), + slot, MEM_REGION_SIZE, + (uint64_t)slot * MEM_REGION_SIZE); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + " rc: %i errno: %i slot: %i\n", + ret, errno, slot); } /* Check it cannot be added memory slots beyond the limit */ - mem = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host"); + mem_extra = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + TEST_ASSERT(mem_extra != MAP_FAILED, "Failed to mmap() host"); - ret = ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION, - &(struct kvm_userspace_memory_region) {slot, 0, guest_addr, - MEM_REGION_SIZE, (uint64_t) mem}); + ret = test_memory_region_add(vm, mem_extra, max_mem_slots, MEM_REGION_SIZE, + (uint64_t)max_mem_slots * MEM_REGION_SIZE); TEST_ASSERT(ret == -1 && errno == EINVAL, "Adding one more memory slot should fail with EINVAL"); - munmap(mem, MEM_REGION_SIZE); + munmap(mem, MEM_REGION_SIZE * max_mem_slots + alignment); + munmap(mem_extra, MEM_REGION_SIZE); kvm_vm_free(vm); } -- cgit v1.2.3 From 7a2fa70aaffc2f8823feca22709a00f5c069a8a9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 26 Apr 2021 12:29:45 -0700 Subject: selftests/bpf: Add remaining ASSERT_xxx() variants Add ASSERT_TRUE/ASSERT_FALSE for conditions calculated with custom logic to true/false. Also add remaining arithmetical assertions: - ASSERT_LE -- less than or equal; - ASSERT_GT -- greater than; - ASSERT_GE -- greater than or equal. This should cover most scenarios where people fall back to error-prone CHECK()s. Also extend ASSERT_ERR() to print out errno, in addition to direct error. Also convert few CHECK() instances to ensure new ASSERT_xxx() variants work as expected. Subsequent patch will also use ASSERT_TRUE/ASSERT_FALSE more extensively. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20210426192949.416837-2-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +- .../testing/selftests/bpf/prog_tests/btf_endian.c | 4 +- .../testing/selftests/bpf/prog_tests/cgroup_link.c | 2 +- tools/testing/selftests/bpf/prog_tests/kfree_skb.c | 2 +- .../selftests/bpf/prog_tests/resolve_btfids.c | 7 +-- .../selftests/bpf/prog_tests/snprintf_btf.c | 4 +- tools/testing/selftests/bpf/test_progs.h | 50 +++++++++++++++++++++- 7 files changed, 56 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index c60091ee8a21..5e129dc2073c 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -77,7 +77,7 @@ static int test_btf_dump_case(int n, struct btf_dump_test_case *t) snprintf(out_file, sizeof(out_file), "/tmp/%s.output.XXXXXX", t->file); fd = mkstemp(out_file); - if (CHECK(fd < 0, "create_tmp", "failed to create file: %d\n", fd)) { + if (!ASSERT_GE(fd, 0, "create_tmp")) { err = fd; goto done; } diff --git a/tools/testing/selftests/bpf/prog_tests/btf_endian.c b/tools/testing/selftests/bpf/prog_tests/btf_endian.c index 8c52d72c876e..8ab5d3e358dd 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_endian.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_endian.c @@ -6,8 +6,6 @@ #include #include -static int duration = 0; - void test_btf_endian() { #if __BYTE_ORDER == __LITTLE_ENDIAN enum btf_endianness endian = BTF_LITTLE_ENDIAN; @@ -71,7 +69,7 @@ void test_btf_endian() { /* now modify original BTF */ var_id = btf__add_var(btf, "some_var", BTF_VAR_GLOBAL_ALLOCATED, 1); - CHECK(var_id <= 0, "var_id", "failed %d\n", var_id); + ASSERT_GT(var_id, 0, "var_id"); btf__free(swap_btf); swap_btf = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c index 4d9b514b3fd9..736796e56ed1 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c @@ -54,7 +54,7 @@ void test_cgroup_link(void) for (i = 0; i < cg_nr; i++) { cgs[i].fd = create_and_get_cgroup(cgs[i].path); - if (CHECK(cgs[i].fd < 0, "cg_create", "fail: %d\n", cgs[i].fd)) + if (!ASSERT_GE(cgs[i].fd, 0, "cg_create")) goto cleanup; } diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c index 42c3a3103c26..d65107919998 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c @@ -134,7 +134,7 @@ void test_kfree_skb(void) /* make sure kfree_skb program was triggered * and it sent expected skb into ring buffer */ - CHECK_FAIL(!passed); + ASSERT_TRUE(passed, "passed"); err = bpf_map_lookup_elem(bpf_map__fd(global_data), &zero, test_ok); if (CHECK(err, "get_result", diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index 6ace5e9efec1..d3c2de2c24d1 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -160,11 +160,8 @@ int test_resolve_btfids(void) break; if (i > 0) { - ret = CHECK(test_set.ids[i - 1] > test_set.ids[i], - "sort_check", - "test_set is not sorted\n"); - if (ret) - break; + if (!ASSERT_LE(test_set.ids[i - 1], test_set.ids[i], "sort_check")) + return -1; } } diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf_btf.c b/tools/testing/selftests/bpf/prog_tests/snprintf_btf.c index 686b40f11a45..76e1f5fe18fa 100644 --- a/tools/testing/selftests/bpf/prog_tests/snprintf_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/snprintf_btf.c @@ -42,9 +42,7 @@ void test_snprintf_btf(void) * and it set expected return values from bpf_trace_printk()s * and all tests ran. */ - if (CHECK(bss->ret <= 0, - "bpf_snprintf_btf: got return value", - "ret <= 0 %ld test %d\n", bss->ret, bss->ran_subtests)) + if (!ASSERT_GT(bss->ret, 0, "bpf_snprintf_ret")) goto cleanup; if (CHECK(bss->ran_subtests == 0, "check if subtests ran", diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index ee7e3b45182a..dda52cb649dc 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -130,6 +130,20 @@ extern int test__join_cgroup(const char *path); #define CHECK_ATTR(condition, tag, format...) \ _CHECK(condition, tag, tattr.duration, format) +#define ASSERT_TRUE(actual, name) ({ \ + static int duration = 0; \ + bool ___ok = (actual); \ + CHECK(!___ok, (name), "unexpected %s: got FALSE\n", (name)); \ + ___ok; \ +}) + +#define ASSERT_FALSE(actual, name) ({ \ + static int duration = 0; \ + bool ___ok = !(actual); \ + CHECK(!___ok, (name), "unexpected %s: got TRUE\n", (name)); \ + ___ok; \ +}) + #define ASSERT_EQ(actual, expected, name) ({ \ static int duration = 0; \ typeof(actual) ___act = (actual); \ @@ -163,6 +177,39 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_LE(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act <= ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld > expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + +#define ASSERT_GT(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act > ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld <= expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + +#define ASSERT_GE(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act >= ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld < expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + #define ASSERT_STREQ(actual, expected, name) ({ \ static int duration = 0; \ const char *___act = actual; \ @@ -178,7 +225,8 @@ extern int test__join_cgroup(const char *path); static int duration = 0; \ long long ___res = (res); \ bool ___ok = ___res == 0; \ - CHECK(!___ok, (name), "unexpected error: %lld\n", ___res); \ + CHECK(!___ok, (name), "unexpected error: %lld (errno %d)\n", \ + ___res, errno); \ ___ok; \ }) -- cgit v1.2.3 From 6709a914c8498f42b1498b3d31f4b078d092fd35 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 26 Apr 2021 12:29:46 -0700 Subject: libbpf: Support BTF_KIND_FLOAT during type compatibility checks in CO-RE Add BTF_KIND_FLOAT support when doing CO-RE field type compatibility check. Without this, relocations against float/double fields will fail. Also adjust one error message to emit instruction index instead of less convenient instruction byte offset. Fixes: 22541a9eeb0d ("libbpf: Add BTF_KIND_FLOAT support") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20210426192949.416837-3-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a1cddd17af7d..e2a3cf437814 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5115,6 +5115,7 @@ err_out: * least one of enums should be anonymous; * - for ENUMs, check sizes, names are ignored; * - for INT, size and signedness are ignored; + * - any two FLOATs are always compatible; * - for ARRAY, dimensionality is ignored, element types are checked for * compatibility recursively; * - everything else shouldn't be ever a target of relocation. @@ -5141,6 +5142,7 @@ recur: switch (btf_kind(local_type)) { case BTF_KIND_PTR: + case BTF_KIND_FLOAT: return 1; case BTF_KIND_FWD: case BTF_KIND_ENUM: { @@ -6245,8 +6247,8 @@ patch_insn: /* bpf_core_patch_insn() should know how to handle missing targ_spec */ err = bpf_core_patch_insn(prog, relo, relo_idx, &targ_res); if (err) { - pr_warn("prog '%s': relo #%d: failed to patch insn at offset %d: %d\n", - prog->name, relo_idx, relo->insn_off, err); + pr_warn("prog '%s': relo #%d: failed to patch insn #%zu: %d\n", + prog->name, relo_idx, relo->insn_off / BPF_INSN_SZ, err); return -EINVAL; } -- cgit v1.2.3 From 0f20615d64ee2ad5e2a133a812382d0c4071589b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 26 Apr 2021 12:29:47 -0700 Subject: selftests/bpf: Fix BPF_CORE_READ_BITFIELD() macro Fix BPF_CORE_READ_BITFIELD() macro used for reading CO-RE-relocatable bitfields. Missing breaks in a switch caused 8-byte reads always. This can confuse libbpf because it does strict checks that memory load size corresponds to the original size of the field, which in this case quite often would be wrong. After fixing that, we run into another problem, which quite subtle, so worth documenting here. The issue is in Clang optimization and CO-RE relocation interactions. Without that asm volatile construct (also known as barrier_var()), Clang will re-order BYTE_OFFSET and BYTE_SIZE relocations and will apply BYTE_OFFSET 4 times for each switch case arm. This will result in the same error from libbpf about mismatch of memory load size and original field size. I.e., if we were reading u32, we'd still have *(u8 *), *(u16 *), *(u32 *), and *(u64 *) memory loads, three of which will fail. Using barrier_var() forces Clang to apply BYTE_OFFSET relocation first (and once) to calculate p, after which value of p is used without relocation in each of switch case arms, doing appropiately-sized memory load. Here's the list of relevant relocations and pieces of generated BPF code before and after this patch for test_core_reloc_bitfields_direct selftests. BEFORE ===== #45: core_reloc: insn #160 --> [5] + 0:5: byte_sz --> struct core_reloc_bitfields.u32 #46: core_reloc: insn #167 --> [5] + 0:5: byte_off --> struct core_reloc_bitfields.u32 #47: core_reloc: insn #174 --> [5] + 0:5: byte_off --> struct core_reloc_bitfields.u32 #48: core_reloc: insn #178 --> [5] + 0:5: byte_off --> struct core_reloc_bitfields.u32 #49: core_reloc: insn #182 --> [5] + 0:5: byte_off --> struct core_reloc_bitfields.u32 157: 18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0 ll 159: 7b 12 20 01 00 00 00 00 *(u64 *)(r2 + 288) = r1 160: b7 02 00 00 04 00 00 00 r2 = 4 ; BYTE_SIZE relocation here ^^^ 161: 66 02 07 00 03 00 00 00 if w2 s> 3 goto +7 162: 16 02 0d 00 01 00 00 00 if w2 == 1 goto +13 163: 16 02 01 00 02 00 00 00 if w2 == 2 goto +1 164: 05 00 12 00 00 00 00 00 goto +18 0000000000000528 : 165: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll 167: 69 11 08 00 00 00 00 00 r1 = *(u16 *)(r1 + 8) ; BYTE_OFFSET relo here w/ WRONG size ^^^^^^^^^^^^^^^^ 168: 05 00 0e 00 00 00 00 00 goto +14 0000000000000548 : 169: 16 02 0a 00 04 00 00 00 if w2 == 4 goto +10 170: 16 02 01 00 08 00 00 00 if w2 == 8 goto +1 171: 05 00 0b 00 00 00 00 00 goto +11 0000000000000560 : 172: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll 174: 79 11 08 00 00 00 00 00 r1 = *(u64 *)(r1 + 8) ; BYTE_OFFSET relo here w/ WRONG size ^^^^^^^^^^^^^^^^ 175: 05 00 07 00 00 00 00 00 goto +7 0000000000000580 : 176: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll 178: 71 11 08 00 00 00 00 00 r1 = *(u8 *)(r1 + 8) ; BYTE_OFFSET relo here w/ WRONG size ^^^^^^^^^^^^^^^^ 179: 05 00 03 00 00 00 00 00 goto +3 00000000000005a0 : 180: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll 182: 61 11 08 00 00 00 00 00 r1 = *(u32 *)(r1 + 8) ; BYTE_OFFSET relo here w/ RIGHT size ^^^^^^^^^^^^^^^^ 00000000000005b8 : 183: 67 01 00 00 20 00 00 00 r1 <<= 32 184: b7 02 00 00 00 00 00 00 r2 = 0 185: 16 02 02 00 00 00 00 00 if w2 == 0 goto +2 186: c7 01 00 00 20 00 00 00 r1 s>>= 32 187: 05 00 01 00 00 00 00 00 goto +1 00000000000005e0 : 188: 77 01 00 00 20 00 00 00 r1 >>= 32 AFTER ===== #30: core_reloc: insn #132 --> [5] + 0:5: byte_off --> struct core_reloc_bitfields.u32 #31: core_reloc: insn #134 --> [5] + 0:5: byte_sz --> struct core_reloc_bitfields.u32 129: 18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0 ll 131: 7b 12 20 01 00 00 00 00 *(u64 *)(r2 + 288) = r1 132: b7 01 00 00 08 00 00 00 r1 = 8 ; BYTE_OFFSET relo here ^^^ ; no size check for non-memory dereferencing instructions 133: 0f 12 00 00 00 00 00 00 r2 += r1 134: b7 03 00 00 04 00 00 00 r3 = 4 ; BYTE_SIZE relocation here ^^^ 135: 66 03 05 00 03 00 00 00 if w3 s> 3 goto +5 136: 16 03 09 00 01 00 00 00 if w3 == 1 goto +9 137: 16 03 01 00 02 00 00 00 if w3 == 2 goto +1 138: 05 00 0a 00 00 00 00 00 goto +10 0000000000000458 : 139: 69 21 00 00 00 00 00 00 r1 = *(u16 *)(r2 + 0) ; NO CO-RE relocation here ^^^^^^^^^^^^^^^^ 140: 05 00 08 00 00 00 00 00 goto +8 0000000000000468 : 141: 16 03 06 00 04 00 00 00 if w3 == 4 goto +6 142: 16 03 01 00 08 00 00 00 if w3 == 8 goto +1 143: 05 00 05 00 00 00 00 00 goto +5 0000000000000480 : 144: 79 21 00 00 00 00 00 00 r1 = *(u64 *)(r2 + 0) ; NO CO-RE relocation here ^^^^^^^^^^^^^^^^ 145: 05 00 03 00 00 00 00 00 goto +3 0000000000000490 : 146: 71 21 00 00 00 00 00 00 r1 = *(u8 *)(r2 + 0) ; NO CO-RE relocation here ^^^^^^^^^^^^^^^^ 147: 05 00 01 00 00 00 00 00 goto +1 00000000000004a0 : 148: 61 21 00 00 00 00 00 00 r1 = *(u32 *)(r2 + 0) ; NO CO-RE relocation here ^^^^^^^^^^^^^^^^ 00000000000004a8 : 149: 67 01 00 00 20 00 00 00 r1 <<= 32 150: b7 02 00 00 00 00 00 00 r2 = 0 151: 16 02 02 00 00 00 00 00 if w2 == 0 goto +2 152: c7 01 00 00 20 00 00 00 r1 s>>= 32 153: 05 00 01 00 00 00 00 00 goto +1 00000000000004d0 : 154: 77 01 00 00 20 00 00 00 r1 >>= 323 Fixes: ee26dade0e3b ("libbpf: Add support for relocatable bitfields") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20210426192949.416837-4-andrii@kernel.org --- tools/lib/bpf/bpf_core_read.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index 53b3e199fb25..09ebe3db5f2f 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -88,11 +88,19 @@ enum bpf_enum_value_kind { const void *p = (const void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \ unsigned long long val; \ \ + /* This is a so-called barrier_var() operation that makes specified \ + * variable "a black box" for optimizing compiler. \ + * It forces compiler to perform BYTE_OFFSET relocation on p and use \ + * its calculated value in the switch below, instead of applying \ + * the same relocation 4 times for each individual memory load. \ + */ \ + asm volatile("" : "=r"(p) : "0"(p)); \ + \ switch (__CORE_RELO(s, field, BYTE_SIZE)) { \ - case 1: val = *(const unsigned char *)p; \ - case 2: val = *(const unsigned short *)p; \ - case 4: val = *(const unsigned int *)p; \ - case 8: val = *(const unsigned long long *)p; \ + case 1: val = *(const unsigned char *)p; break; \ + case 2: val = *(const unsigned short *)p; break; \ + case 4: val = *(const unsigned int *)p; break; \ + case 8: val = *(const unsigned long long *)p; break; \ } \ val <<= __CORE_RELO(s, field, LSHIFT_U64); \ if (__CORE_RELO(s, field, SIGNED)) \ -- cgit v1.2.3 From 5a30eb23922b52f33222c6729b6b3ff1c37a6c66 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 26 Apr 2021 12:29:48 -0700 Subject: selftests/bpf: Fix field existence CO-RE reloc tests Negative field existence cases for have a broken assumption that FIELD_EXISTS CO-RE relo will fail for fields that match the name but have incompatible type signature. That's not how CO-RE relocations generally behave. Types and fields that match by name but not by expected type are treated as non-matching candidates and are skipped. Error later is reported if no matching candidate was found. That's what happens for most relocations, but existence relocations (FIELD_EXISTS and TYPE_EXISTS) are more permissive and they are designed to return 0 or 1, depending if a match is found. This allows to handle name-conflicting but incompatible types in BPF code easily. Combined with ___flavor suffixes, it's possible to handle pretty much any structural type changes in kernel within the compiled once BPF source code. So, long story short, negative field existence test cases are invalid in their assumptions, so this patch reworks them into a single consolidated positive case that doesn't match any of the fields. Fixes: c7566a69695c ("selftests/bpf: Add field existence CO-RE relocs tests") Reported-by: Lorenz Bauer Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20210426192949.416837-5-andrii@kernel.org --- .../testing/selftests/bpf/prog_tests/core_reloc.c | 31 +++++++++++++--------- ...tf__core_reloc_existence___err_wrong_arr_kind.c | 3 --- ...re_reloc_existence___err_wrong_arr_value_type.c | 3 --- ...tf__core_reloc_existence___err_wrong_int_kind.c | 3 --- .../btf__core_reloc_existence___err_wrong_int_sz.c | 3 --- ...tf__core_reloc_existence___err_wrong_int_type.c | 3 --- ..._core_reloc_existence___err_wrong_struct_type.c | 3 --- .../btf__core_reloc_existence___wrong_field_defs.c | 3 +++ .../testing/selftests/bpf/progs/core_reloc_types.h | 20 ++------------ 9 files changed, 24 insertions(+), 48 deletions(-) delete mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_kind.c delete mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_value_type.c delete mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_kind.c delete mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_sz.c delete mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_type.c delete mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_struct_type.c create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_existence___wrong_field_defs.c (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index d94dcead72e6..385fd7696a2e 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -210,11 +210,6 @@ static int duration = 0; .bpf_obj_file = "test_core_reloc_existence.o", \ .btf_src_file = "btf__core_reloc_" #name ".o" \ -#define FIELD_EXISTS_ERR_CASE(name) { \ - FIELD_EXISTS_CASE_COMMON(name), \ - .fails = true, \ -} - #define BITFIELDS_CASE_COMMON(objfile, test_name_prefix, name) \ .case_name = test_name_prefix#name, \ .bpf_obj_file = objfile, \ @@ -643,13 +638,25 @@ static struct core_reloc_test_case test_cases[] = { }, .output_len = sizeof(struct core_reloc_existence_output), }, - - FIELD_EXISTS_ERR_CASE(existence__err_int_sz), - FIELD_EXISTS_ERR_CASE(existence__err_int_type), - FIELD_EXISTS_ERR_CASE(existence__err_int_kind), - FIELD_EXISTS_ERR_CASE(existence__err_arr_kind), - FIELD_EXISTS_ERR_CASE(existence__err_arr_value_type), - FIELD_EXISTS_ERR_CASE(existence__err_struct_type), + { + FIELD_EXISTS_CASE_COMMON(existence___wrong_field_defs), + .input = STRUCT_TO_CHAR_PTR(core_reloc_existence___wrong_field_defs) { + }, + .input_len = sizeof(struct core_reloc_existence___wrong_field_defs), + .output = STRUCT_TO_CHAR_PTR(core_reloc_existence_output) { + .a_exists = 0, + .b_exists = 0, + .c_exists = 0, + .arr_exists = 0, + .s_exists = 0, + .a_value = 0xff000001u, + .b_value = 0xff000002u, + .c_value = 0xff000003u, + .arr_value = 0xff000004u, + .s_value = 0xff000005u, + }, + .output_len = sizeof(struct core_reloc_existence_output), + }, /* bitfield relocation checks */ BITFIELDS_CASE(bitfields, { diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_kind.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_kind.c deleted file mode 100644 index dd0ffa518f36..000000000000 --- a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_kind.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_arr_kind x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_value_type.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_value_type.c deleted file mode 100644 index bc83372088ad..000000000000 --- a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_arr_value_type.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_arr_value_type x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_kind.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_kind.c deleted file mode 100644 index 917bec41be08..000000000000 --- a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_kind.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_int_kind x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_sz.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_sz.c deleted file mode 100644 index 6ec7e6ec1c91..000000000000 --- a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_sz.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_int_sz x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_type.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_type.c deleted file mode 100644 index 7bbcacf2b0d1..000000000000 --- a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_int_type.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_int_type x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_struct_type.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_struct_type.c deleted file mode 100644 index f384dd38ec70..000000000000 --- a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___err_wrong_struct_type.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "core_reloc_types.h" - -void f(struct core_reloc_existence___err_wrong_struct_type x) {} diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___wrong_field_defs.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___wrong_field_defs.c new file mode 100644 index 000000000000..d14b496190c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___wrong_field_defs.c @@ -0,0 +1,3 @@ +#include "core_reloc_types.h" + +void f(struct core_reloc_existence___wrong_field_defs x) {} diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 9982eb969048..c95c0cabe951 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -700,27 +700,11 @@ struct core_reloc_existence___minimal { int a; }; -struct core_reloc_existence___err_wrong_int_sz { - short a; -}; - -struct core_reloc_existence___err_wrong_int_type { +struct core_reloc_existence___wrong_field_defs { + void *a; int b[1]; -}; - -struct core_reloc_existence___err_wrong_int_kind { struct{ int x; } c; -}; - -struct core_reloc_existence___err_wrong_arr_kind { int arr; -}; - -struct core_reloc_existence___err_wrong_arr_value_type { - short arr[1]; -}; - -struct core_reloc_existence___err_wrong_struct_type { int s; }; -- cgit v1.2.3 From bede0ebf0be87e9678103486a77f39e0334c6791 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 26 Apr 2021 12:29:49 -0700 Subject: selftests/bpf: Fix core_reloc test runner Fix failed tests checks in core_reloc test runner, which allowed failing tests to pass quietly. Also add extra check to make sure that expected to fail test cases with invalid names are caught as test failure anyway, as this is not an expected failure mode. Also fix mislabeled probed vs direct bitfield test cases. Fixes: 124a892d1c41 ("selftests/bpf: Test TYPE_EXISTS and TYPE_SIZE CO-RE relocations") Reported-by: Lorenz Bauer Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20210426192949.416837-6-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/core_reloc.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 385fd7696a2e..607710826dca 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -217,7 +217,7 @@ static int duration = 0; #define BITFIELDS_CASE(name, ...) { \ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.o", \ - "direct:", name), \ + "probed:", name), \ .input = STRUCT_TO_CHAR_PTR(core_reloc_##name) __VA_ARGS__, \ .input_len = sizeof(struct core_reloc_##name), \ .output = STRUCT_TO_CHAR_PTR(core_reloc_bitfields_output) \ @@ -225,7 +225,7 @@ static int duration = 0; .output_len = sizeof(struct core_reloc_bitfields_output), \ }, { \ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \ - "probed:", name), \ + "direct:", name), \ .input = STRUCT_TO_CHAR_PTR(core_reloc_##name) __VA_ARGS__, \ .input_len = sizeof(struct core_reloc_##name), \ .output = STRUCT_TO_CHAR_PTR(core_reloc_bitfields_output) \ @@ -546,8 +546,7 @@ static struct core_reloc_test_case test_cases[] = { ARRAYS_ERR_CASE(arrays___err_too_small), ARRAYS_ERR_CASE(arrays___err_too_shallow), ARRAYS_ERR_CASE(arrays___err_non_array), - ARRAYS_ERR_CASE(arrays___err_wrong_val_type1), - ARRAYS_ERR_CASE(arrays___err_wrong_val_type2), + ARRAYS_ERR_CASE(arrays___err_wrong_val_type), ARRAYS_ERR_CASE(arrays___err_bad_zero_sz_arr), /* enum/ptr/int handling scenarios */ @@ -865,13 +864,20 @@ void test_core_reloc(void) "prog '%s' not found\n", probe_name)) goto cleanup; + + if (test_case->btf_src_file) { + err = access(test_case->btf_src_file, R_OK); + if (!ASSERT_OK(err, "btf_src_file")) + goto cleanup; + } + load_attr.obj = obj; load_attr.log_level = 0; load_attr.target_btf_path = test_case->btf_src_file; err = bpf_object__load_xattr(&load_attr); if (err) { if (!test_case->fails) - CHECK(false, "obj_load", "failed to load prog '%s': %d\n", probe_name, err); + ASSERT_OK(err, "obj_load"); goto cleanup; } @@ -910,10 +916,8 @@ void test_core_reloc(void) goto cleanup; } - if (test_case->fails) { - CHECK(false, "obj_load_fail", "should fail to load prog '%s'\n", probe_name); + if (!ASSERT_FALSE(test_case->fails, "obj_load_should_fail")) goto cleanup; - } equal = memcmp(data->out, test_case->output, test_case->output_len) == 0; -- cgit v1.2.3 From 10bf4e83167cc68595b85fd73bb91e8f2c086e36 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 23 Apr 2021 13:59:55 +0000 Subject: bpf: Fix propagation of 32 bit unsigned bounds from 64 bit bounds Similarly as b02709587ea3 ("bpf: Fix propagation of 32-bit signed bounds from 64-bit bounds."), we also need to fix the propagation of 32 bit unsigned bounds from 64 bit counterparts. That is, really only set the u32_{min,max}_value when /both/ {umin,umax}_value safely fit in 32 bit space. For example, the register with a umin_value == 1 does /not/ imply that u32_min_value is also equal to 1, since umax_value could be much larger than 32 bit subregister can hold, and thus u32_min_value is in the interval [0,1] instead. Before fix, invalid tracking result of R2_w=inv1: [...] 5: R0_w=inv1337 R1=ctx(id=0,off=0,imm=0) R2_w=inv(id=0) R10=fp0 5: (35) if r2 >= 0x1 goto pc+1 [...] // goto path 7: R0=inv1337 R1=ctx(id=0,off=0,imm=0) R2=inv(id=0,umin_value=1) R10=fp0 7: (b6) if w2 <= 0x1 goto pc+1 [...] // goto path 9: R0=inv1337 R1=ctx(id=0,off=0,imm=0) R2=inv(id=0,smin_value=-9223372036854775807,smax_value=9223372032559808513,umin_value=1,umax_value=18446744069414584321,var_off=(0x1; 0xffffffff00000000),s32_min_value=1,s32_max_value=1,u32_max_value=1) R10=fp0 9: (bc) w2 = w2 10: R0=inv1337 R1=ctx(id=0,off=0,imm=0) R2_w=inv1 R10=fp0 [...] After fix, correct tracking result of R2_w=inv(id=0,umax_value=1,var_off=(0x0; 0x1)): [...] 5: R0_w=inv1337 R1=ctx(id=0,off=0,imm=0) R2_w=inv(id=0) R10=fp0 5: (35) if r2 >= 0x1 goto pc+1 [...] // goto path 7: R0=inv1337 R1=ctx(id=0,off=0,imm=0) R2=inv(id=0,umin_value=1) R10=fp0 7: (b6) if w2 <= 0x1 goto pc+1 [...] // goto path 9: R0=inv1337 R1=ctx(id=0,off=0,imm=0) R2=inv(id=0,smax_value=9223372032559808513,umax_value=18446744069414584321,var_off=(0x0; 0xffffffff00000001),s32_min_value=0,s32_max_value=1,u32_max_value=1) R10=fp0 9: (bc) w2 = w2 10: R0=inv1337 R1=ctx(id=0,off=0,imm=0) R2_w=inv(id=0,umax_value=1,var_off=(0x0; 0x1)) R10=fp0 [...] Thus, same issue as in b02709587ea3 holds for unsigned subregister tracking. Also, align __reg64_bound_u32() similarly to __reg64_bound_s32() as done in b02709587ea3 to make them uniform again. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Manfred Paul (@_manfp) Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 +++----- tools/testing/selftests/bpf/verifier/array_access.c | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 637462e9b6ee..9145f88b2a0a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1398,9 +1398,7 @@ static bool __reg64_bound_s32(s64 a) static bool __reg64_bound_u32(u64 a) { - if (a > U32_MIN && a < U32_MAX) - return true; - return false; + return a > U32_MIN && a < U32_MAX; } static void __reg_combine_64_into_32(struct bpf_reg_state *reg) @@ -1411,10 +1409,10 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg) reg->s32_min_value = (s32)reg->smin_value; reg->s32_max_value = (s32)reg->smax_value; } - if (__reg64_bound_u32(reg->umin_value)) + if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) { reg->u32_min_value = (u32)reg->umin_value; - if (__reg64_bound_u32(reg->umax_value)) reg->u32_max_value = (u32)reg->umax_value; + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index 1b138cd2b187..1b1c798e9248 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -186,7 +186,7 @@ }, .fixup_map_hash_48b = { 3 }, .errstr_unpriv = "R0 leaks addr", - .errstr = "invalid access to map value, value_size=48 off=44 size=8", + .errstr = "R0 unbounded memory access", .result_unpriv = REJECT, .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, -- cgit v1.2.3 From 3733bfbbdd28f7a65340d0058d15d15190a4944a Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Sat, 24 Apr 2021 18:45:10 -0300 Subject: bpf, selftests: Update array map tests for per-cpu batched ops Follows the same logic as the hashtable tests. Signed-off-by: Pedro Tammela Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210424214510.806627-3-pctammela@mojatatu.com --- .../selftests/bpf/map_tests/array_map_batch_ops.c | 104 +++++++++++++++------ 1 file changed, 75 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c index e42ea1195d18..f4d870da7684 100644 --- a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c @@ -9,10 +9,13 @@ #include +static int nr_cpus; + static void map_batch_update(int map_fd, __u32 max_entries, int *keys, - int *values) + __s64 *values, bool is_pcpu) { - int i, err; + int i, j, err; + int cpu_offset = 0; DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = 0, .flags = 0, @@ -20,22 +23,41 @@ static void map_batch_update(int map_fd, __u32 max_entries, int *keys, for (i = 0; i < max_entries; i++) { keys[i] = i; - values[i] = i + 1; + if (is_pcpu) { + cpu_offset = i * nr_cpus; + for (j = 0; j < nr_cpus; j++) + (values + cpu_offset)[j] = i + 1 + j; + } else { + values[i] = i + 1; + } } err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts); CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno)); } -static void map_batch_verify(int *visited, __u32 max_entries, - int *keys, int *values) +static void map_batch_verify(int *visited, __u32 max_entries, int *keys, + __s64 *values, bool is_pcpu) { - int i; + int i, j; + int cpu_offset = 0; memset(visited, 0, max_entries * sizeof(*visited)); for (i = 0; i < max_entries; i++) { - CHECK(keys[i] + 1 != values[i], "key/value checking", - "error: i %d key %d value %d\n", i, keys[i], values[i]); + if (is_pcpu) { + cpu_offset = i * nr_cpus; + for (j = 0; j < nr_cpus; j++) { + __s64 value = (values + cpu_offset)[j]; + CHECK(keys[i] + j + 1 != value, + "key/value checking", + "error: i %d j %d key %d value %lld\n", i, + j, keys[i], value); + } + } else { + CHECK(keys[i] + 1 != values[i], "key/value checking", + "error: i %d key %d value %lld\n", i, keys[i], + values[i]); + } visited[i] = 1; } for (i = 0; i < max_entries; i++) { @@ -44,19 +66,21 @@ static void map_batch_verify(int *visited, __u32 max_entries, } } -void test_array_map_batch_ops(void) +static void __test_map_lookup_and_update_batch(bool is_pcpu) { struct bpf_create_map_attr xattr = { .name = "array_map", - .map_type = BPF_MAP_TYPE_ARRAY, + .map_type = is_pcpu ? BPF_MAP_TYPE_PERCPU_ARRAY : + BPF_MAP_TYPE_ARRAY, .key_size = sizeof(int), - .value_size = sizeof(int), + .value_size = sizeof(__s64), }; - int map_fd, *keys, *values, *visited; + int map_fd, *keys, *visited; __u32 count, total, total_success; const __u32 max_entries = 10; __u64 batch = 0; - int err, step; + int err, step, value_size; + void *values; DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = 0, .flags = 0, @@ -67,22 +91,23 @@ void test_array_map_batch_ops(void) CHECK(map_fd == -1, "bpf_create_map_xattr()", "error:%s\n", strerror(errno)); - keys = malloc(max_entries * sizeof(int)); - values = malloc(max_entries * sizeof(int)); - visited = malloc(max_entries * sizeof(int)); + value_size = sizeof(__s64); + if (is_pcpu) + value_size *= nr_cpus; + + keys = calloc(max_entries, sizeof(*keys)); + values = calloc(max_entries, value_size); + visited = calloc(max_entries, sizeof(*visited)); CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno)); - /* populate elements to the map */ - map_batch_update(map_fd, max_entries, keys, values); - /* test 1: lookup in a loop with various steps. */ total_success = 0; for (step = 1; step < max_entries; step++) { - map_batch_update(map_fd, max_entries, keys, values); - map_batch_verify(visited, max_entries, keys, values); + map_batch_update(map_fd, max_entries, keys, values, is_pcpu); + map_batch_verify(visited, max_entries, keys, values, is_pcpu); memset(keys, 0, max_entries * sizeof(*keys)); - memset(values, 0, max_entries * sizeof(*values)); + memset(values, 0, max_entries * value_size); batch = 0; total = 0; /* iteratively lookup/delete elements with 'step' @@ -91,10 +116,10 @@ void test_array_map_batch_ops(void) count = step; while (true) { err = bpf_map_lookup_batch(map_fd, - total ? &batch : NULL, &batch, - keys + total, - values + total, - &count, &opts); + total ? &batch : NULL, + &batch, keys + total, + values + total * value_size, + &count, &opts); CHECK((err && errno != ENOENT), "lookup with steps", "error: %s\n", strerror(errno)); @@ -108,7 +133,7 @@ void test_array_map_batch_ops(void) CHECK(total != max_entries, "lookup with steps", "total = %u, max_entries = %u\n", total, max_entries); - map_batch_verify(visited, max_entries, keys, values); + map_batch_verify(visited, max_entries, keys, values, is_pcpu); total_success++; } @@ -116,9 +141,30 @@ void test_array_map_batch_ops(void) CHECK(total_success == 0, "check total_success", "unexpected failure\n"); - printf("%s:PASS\n", __func__); - free(keys); free(values); free(visited); } + +static void array_map_batch_ops(void) +{ + __test_map_lookup_and_update_batch(false); + printf("test_%s:PASS\n", __func__); +} + +static void array_percpu_map_batch_ops(void) +{ + __test_map_lookup_and_update_batch(true); + printf("test_%s:PASS\n", __func__); +} + +void test_array_map_batch_ops(void) +{ + nr_cpus = libbpf_num_possible_cpus(); + + CHECK(nr_cpus < 0, "nr_cpus checking", + "error: get possible cpus failed"); + + array_map_batch_ops(); + array_percpu_map_batch_ops(); +} -- cgit v1.2.3 From ec8149fba64b719a618b432ce9eea7ce937a523c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sun, 25 Apr 2021 14:43:29 -0700 Subject: perf util: Move bpf_perf definitions to a libperf header By following the same protocol, other tools can share hardware PMCs with perf. Move perf_event_attr_map_entry and BPF_PERF_DEFAULT_ATTR_MAP_PATH to bpf_perf.h for other tools to use. Signed-off-by: Song Liu Cc: Jiri Olsa Cc: Namhyung Kim Cc: Song Liu Cc: kernel-team@fb.com Link: https://lore.kernel.org/r/20210425214333.1090950-2-song@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/perf/bpf_perf.h | 31 +++++++++++++++++++++++++++++++ tools/perf/util/bpf_counter.c | 27 +++------------------------ 2 files changed, 34 insertions(+), 24 deletions(-) create mode 100644 tools/lib/perf/include/perf/bpf_perf.h (limited to 'tools') diff --git a/tools/lib/perf/include/perf/bpf_perf.h b/tools/lib/perf/include/perf/bpf_perf.h new file mode 100644 index 000000000000..e7cf6ba7b674 --- /dev/null +++ b/tools/lib/perf/include/perf/bpf_perf.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __LIBPERF_BPF_PERF_H +#define __LIBPERF_BPF_PERF_H + +#include /* for __u32 */ + +/* + * bpf_perf uses a hashmap, the attr_map, to track all the leader programs. + * The hashmap is pinned in bpffs. flock() on this file is used to ensure + * no concurrent access to the attr_map. The key of attr_map is struct + * perf_event_attr, and the value is struct perf_event_attr_map_entry. + * + * struct perf_event_attr_map_entry contains two __u32 IDs, bpf_link of the + * leader prog, and the diff_map. Each perf-stat session holds a reference + * to the bpf_link to make sure the leader prog is attached to sched_switch + * tracepoint. + * + * Since the hashmap only contains IDs of the bpf_link and diff_map, it + * does not hold any references to the leader program. Once all perf-stat + * sessions of these events exit, the leader prog, its maps, and the + * perf_events will be freed. + */ +struct perf_event_attr_map_entry { + __u32 link_id; + __u32 diff_map_id; +}; + +/* default attr_map name */ +#define BPF_PERF_DEFAULT_ATTR_MAP_PATH "perf_attr_map" + +#endif /* __LIBPERF_BPF_PERF_H */ diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index 81d1df3c4ec0..be484ddbbd5b 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "bpf_counter.h" #include "counts.h" @@ -29,28 +30,6 @@ #include "bpf_skel/bperf_leader.skel.h" #include "bpf_skel/bperf_follower.skel.h" -/* - * bperf uses a hashmap, the attr_map, to track all the leader programs. - * The hashmap is pinned in bpffs. flock() on this file is used to ensure - * no concurrent access to the attr_map. The key of attr_map is struct - * perf_event_attr, and the value is struct perf_event_attr_map_entry. - * - * struct perf_event_attr_map_entry contains two __u32 IDs, bpf_link of the - * leader prog, and the diff_map. Each perf-stat session holds a reference - * to the bpf_link to make sure the leader prog is attached to sched_switch - * tracepoint. - * - * Since the hashmap only contains IDs of the bpf_link and diff_map, it - * does not hold any references to the leader program. Once all perf-stat - * sessions of these events exit, the leader prog, its maps, and the - * perf_events will be freed. - */ -struct perf_event_attr_map_entry { - __u32 link_id; - __u32 diff_map_id; -}; - -#define DEFAULT_ATTR_MAP_PATH "fs/bpf/perf_attr_map" #define ATTR_MAP_SIZE 16 static inline void *u64_to_ptr(__u64 ptr) @@ -341,8 +320,8 @@ static int bperf_lock_attr_map(struct target *target) if (target->attr_map) { scnprintf(path, PATH_MAX, "%s", target->attr_map); } else { - scnprintf(path, PATH_MAX, "%s/%s", sysfs__mountpoint(), - DEFAULT_ATTR_MAP_PATH); + scnprintf(path, PATH_MAX, "%s/fs/bpf/%s", sysfs__mountpoint(), + BPF_PERF_DEFAULT_ATTR_MAP_PATH); } if (access(path, F_OK)) { -- cgit v1.2.3 From fe3dd8263b9f3912a0f3a2f66c0fdb3987d69a1a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sun, 25 Apr 2021 14:43:30 -0700 Subject: perf bpf: check perf_attr_map is compatible with the perf binary perf_attr_map could be shared among different version of perf binary. Add bperf_attr_map_compatible() to check whether the existing attr_map is compatible with current perf binary. Signed-off-by: Song Liu Cc: Jiri Olsa Cc: Namhyung Kim Cc: Song Liu Cc: kernel-team@fb.com Link: https://lore.kernel.org/r/20210425214333.1090950-3-song@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index be484ddbbd5b..5de991ab46af 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -312,6 +312,20 @@ static __u32 bpf_map_get_id(int fd) return map_info.id; } +static bool bperf_attr_map_compatible(int attr_map_fd) +{ + struct bpf_map_info map_info = {0}; + __u32 map_info_len = sizeof(map_info); + int err; + + err = bpf_obj_get_info_by_fd(attr_map_fd, &map_info, &map_info_len); + + if (err) + return false; + return (map_info.key_size == sizeof(struct perf_event_attr)) && + (map_info.value_size == sizeof(struct perf_event_attr_map_entry)); +} + static int bperf_lock_attr_map(struct target *target) { char path[PATH_MAX]; @@ -346,6 +360,11 @@ static int bperf_lock_attr_map(struct target *target) return -1; } + if (!bperf_attr_map_compatible(map_fd)) { + close(map_fd); + return -1; + + } err = flock(map_fd, LOCK_EX); if (err) { close(map_fd); -- cgit v1.2.3 From 112cb56164bc2108a55aee785d841a35aab0616a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sun, 25 Apr 2021 14:43:31 -0700 Subject: perf stat: Introduce config stat.bpf-counter-events Currently, to use BPF to aggregate perf event counters, the user uses --bpf-counters option. Enable "use bpf by default" events with a config option, stat.bpf-counter-events. Events with name in the option will use BPF. This also enables mixed BPF event and regular event in the same sesssion. For example: perf config stat.bpf-counter-events=instructions perf stat -e instructions,cs The second command will use BPF for "instructions" but not "cs". Signed-off-by: Song Liu Cc: Jiri Olsa Cc: Namhyung Kim Cc: Song Liu Link: https://lore.kernel.org/r/20210425214333.1090950-4-song@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 2 ++ tools/perf/builtin-stat.c | 42 ++++++++++++++++++++-------------- tools/perf/util/bpf_counter.c | 3 ++- tools/perf/util/config.c | 4 ++++ tools/perf/util/evsel.c | 22 ++++++++++++++++++ tools/perf/util/evsel.h | 8 +++++++ tools/perf/util/target.h | 5 ---- 7 files changed, 63 insertions(+), 23 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 6ec5960b08c3..f10e24da23e9 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -97,6 +97,8 @@ report:: Use BPF programs to aggregate readings from perf_events. This allows multiple perf-stat sessions that are counting the same metric (cycles, instructions, etc.) to share hardware counters. + To use BPF programs on common events by default, use + "perf config stat.bpf-counter-events=". --bpf-attr-map:: With option "--bpf-counters", different perf-stat sessions share diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index ba5b31aab86b..5ec2190e45cd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -161,6 +161,7 @@ static const char *smi_cost_attrs = { }; static struct evlist *evsel_list; +static bool all_counters_use_bpf = true; static struct target target = { .uid = UINT_MAX, @@ -401,6 +402,9 @@ static int read_affinity_counters(struct timespec *rs) struct affinity affinity; int i, ncpus, cpu; + if (all_counters_use_bpf) + return 0; + if (affinity__setup(&affinity) < 0) return -1; @@ -415,6 +419,8 @@ static int read_affinity_counters(struct timespec *rs) evlist__for_each_entry(evsel_list, counter) { if (evsel__cpu_iter_skip(counter, cpu)) continue; + if (evsel__is_bpf(counter)) + continue; if (!counter->err) { counter->err = read_counter_cpu(counter, rs, counter->cpu_iter - 1); @@ -431,6 +437,9 @@ static int read_bpf_map_counters(void) int err; evlist__for_each_entry(evsel_list, counter) { + if (!evsel__is_bpf(counter)) + continue; + err = bpf_counter__read(counter); if (err) return err; @@ -441,14 +450,10 @@ static int read_bpf_map_counters(void) static void read_counters(struct timespec *rs) { struct evsel *counter; - int err; if (!stat_config.stop_read_counter) { - if (target__has_bpf(&target)) - err = read_bpf_map_counters(); - else - err = read_affinity_counters(rs); - if (err < 0) + if (read_bpf_map_counters() || + read_affinity_counters(rs)) return; } @@ -537,12 +542,13 @@ static int enable_counters(void) struct evsel *evsel; int err; - if (target__has_bpf(&target)) { - evlist__for_each_entry(evsel_list, evsel) { - err = bpf_counter__enable(evsel); - if (err) - return err; - } + evlist__for_each_entry(evsel_list, evsel) { + if (!evsel__is_bpf(evsel)) + continue; + + err = bpf_counter__enable(evsel); + if (err) + return err; } if (stat_config.initial_delay < 0) { @@ -786,11 +792,11 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) if (affinity__setup(&affinity) < 0) return -1; - if (target__has_bpf(&target)) { - evlist__for_each_entry(evsel_list, counter) { - if (bpf_counter__load(counter, &target)) - return -1; - } + evlist__for_each_entry(evsel_list, counter) { + if (bpf_counter__load(counter, &target)) + return -1; + if (!evsel__is_bpf(counter)) + all_counters_use_bpf = false; } evlist__for_each_cpu (evsel_list, i, cpu) { @@ -807,6 +813,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) continue; if (counter->reset_group || counter->errored) continue; + if (evsel__is_bpf(counter)) + continue; try_again: if (create_perf_stat_counter(counter, &stat_config, &target, counter->cpu_iter - 1) < 0) { diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index 5de991ab46af..33b1888103df 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -790,7 +790,8 @@ int bpf_counter__load(struct evsel *evsel, struct target *target) { if (target->bpf_str) evsel->bpf_counter_ops = &bpf_program_profiler_ops; - else if (target->use_bpf) + else if (target->use_bpf || + evsel__match_bpf_counter_events(evsel->name)) evsel->bpf_counter_ops = &bperf_ops; if (evsel->bpf_counter_ops) diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 6bcb5ef221f8..63d472b336de 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -18,6 +18,7 @@ #include "util/hist.h" /* perf_hist_config */ #include "util/llvm-utils.h" /* perf_llvm_config */ #include "util/stat.h" /* perf_stat__set_big_num */ +#include "util/evsel.h" /* evsel__hw_names, evsel__use_bpf_counters */ #include "build-id.h" #include "debug.h" #include "config.h" @@ -460,6 +461,9 @@ static int perf_stat_config(const char *var, const char *value) if (!strcmp(var, "stat.no-csv-summary")) perf_stat__set_no_csv_summary(perf_config_bool(var, value)); + if (!strcmp(var, "stat.bpf-counter-events")) + evsel__bpf_counter_events = strdup(value); + /* Add other config variables here. */ return 0; } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 2d2614eeaa20..080ddcfefbcd 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -492,6 +492,28 @@ const char *evsel__hw_names[PERF_COUNT_HW_MAX] = { "ref-cycles", }; +char *evsel__bpf_counter_events; + +bool evsel__match_bpf_counter_events(const char *name) +{ + int name_len; + bool match; + char *ptr; + + if (!evsel__bpf_counter_events) + return false; + + ptr = strstr(evsel__bpf_counter_events, name); + name_len = strlen(name); + + /* check name matches a full token in evsel__bpf_counter_events */ + match = (ptr != NULL) && + ((ptr == evsel__bpf_counter_events) || (*(ptr - 1) == ',')) && + ((*(ptr + name_len) == ',') || (*(ptr + name_len) == '\0')); + + return match; +} + static const char *__evsel__hw_name(u64 config) { if (config < PERF_COUNT_HW_MAX && evsel__hw_names[config]) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index eccc4fd5b3eb..ce4b629d659c 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -239,6 +239,11 @@ void evsel__calc_id_pos(struct evsel *evsel); bool evsel__is_cache_op_valid(u8 type, u8 op); +static inline bool evsel__is_bpf(struct evsel *evsel) +{ + return evsel->bpf_counter_ops != NULL; +} + #define EVSEL__MAX_ALIASES 8 extern const char *evsel__hw_cache[PERF_COUNT_HW_CACHE_MAX][EVSEL__MAX_ALIASES]; @@ -246,6 +251,9 @@ extern const char *evsel__hw_cache_op[PERF_COUNT_HW_CACHE_OP_MAX][EVSEL__MAX_ALI extern const char *evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX][EVSEL__MAX_ALIASES]; extern const char *evsel__hw_names[PERF_COUNT_HW_MAX]; extern const char *evsel__sw_names[PERF_COUNT_SW_MAX]; +extern char *evsel__bpf_counter_events; +bool evsel__match_bpf_counter_events(const char *name); + int __evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size); const char *evsel__name(struct evsel *evsel); diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h index 1bce3eb28ef2..4ff56217f2a6 100644 --- a/tools/perf/util/target.h +++ b/tools/perf/util/target.h @@ -66,11 +66,6 @@ static inline bool target__has_cpu(struct target *target) return target->system_wide || target->cpu_list; } -static inline bool target__has_bpf(struct target *target) -{ - return target->bpf_str || target->use_bpf; -} - static inline bool target__none(struct target *target) { return !target__has_task(target) && !target__has_cpu(target); -- cgit v1.2.3 From 01bd8efcec444468db0275bbd71b49927f7e1544 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sun, 25 Apr 2021 14:43:32 -0700 Subject: perf stat: Introduce ':b' modifier Introduce 'b' modifier to event parser, which means use BPF program to manage this event. This is the same as --bpf-counters option, but only applies to this event. For example, perf stat -e cycles:b,cs # use bpf for cycles, but not cs perf stat -e cycles,cs --bpf-counters # use bpf for both cycles and cs Suggested-by: Jiri Olsa Signed-off-by: Song Liu Cc: Namhyung Kim Cc: Song Liu Link: https://lore.kernel.org/r/20210425214333.1090950-5-song@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter.c | 2 +- tools/perf/util/evsel.h | 1 + tools/perf/util/parse-events.c | 8 +++++++- tools/perf/util/parse-events.l | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index 33b1888103df..f179f5743025 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -790,7 +790,7 @@ int bpf_counter__load(struct evsel *evsel, struct target *target) { if (target->bpf_str) evsel->bpf_counter_ops = &bpf_program_profiler_ops; - else if (target->use_bpf || + else if (target->use_bpf || evsel->bpf_counter || evsel__match_bpf_counter_events(evsel->name)) evsel->bpf_counter_ops = &bperf_ops; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index ce4b629d659c..8f66cdcb338d 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -82,6 +82,7 @@ struct evsel { bool auto_merge_stats; bool collect_stat; bool weak_group; + bool bpf_counter; int bpf_fd; struct bpf_object *bpf_obj; }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 8123d218ad17..46ebd269a98d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1804,6 +1804,7 @@ struct event_modifier { int pinned; int weak; int exclusive; + int bpf_counter; }; static int get_event_modifier(struct event_modifier *mod, char *str, @@ -1824,6 +1825,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str, int exclude = eu | ek | eh; int exclude_GH = evsel ? evsel->exclude_GH : 0; int weak = 0; + int bpf_counter = 0; memset(mod, 0, sizeof(*mod)); @@ -1867,6 +1869,8 @@ static int get_event_modifier(struct event_modifier *mod, char *str, exclusive = 1; } else if (*str == 'W') { weak = 1; + } else if (*str == 'b') { + bpf_counter = 1; } else break; @@ -1898,6 +1902,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str, mod->sample_read = sample_read; mod->pinned = pinned; mod->weak = weak; + mod->bpf_counter = bpf_counter; mod->exclusive = exclusive; return 0; @@ -1912,7 +1917,7 @@ static int check_modifier(char *str) char *p = str; /* The sizeof includes 0 byte as well. */ - if (strlen(str) > (sizeof("ukhGHpppPSDIWe") - 1)) + if (strlen(str) > (sizeof("ukhGHpppPSDIWeb") - 1)) return -1; while (*p) { @@ -1953,6 +1958,7 @@ int parse_events__modifier_event(struct list_head *list, char *str, bool add) evsel->sample_read = mod.sample_read; evsel->precise_max = mod.precise_max; evsel->weak_group = mod.weak; + evsel->bpf_counter = mod.bpf_counter; if (evsel__is_group_leader(evsel)) { evsel->core.attr.pinned = mod.pinned; diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 0b36285a9435..fb8646cc3e83 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -210,7 +210,7 @@ name_tag [\'][a-zA-Z_*?\[\]][a-zA-Z0-9_*?\-,\.\[\]:=]*[\'] name_minus [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]* drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)? /* If you add a modifier you need to update check_modifier() */ -modifier_event [ukhpPGHSDIWe]+ +modifier_event [ukhpPGHSDIWeb]+ modifier_bp [rwx]{1,3} %% -- cgit v1.2.3 From 5508c9dae2a4a111acc7472900164f556ae75346 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sun, 25 Apr 2021 14:43:33 -0700 Subject: perf stat: Introduce bpf_counter_ops->disable() Introduce bpf_counter_ops->disable(), which is used stop counting the event. Committer notes: Added a dummy bpf_counter__disable() to the python binding to avoid having 'perf test python' failing. bpf_counter isn't supported in the python binding. Signed-off-by: Song Liu Cc: Jiri Olsa Cc: Namhyung Kim Cc: Song Liu Cc: kernel-team@fb.com Link: https://lore.kernel.org/r/20210425214333.1090950-6-song@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter.c | 26 ++++++++++++++++++++++++++ tools/perf/util/bpf_counter.h | 7 +++++++ tools/perf/util/evlist.c | 4 ++++ tools/perf/util/python.c | 6 ++++++ 4 files changed, 43 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index f179f5743025..ddb52f748c8e 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -215,6 +215,17 @@ static int bpf_program_profiler__enable(struct evsel *evsel) return 0; } +static int bpf_program_profiler__disable(struct evsel *evsel) +{ + struct bpf_counter *counter; + + list_for_each_entry(counter, &evsel->bpf_counter_list, list) { + assert(counter->skel != NULL); + bpf_prog_profiler_bpf__detach(counter->skel); + } + return 0; +} + static int bpf_program_profiler__read(struct evsel *evsel) { // perf_cpu_map uses /sys/devices/system/cpu/online @@ -280,6 +291,7 @@ static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu, struct bpf_counter_ops bpf_program_profiler_ops = { .load = bpf_program_profiler__load, .enable = bpf_program_profiler__enable, + .disable = bpf_program_profiler__disable, .read = bpf_program_profiler__read, .destroy = bpf_program_profiler__destroy, .install_pe = bpf_program_profiler__install_pe, @@ -627,6 +639,12 @@ static int bperf__enable(struct evsel *evsel) return 0; } +static int bperf__disable(struct evsel *evsel) +{ + evsel->follower_skel->bss->enabled = 0; + return 0; +} + static int bperf__read(struct evsel *evsel) { struct bperf_follower_bpf *skel = evsel->follower_skel; @@ -768,6 +786,7 @@ static int bperf__destroy(struct evsel *evsel) struct bpf_counter_ops bperf_ops = { .load = bperf__load, .enable = bperf__enable, + .disable = bperf__disable, .read = bperf__read, .install_pe = bperf__install_pe, .destroy = bperf__destroy, @@ -806,6 +825,13 @@ int bpf_counter__enable(struct evsel *evsel) return evsel->bpf_counter_ops->enable(evsel); } +int bpf_counter__disable(struct evsel *evsel) +{ + if (bpf_counter_skip(evsel)) + return 0; + return evsel->bpf_counter_ops->disable(evsel); +} + int bpf_counter__read(struct evsel *evsel) { if (bpf_counter_skip(evsel)) diff --git a/tools/perf/util/bpf_counter.h b/tools/perf/util/bpf_counter.h index cb9c532e0a07..d6d907c3dcf9 100644 --- a/tools/perf/util/bpf_counter.h +++ b/tools/perf/util/bpf_counter.h @@ -18,6 +18,7 @@ typedef int (*bpf_counter_evsel_install_pe_op)(struct evsel *evsel, struct bpf_counter_ops { bpf_counter_evsel_target_op load; bpf_counter_evsel_op enable; + bpf_counter_evsel_op disable; bpf_counter_evsel_op read; bpf_counter_evsel_op destroy; bpf_counter_evsel_install_pe_op install_pe; @@ -32,6 +33,7 @@ struct bpf_counter { int bpf_counter__load(struct evsel *evsel, struct target *target); int bpf_counter__enable(struct evsel *evsel); +int bpf_counter__disable(struct evsel *evsel); int bpf_counter__read(struct evsel *evsel); void bpf_counter__destroy(struct evsel *evsel); int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd); @@ -51,6 +53,11 @@ static inline int bpf_counter__enable(struct evsel *evsel __maybe_unused) return 0; } +static inline int bpf_counter__disable(struct evsel *evsel __maybe_unused) +{ + return 0; +} + static inline int bpf_counter__read(struct evsel *evsel __maybe_unused) { return -EAGAIN; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index d29a8a118973..e71041c89010 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -17,6 +17,7 @@ #include "evsel.h" #include "debug.h" #include "units.h" +#include "bpf_counter.h" #include // page_size #include "affinity.h" #include "../perf.h" @@ -421,6 +422,9 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name) if (affinity__setup(&affinity) < 0) return; + evlist__for_each_entry(evlist, pos) + bpf_counter__disable(pos); + /* Disable 'immediate' events last */ for (imm = 0; imm <= 1; imm++) { evlist__for_each_cpu(evlist, i, cpu) { diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 278abecb5bdf..412f8e79e409 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -90,6 +90,7 @@ int metricgroup__copy_metric_events(struct evlist *evlist, struct cgroup *cgrp, */ void bpf_counter__destroy(struct evsel *evsel); int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd); +int bpf_counter__disable(struct evsel *evsel); void bpf_counter__destroy(struct evsel *evsel __maybe_unused) { @@ -100,6 +101,11 @@ int bpf_counter__install_pe(struct evsel *evsel __maybe_unused, int cpu __maybe_ return 0; } +int bpf_counter__disable(struct evsel *evsel __maybe_unused) +{ + return 0; +} + /* * Support debug printing even though util/debug.c is not linked. That means * implementing 'verbose' and 'eprintf'. -- cgit v1.2.3 From d0713d4ca3e94827de77f8758e3e8045a0d85215 Mon Sep 17 00:00:00 2001 From: Nicholas Fraser Date: Mon, 26 Apr 2021 10:47:16 -0400 Subject: perf data: Add JSON export MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a feature to export perf data to JSON. The resolved symbols are exported into the JSON so that external tools don't need to load the dsos themselves (or even have access to them at all.) This makes it easy to load and analyze perf data with standalone tools where direct perf or libbabeltrace integration is impractical. The exporter uses a minimal inline JSON encoding without any external dependencies. Currently it only outputs some headers and sample metadata but it's easily extensible. Use it like this: $ perf data convert --to-json out.json Committer notes: Fixup a __printf() bug that broke the build: util/data-convert-json.c:103:11: error: expected ‘)’ before numeric constant 103 | __(printf, 5, 6) | ^~ | ) util/data-convert-json.c: In function ‘output_sample_callchain_entry’: util/data-convert-json.c:124:2: error: implicit declaration of function ‘output_json_key_format’; did you mean ‘output_json_format’? [-Werror=implicit-function-declaration] 124 | output_json_key_format(out, false, 5, "ip", "\"0x%" PRIx64 "\"", ip); | ^~~~~~~~~~~~~~~~~~~~~~ | output_json_format Also had to add this patch to fix errors reported by various versions of clang: - if (al && al->sym && al->sym->name && strlen(al->sym->name) > 0) { + if (al && al->sym && al->sym->namelen) { al->sym->name is a zero sized array, to avoid one extra alloc in the symbol__new() constructor, sym->namelen carries its strlen. Committer testing: $ ls -la out.json ls: cannot access 'out.json': No such file or directory $ perf record sleep 0.1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.001 MB perf.data (8 samples) ] $ perf report --stats | grep -w SAMPLE SAMPLE events: 8 $ perf data convert --to-json out.json [ perf data convert: Converted 'perf.data' into JSON data 'out.json' ] [ perf data convert: Converted and wrote 0.002 MB (8 samples) ] $ ls -la out.json -rw-rw-r--. 1 acme acme 2017 Apr 26 17:29 out.json $ cat out.json { "linux-perf-json-version": 1, "headers": { "header-version": 1, "captured-on": "2021-04-26T20:28:57Z", "data-offset": 432, "data-size": 1016, "feat-offset": 1448, "hostname": "five", "os-release": "5.11.14-200.fc33.x86_64", "arch": "x86_64", "cpu-desc": "AMD Ryzen 9 3900X 12-Core Processor", "cpuid": "AuthenticAMD,23,113,0", "nrcpus-online": 24, "nrcpus-avail": 24, "perf-version": "5.12.gee134f3189bd", "cmdline": [ "/home/acme/bin/perf", "record", "sleep", "0.1" ] }, "samples": [ { "timestamp": 170517539043684, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0xffffffffa6268827" } ] }, { "timestamp": 170517539048443, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0xffffffffa661359d" } ] }, { "timestamp": 170517539051018, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0xffffffffa6311e18" } ] }, { "timestamp": 170517539053652, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0x7fdb77b4812b", "symbol": "_dl_start", "dso": "ld-2.32.so" } ] }, { "timestamp": 170517539055306, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0xffffffffa6269286" } ] }, { "timestamp": 170517539057590, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0xffffffffa62abd8b" } ] }, { "timestamp": 170517539067559, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0x7fdb77b5e9e9", "symbol": "__GI___tunables_init", "dso": "ld-2.32.so" } ] }, { "timestamp": 170517539282452, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0x7fdb779978d2", "symbol": "getenv", "dso": "libc-2.32.so" } ] } ] } $ Signed-off-by: Nicholas Fraser Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Changbin Du Cc: Ian Rogers Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Tan Xiaojun Cc: Ulrich Czekalla Link: http://lore.kernel.org/lkml/3884969f-804d-2f53-c648-e2b0bd85edff@codeweavers.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-data.txt | 5 +- tools/perf/builtin-data.c | 26 ++- tools/perf/util/Build | 1 + tools/perf/util/data-convert-bt.c | 2 +- tools/perf/util/data-convert-bt.h | 11 - tools/perf/util/data-convert-json.c | 384 +++++++++++++++++++++++++++++++++ tools/perf/util/data-convert.h | 10 + 7 files changed, 418 insertions(+), 21 deletions(-) delete mode 100644 tools/perf/util/data-convert-bt.h create mode 100644 tools/perf/util/data-convert-json.c (limited to 'tools') diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt index 726b9bc9e1a7..417bf17e265c 100644 --- a/tools/perf/Documentation/perf-data.txt +++ b/tools/perf/Documentation/perf-data.txt @@ -17,7 +17,7 @@ Data file related processing. COMMANDS -------- convert:: - Converts perf data file into another format (only CTF [1] format is support by now). + Converts perf data file into another format. It's possible to set data-convert debug variable to get debug messages from conversion, like: perf --debug data-convert data convert ... @@ -27,6 +27,9 @@ OPTIONS for 'convert' --to-ctf:: Triggers the CTF conversion, specify the path of CTF data directory. +--to-json:: + Triggers JSON conversion. Specify the JSON filename to output. + --tod:: Convert time to wall clock time. diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c index 8d23b8d6ee8e..15ca23675ef0 100644 --- a/tools/perf/builtin-data.c +++ b/tools/perf/builtin-data.c @@ -7,7 +7,6 @@ #include "debug.h" #include #include "data-convert.h" -#include "data-convert-bt.h" typedef int (*data_cmd_fn_t)(int argc, const char **argv); @@ -55,7 +54,8 @@ static const char * const data_convert_usage[] = { static int cmd_data_convert(int argc, const char **argv) { - const char *to_ctf = NULL; + const char *to_json = NULL; + const char *to_ctf = NULL; struct perf_data_convert_opts opts = { .force = false, .all = false, @@ -63,6 +63,7 @@ static int cmd_data_convert(int argc, const char **argv) const struct option options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose"), OPT_STRING('i', "input", &input_name, "file", "input file name"), + OPT_STRING(0, "to-json", &to_json, NULL, "Convert to JSON format"), #ifdef HAVE_LIBBABELTRACE_SUPPORT OPT_STRING(0, "to-ctf", &to_ctf, NULL, "Convert to CTF format"), OPT_BOOLEAN(0, "tod", &opts.tod, "Convert time to wall clock time"), @@ -72,11 +73,6 @@ static int cmd_data_convert(int argc, const char **argv) OPT_END() }; -#ifndef HAVE_LIBBABELTRACE_SUPPORT - pr_err("No conversion support compiled in. perf should be compiled with environment variables LIBBABELTRACE=1 and LIBBABELTRACE_DIR=/path/to/libbabeltrace/\n"); - return -1; -#endif - argc = parse_options(argc, argv, options, data_convert_usage, 0); if (argc) { @@ -84,11 +80,25 @@ static int cmd_data_convert(int argc, const char **argv) return -1; } + if (to_json && to_ctf) { + pr_err("You cannot specify both --to-ctf and --to-json.\n"); + return -1; + } + if (!to_json && !to_ctf) { + pr_err("You must specify one of --to-ctf or --to-json.\n"); + return -1; + } + + if (to_json) + return bt_convert__perf2json(input_name, to_json, &opts); + if (to_ctf) { #ifdef HAVE_LIBBABELTRACE_SUPPORT return bt_convert__perf2ctf(input_name, to_ctf, &opts); #else - pr_err("The libbabeltrace support is not compiled in.\n"); + pr_err("The libbabeltrace support is not compiled in. perf should be " + "compiled with environment variables LIBBABELTRACE=1 and " + "LIBBABELTRACE_DIR=/path/to/libbabeltrace/\n"); return -1; #endif } diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 7dd815712d60..827d216a780e 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -165,6 +165,7 @@ perf-$(CONFIG_LIBUNWIND_X86) += libunwind/x86_32.o perf-$(CONFIG_LIBUNWIND_AARCH64) += libunwind/arm64.o perf-$(CONFIG_LIBBABELTRACE) += data-convert-bt.o +perf-y += data-convert-json.o perf-y += scripting-engines/ diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index a9c375e63e73..cace349fb700 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -21,7 +21,7 @@ #include #include #include "asm/bug.h" -#include "data-convert-bt.h" +#include "data-convert.h" #include "session.h" #include "debug.h" #include "tool.h" diff --git a/tools/perf/util/data-convert-bt.h b/tools/perf/util/data-convert-bt.h deleted file mode 100644 index 821674d63c4e..000000000000 --- a/tools/perf/util/data-convert-bt.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __DATA_CONVERT_BT_H -#define __DATA_CONVERT_BT_H -#include "data-convert.h" -#ifdef HAVE_LIBBABELTRACE_SUPPORT - -int bt_convert__perf2ctf(const char *input_name, const char *to_ctf, - struct perf_data_convert_opts *opts); - -#endif /* HAVE_LIBBABELTRACE_SUPPORT */ -#endif /* __DATA_CONVERT_BT_H */ diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c new file mode 100644 index 000000000000..355cd1948bdf --- /dev/null +++ b/tools/perf/util/data-convert-json.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * JSON export. + * + * Copyright (C) 2021, CodeWeavers Inc. + */ + +#include "data-convert.h" + +#include +#include +#include +#include + +#include "linux/compiler.h" +#include "linux/err.h" +#include "util/auxtrace.h" +#include "util/debug.h" +#include "util/dso.h" +#include "util/event.h" +#include "util/evsel.h" +#include "util/evlist.h" +#include "util/header.h" +#include "util/map.h" +#include "util/session.h" +#include "util/symbol.h" +#include "util/thread.h" +#include "util/tool.h" + +struct convert_json { + struct perf_tool tool; + FILE *out; + bool first; + u64 events_count; +}; + +// Outputs a JSON-encoded string surrounded by quotes with characters escaped. +static void output_json_string(FILE *out, const char *s) +{ + fputc('"', out); + while (*s) { + switch (*s) { + + // required escapes with special forms as per RFC 8259 + case '"': fputs("\\\"", out); break; + case '\\': fputs("\\\\", out); break; + case '\b': fputs("\\b", out); break; + case '\f': fputs("\\f", out); break; + case '\n': fputs("\\n", out); break; + case '\r': fputs("\\r", out); break; + case '\t': fputs("\\t", out); break; + + default: + // all other control characters must be escaped by hex code + if (*s <= 0x1f) + fprintf(out, "\\u%04x", *s); + else + fputc(*s, out); + break; + } + + ++s; + } + fputc('"', out); +} + +// Outputs an optional comma, newline and indentation to delimit a new value +// from the previous one in a JSON object or array. +static void output_json_delimiters(FILE *out, bool comma, int depth) +{ + int i; + + if (comma) + fputc(',', out); + fputc('\n', out); + for (i = 0; i < depth; ++i) + fputc('\t', out); +} + +// Outputs a printf format string (with delimiter) as a JSON value. +__printf(4, 5) +static void output_json_format(FILE *out, bool comma, int depth, const char *format, ...) +{ + va_list args; + + output_json_delimiters(out, comma, depth); + va_start(args, format); + vfprintf(out, format, args); + va_end(args); +} + +// Outputs a JSON key-value pair where the value is a string. +static void output_json_key_string(FILE *out, bool comma, int depth, + const char *key, const char *value) +{ + output_json_delimiters(out, comma, depth); + output_json_string(out, key); + fputs(": ", out); + output_json_string(out, value); +} + +// Outputs a JSON key-value pair where the value is a printf format string. +__printf(5, 6) +static void output_json_key_format(FILE *out, bool comma, int depth, + const char *key, const char *format, ...) +{ + va_list args; + + output_json_delimiters(out, comma, depth); + output_json_string(out, key); + fputs(": ", out); + va_start(args, format); + vfprintf(out, format, args); + va_end(args); +} + +static void output_sample_callchain_entry(struct perf_tool *tool, + u64 ip, struct addr_location *al) +{ + struct convert_json *c = container_of(tool, struct convert_json, tool); + FILE *out = c->out; + + output_json_format(out, false, 4, "{"); + output_json_key_format(out, false, 5, "ip", "\"0x%" PRIx64 "\"", ip); + + if (al && al->sym && al->sym->namelen) { + fputc(',', out); + output_json_key_string(out, false, 5, "symbol", al->sym->name); + + if (al->map && al->map->dso) { + const char *dso = al->map->dso->short_name; + + if (dso && strlen(dso) > 0) { + fputc(',', out); + output_json_key_string(out, false, 5, "dso", dso); + } + } + } + + output_json_format(out, false, 4, "}"); +} + +static int process_sample_event(struct perf_tool *tool, + union perf_event *event __maybe_unused, + struct perf_sample *sample, + struct evsel *evsel __maybe_unused, + struct machine *machine) +{ + struct convert_json *c = container_of(tool, struct convert_json, tool); + FILE *out = c->out; + struct addr_location al, tal; + u8 cpumode = PERF_RECORD_MISC_USER; + + if (machine__resolve(machine, &al, sample) < 0) { + pr_err("Sample resolution failed!\n"); + return -1; + } + + ++c->events_count; + + if (c->first) + c->first = false; + else + fputc(',', out); + output_json_format(out, false, 2, "{"); + + output_json_key_format(out, false, 3, "timestamp", "%" PRIi64, sample->time); + output_json_key_format(out, true, 3, "pid", "%i", al.thread->pid_); + output_json_key_format(out, true, 3, "tid", "%i", al.thread->tid); + + if (al.thread->cpu >= 0) + output_json_key_format(out, true, 3, "cpu", "%i", al.thread->cpu); + + output_json_key_string(out, true, 3, "comm", thread__comm_str(al.thread)); + + output_json_key_format(out, true, 3, "callchain", "["); + if (sample->callchain) { + unsigned int i; + bool ok; + bool first_callchain = true; + + for (i = 0; i < sample->callchain->nr; ++i) { + u64 ip = sample->callchain->ips[i]; + + if (ip >= PERF_CONTEXT_MAX) { + switch (ip) { + case PERF_CONTEXT_HV: + cpumode = PERF_RECORD_MISC_HYPERVISOR; + break; + case PERF_CONTEXT_KERNEL: + cpumode = PERF_RECORD_MISC_KERNEL; + break; + case PERF_CONTEXT_USER: + cpumode = PERF_RECORD_MISC_USER; + break; + default: + pr_debug("invalid callchain context: %" + PRId64 "\n", (s64) ip); + break; + } + continue; + } + + if (first_callchain) + first_callchain = false; + else + fputc(',', out); + + ok = thread__find_symbol(al.thread, cpumode, ip, &tal); + output_sample_callchain_entry(tool, ip, ok ? &tal : NULL); + } + } else { + output_sample_callchain_entry(tool, sample->ip, &al); + } + output_json_format(out, false, 3, "]"); + + output_json_format(out, false, 2, "}"); + return 0; +} + +static void output_headers(struct perf_session *session, struct convert_json *c) +{ + struct stat st; + struct perf_header *header = &session->header; + int ret; + int fd = perf_data__fd(session->data); + int i; + FILE *out = c->out; + + output_json_key_format(out, false, 2, "header-version", "%u", header->version); + + ret = fstat(fd, &st); + if (ret >= 0) { + time_t stctime = st.st_mtime; + char buf[256]; + + strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&stctime)); + output_json_key_string(out, true, 2, "captured-on", buf); + } else { + pr_debug("Failed to get mtime of source file, not writing captured-on"); + } + + output_json_key_format(out, true, 2, "data-offset", "%" PRIu64, header->data_offset); + output_json_key_format(out, true, 2, "data-size", "%" PRIu64, header->data_size); + output_json_key_format(out, true, 2, "feat-offset", "%" PRIu64, header->feat_offset); + + output_json_key_string(out, true, 2, "hostname", header->env.hostname); + output_json_key_string(out, true, 2, "os-release", header->env.os_release); + output_json_key_string(out, true, 2, "arch", header->env.arch); + + output_json_key_string(out, true, 2, "cpu-desc", header->env.cpu_desc); + output_json_key_string(out, true, 2, "cpuid", header->env.cpuid); + output_json_key_format(out, true, 2, "nrcpus-online", "%u", header->env.nr_cpus_online); + output_json_key_format(out, true, 2, "nrcpus-avail", "%u", header->env.nr_cpus_avail); + + if (header->env.clock.enabled) { + output_json_key_format(out, true, 2, "clockid", + "%u", header->env.clock.clockid); + output_json_key_format(out, true, 2, "clock-time", + "%" PRIu64, header->env.clock.clockid_ns); + output_json_key_format(out, true, 2, "real-time", + "%" PRIu64, header->env.clock.tod_ns); + } + + output_json_key_string(out, true, 2, "perf-version", header->env.version); + + output_json_key_format(out, true, 2, "cmdline", "["); + for (i = 0; i < header->env.nr_cmdline; i++) { + output_json_delimiters(out, i != 0, 3); + output_json_string(c->out, header->env.cmdline_argv[i]); + } + output_json_format(out, false, 2, "]"); +} + +int bt_convert__perf2json(const char *input_name, const char *output_name, + struct perf_data_convert_opts *opts __maybe_unused) +{ + struct perf_session *session; + int fd; + int ret = -1; + + struct convert_json c = { + .tool = { + .sample = process_sample_event, + .mmap = perf_event__process_mmap, + .mmap2 = perf_event__process_mmap2, + .comm = perf_event__process_comm, + .namespaces = perf_event__process_namespaces, + .cgroup = perf_event__process_cgroup, + .exit = perf_event__process_exit, + .fork = perf_event__process_fork, + .lost = perf_event__process_lost, + .tracing_data = perf_event__process_tracing_data, + .build_id = perf_event__process_build_id, + .id_index = perf_event__process_id_index, + .auxtrace_info = perf_event__process_auxtrace_info, + .auxtrace = perf_event__process_auxtrace, + .event_update = perf_event__process_event_update, + .ordered_events = true, + .ordering_requires_timestamps = true, + }, + .first = true, + .events_count = 0, + }; + + struct perf_data data = { + .mode = PERF_DATA_MODE_READ, + .path = input_name, + .force = opts->force, + }; + + if (opts->all) { + pr_err("--all is currently unsupported for JSON output.\n"); + goto err; + } + if (opts->tod) { + pr_err("--tod is currently unsupported for JSON output.\n"); + goto err; + } + + fd = open(output_name, O_CREAT | O_WRONLY | (opts->force ? O_TRUNC : O_EXCL), 0666); + if (fd == -1) { + if (errno == EEXIST) + pr_err("Output file exists. Use --force to overwrite it.\n"); + else + pr_err("Error opening output file!\n"); + goto err; + } + + c.out = fdopen(fd, "w"); + if (!c.out) { + fprintf(stderr, "Error opening output file!\n"); + close(fd); + goto err; + } + + session = perf_session__new(&data, false, &c.tool); + if (IS_ERR(session)) { + fprintf(stderr, "Error creating perf session!\n"); + goto err_fclose; + } + + if (symbol__init(&session->header.env) < 0) { + fprintf(stderr, "Symbol init error!\n"); + goto err_session_delete; + } + + // The opening brace is printed manually because it isn't delimited from a + // previous value (i.e. we don't want a leading newline) + fputc('{', c.out); + + // Version number for future-proofing. Most additions should be able to be + // done in a backwards-compatible way so this should only need to be bumped + // if some major breaking change must be made. + output_json_format(c.out, false, 1, "\"linux-perf-json-version\": 1"); + + // Output headers + output_json_format(c.out, true, 1, "\"headers\": {"); + output_headers(session, &c); + output_json_format(c.out, false, 1, "}"); + + // Output samples + output_json_format(c.out, true, 1, "\"samples\": ["); + perf_session__process_events(session); + output_json_format(c.out, false, 1, "]"); + output_json_format(c.out, false, 0, "}"); + fputc('\n', c.out); + + fprintf(stderr, + "[ perf data convert: Converted '%s' into JSON data '%s' ]\n", + data.path, output_name); + + fprintf(stderr, + "[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n", + (ftell(c.out)) / 1024.0 / 1024.0, c.events_count); + + ret = 0; +err_session_delete: + perf_session__delete(session); +err_fclose: + fclose(c.out); +err: + return ret; +} diff --git a/tools/perf/util/data-convert.h b/tools/perf/util/data-convert.h index feab5f114e37..1b4c5f598415 100644 --- a/tools/perf/util/data-convert.h +++ b/tools/perf/util/data-convert.h @@ -2,10 +2,20 @@ #ifndef __DATA_CONVERT_H #define __DATA_CONVERT_H +#include + struct perf_data_convert_opts { bool force; bool all; bool tod; }; +#ifdef HAVE_LIBBABELTRACE_SUPPORT +int bt_convert__perf2ctf(const char *input_name, const char *to_ctf, + struct perf_data_convert_opts *opts); +#endif /* HAVE_LIBBABELTRACE_SUPPORT */ + +int bt_convert__perf2json(const char *input_name, const char *to_ctf, + struct perf_data_convert_opts *opts); + #endif /* __DATA_CONVERT_H */ -- cgit v1.2.3 From bf8f8587bfb6d1315771a252a1a3be20fda1d783 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Apr 2021 18:37:12 -0700 Subject: perf top: Use evlist->events_stat to count events It's mainly to count lost events for the warning so it should be ok to use the evlist->stats instead. This is needed for changes in the next commit. Reviewed-by: Andi Kleen Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427013717.1651674-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 173ace43f845..69cb3635f5ef 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -328,13 +328,13 @@ static void perf_top__print_sym_table(struct perf_top *top) printf("%-*.*s\n", win_width, win_width, graph_dotted_line); if (!top->record_opts.overwrite && - (hists->stats.nr_lost_warned != - hists->stats.nr_events[PERF_RECORD_LOST])) { - hists->stats.nr_lost_warned = - hists->stats.nr_events[PERF_RECORD_LOST]; + (top->evlist->stats.nr_lost_warned != + top->evlist->stats.nr_events[PERF_RECORD_LOST])) { + top->evlist->stats.nr_lost_warned = + top->evlist->stats.nr_events[PERF_RECORD_LOST]; color_fprintf(stdout, PERF_COLOR_RED, "WARNING: LOST %d chunks, Check IO/CPU overload", - hists->stats.nr_lost_warned); + top->evlist->stats.nr_lost_warned); ++printed; } @@ -852,11 +852,9 @@ static void perf_top__process_lost(struct perf_top *top, union perf_event *event, struct evsel *evsel) { - struct hists *hists = evsel__hists(evsel); - top->lost += event->lost.lost; top->lost_total += event->lost.lost; - hists->stats.total_lost += event->lost.lost; + evsel->evlist->stats.total_lost += event->lost.lost; } static void @@ -864,11 +862,9 @@ perf_top__process_lost_samples(struct perf_top *top, union perf_event *event, struct evsel *evsel) { - struct hists *hists = evsel__hists(evsel); - top->lost += event->lost_samples.lost; top->lost_total += event->lost_samples.lost; - hists->stats.total_lost_samples += event->lost_samples.lost; + evsel->evlist->stats.total_lost_samples += event->lost_samples.lost; } static u64 last_timestamp; @@ -1205,7 +1201,7 @@ static int deliver_event(struct ordered_events *qe, } else if (event->header.type == PERF_RECORD_LOST_SAMPLES) { perf_top__process_lost_samples(top, event, evsel); } else if (event->header.type < PERF_RECORD_MAX) { - hists__inc_nr_events(evsel__hists(evsel), event->header.type); + events_stats__inc(&session->evlist->stats, event->header.type); machine__process_event(machine, event, &sample); } else ++session->evlist->stats.nr_unknown_events; -- cgit v1.2.3 From 0f0abbace3cddc92aaed2db3783c9c501354b3be Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Apr 2021 18:37:13 -0700 Subject: perf hists: Split hists_stats from events_stats Each struct hists have events_stats but most of the fields were not used. It's to count number of samples and periods whether filtered or not. And other fields are used only by evlist. So it'd be better to split hists_stats and events_stats to reduce wasted memory in the struct hists. This makes the output of event statistics in the perf report compact by skipping 0 events in each evsel/hists. Reviewed-by: Andi Kleen Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427013717.1651674-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-report.c | 4 ++-- tools/perf/tests/hists_filter.c | 14 +++++++------- tools/perf/ui/browsers/hists.c | 17 +++++++++-------- tools/perf/util/events_stats.h | 10 +++++++--- tools/perf/util/hist.c | 20 ++++++++++++++------ tools/perf/util/hist.h | 4 ++-- 7 files changed, 42 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 524e6f0dff22..717efd78eee6 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -418,7 +418,7 @@ static int __cmd_annotate(struct perf_annotate *ann) total_nr_samples = 0; evlist__for_each_entry(session->evlist, pos) { struct hists *hists = evsel__hists(pos); - u32 nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE]; + u32 nr_samples = hists->stats.nr_samples; if (nr_samples > 0) { total_nr_samples += nr_samples; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 0d65c98794a8..b0b9b60f74e5 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -436,7 +436,7 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report { size_t ret; char unit; - unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE]; + unsigned long nr_samples = hists->stats.nr_samples; u64 nr_events = hists->stats.total_period; struct evsel *evsel = hists_to_evsel(hists); char buf[512]; @@ -464,7 +464,7 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report nr_samples += pos_hists->stats.nr_non_filtered_samples; nr_events += pos_hists->stats.total_non_filtered_period; } else { - nr_samples += pos_hists->stats.nr_events[PERF_RECORD_SAMPLE]; + nr_samples += pos_hists->stats.nr_samples; nr_events += pos_hists->stats.total_period; } } diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c index 123e07d35b55..ca6120cd1d90 100644 --- a/tools/perf/tests/hists_filter.c +++ b/tools/perf/tests/hists_filter.c @@ -150,13 +150,13 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu } TEST_ASSERT_VAL("Invalid nr samples", - hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10); + hists->stats.nr_samples == 10); TEST_ASSERT_VAL("Invalid nr hist entries", hists->nr_entries == 9); TEST_ASSERT_VAL("Invalid total period", hists->stats.total_period == 1000); TEST_ASSERT_VAL("Unmatched nr samples", - hists->stats.nr_events[PERF_RECORD_SAMPLE] == + hists->stats.nr_samples == hists->stats.nr_non_filtered_samples); TEST_ASSERT_VAL("Unmatched nr hist entries", hists->nr_entries == hists->nr_non_filtered_entries); @@ -175,7 +175,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu /* normal stats should be invariant */ TEST_ASSERT_VAL("Invalid nr samples", - hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10); + hists->stats.nr_samples == 10); TEST_ASSERT_VAL("Invalid nr hist entries", hists->nr_entries == 9); TEST_ASSERT_VAL("Invalid total period", @@ -204,7 +204,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu /* normal stats should be invariant */ TEST_ASSERT_VAL("Invalid nr samples", - hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10); + hists->stats.nr_samples == 10); TEST_ASSERT_VAL("Invalid nr hist entries", hists->nr_entries == 9); TEST_ASSERT_VAL("Invalid total period", @@ -239,7 +239,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu /* normal stats should be invariant */ TEST_ASSERT_VAL("Invalid nr samples", - hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10); + hists->stats.nr_samples == 10); TEST_ASSERT_VAL("Invalid nr hist entries", hists->nr_entries == 9); TEST_ASSERT_VAL("Invalid total period", @@ -268,7 +268,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu /* normal stats should be invariant */ TEST_ASSERT_VAL("Invalid nr samples", - hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10); + hists->stats.nr_samples == 10); TEST_ASSERT_VAL("Invalid nr hist entries", hists->nr_entries == 9); TEST_ASSERT_VAL("Invalid total period", @@ -299,7 +299,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu /* normal stats should be invariant */ TEST_ASSERT_VAL("Invalid nr samples", - hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10); + hists->stats.nr_samples == 10); TEST_ASSERT_VAL("Invalid nr hist entries", hists->nr_entries == 9); TEST_ASSERT_VAL("Invalid total period", diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index bcfd0a45953b..b72ee6822222 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -682,6 +682,7 @@ static int hist_browser__handle_hotkey(struct hist_browser *browser, bool warn_l switch (key) { case K_TIMER: { struct hist_browser_timer *hbt = browser->hbt; + struct evsel *evsel = hists_to_evsel(browser->hists); u64 nr_entries; WARN_ON_ONCE(!hbt); @@ -696,10 +697,10 @@ static int hist_browser__handle_hotkey(struct hist_browser *browser, bool warn_l ui_browser__update_nr_entries(&browser->b, nr_entries); if (warn_lost_event && - (browser->hists->stats.nr_lost_warned != - browser->hists->stats.nr_events[PERF_RECORD_LOST])) { - browser->hists->stats.nr_lost_warned = - browser->hists->stats.nr_events[PERF_RECORD_LOST]; + (evsel->evlist->stats.nr_lost_warned != + evsel->evlist->stats.nr_events[PERF_RECORD_LOST])) { + evsel->evlist->stats.nr_lost_warned = + evsel->evlist->stats.nr_events[PERF_RECORD_LOST]; ui_browser__warn_lost_events(&browser->b); } @@ -3416,7 +3417,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser, struct evsel *evsel = list_entry(entry, struct evsel, core.node); struct hists *hists = evsel__hists(evsel); bool current_entry = ui_browser__is_current_entry(browser, row); - unsigned long nr_events = hists->stats.nr_events[PERF_RECORD_SAMPLE]; + unsigned long nr_events = hists->stats.nr_samples; const char *ev_name = evsel__name(evsel); char bf[256], unit; const char *warn = " "; @@ -3432,7 +3433,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser, for_each_group_member(pos, evsel) { struct hists *pos_hists = evsel__hists(pos); - nr_events += pos_hists->stats.nr_events[PERF_RECORD_SAMPLE]; + nr_events += pos_hists->stats.nr_samples; } } @@ -3441,7 +3442,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser, unit, unit == ' ' ? "" : " ", ev_name); ui_browser__printf(browser, "%s", bf); - nr_events = hists->stats.nr_events[PERF_RECORD_LOST]; + nr_events = evsel->evlist->stats.nr_events[PERF_RECORD_LOST]; if (nr_events != 0) { menu->lost_events = true; if (!current_entry) @@ -3647,7 +3648,7 @@ static int block_hists_browser__title(struct hist_browser *browser, char *bf, { struct hists *hists = evsel__hists(browser->block_evsel); const char *evname = evsel__name(browser->block_evsel); - unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE]; + unsigned long nr_samples = hists->stats.nr_samples; int ret; ret = scnprintf(bf, size, "# Samples: %lu", nr_samples); diff --git a/tools/perf/util/events_stats.h b/tools/perf/util/events_stats.h index 631a4af2ed86..e271c8004c89 100644 --- a/tools/perf/util/events_stats.h +++ b/tools/perf/util/events_stats.h @@ -26,15 +26,12 @@ * perf_record_sample.period and stash the result in total_period. */ struct events_stats { - u64 total_period; - u64 total_non_filtered_period; u64 total_lost; u64 total_lost_samples; u64 total_aux_lost; u64 total_aux_partial; u64 total_invalid_chains; u32 nr_events[PERF_RECORD_HEADER_MAX]; - u32 nr_non_filtered_samples; u32 nr_lost_warned; u32 nr_unknown_events; u32 nr_invalid_chains; @@ -44,6 +41,13 @@ struct events_stats { u32 nr_proc_map_timeout; }; +struct hists_stats { + u64 total_period; + u64 total_non_filtered_period; + u32 nr_samples; + u32 nr_non_filtered_samples; +}; + void events_stats__inc(struct events_stats *stats, u32 type); size_t events_stats__fprintf(struct events_stats *stats, FILE *fp); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 9299ee535518..691a6a777d14 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2325,14 +2325,19 @@ void events_stats__inc(struct events_stats *stats, u32 type) ++stats->nr_events[type]; } -void hists__inc_nr_events(struct hists *hists, u32 type) +static void hists_stats__inc(struct hists_stats *stats) { - events_stats__inc(&hists->stats, type); + ++stats->nr_samples; +} + +void hists__inc_nr_events(struct hists *hists) +{ + hists_stats__inc(&hists->stats); } void hists__inc_nr_samples(struct hists *hists, bool filtered) { - events_stats__inc(&hists->stats, PERF_RECORD_SAMPLE); + hists_stats__inc(&hists->stats); if (!filtered) hists->stats.nr_non_filtered_samples++; } @@ -2677,8 +2682,11 @@ size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp) size_t ret = 0; evlist__for_each_entry(evlist, pos) { + struct hists *hists = evsel__hists(pos); + ret += fprintf(fp, "%s stats:\n", evsel__name(pos)); - ret += events_stats__fprintf(&evsel__hists(pos)->stats, fp); + ret += fprintf(fp, "%16s events: %10d\n", + "SAMPLE", hists->stats.nr_samples); } return ret; @@ -2698,7 +2706,7 @@ int __hists__scnprintf_title(struct hists *hists, char *bf, size_t size, bool sh const struct dso *dso = hists->dso_filter; struct thread *thread = hists->thread_filter; int socket_id = hists->socket_filter; - unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE]; + unsigned long nr_samples = hists->stats.nr_samples; u64 nr_events = hists->stats.total_period; struct evsel *evsel = hists_to_evsel(hists); const char *ev_name = evsel__name(evsel); @@ -2725,7 +2733,7 @@ int __hists__scnprintf_title(struct hists *hists, char *bf, size_t size, bool sh nr_samples += pos_hists->stats.nr_non_filtered_samples; nr_events += pos_hists->stats.total_non_filtered_period; } else { - nr_samples += pos_hists->stats.nr_events[PERF_RECORD_SAMPLE]; + nr_samples += pos_hists->stats.nr_samples; nr_events += pos_hists->stats.total_period; } } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index e2faa745c8d6..6b0f708f08ac 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -96,7 +96,7 @@ struct hists { const char *uid_filter_str; const char *symbol_filter_str; pthread_mutex_t lock; - struct events_stats stats; + struct hists_stats stats; u64 event_stream; u16 col_len[HISTC_NR_COLS]; bool has_callchains; @@ -196,7 +196,7 @@ struct hist_entry *hists__get_entry(struct hists *hists, int idx); u64 hists__total_period(struct hists *hists); void hists__reset_stats(struct hists *hists); void hists__inc_stats(struct hists *hists, struct hist_entry *h); -void hists__inc_nr_events(struct hists *hists, u32 type); +void hists__inc_nr_events(struct hists *hists); void hists__inc_nr_samples(struct hists *hists, bool filtered); size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, -- cgit v1.2.3 From 55f754443890043956ee81431faa3c529309ba24 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Apr 2021 18:37:14 -0700 Subject: perf report: Show event sample counts in --stat output To make the output identical with perf report -D, it needs to show per-event sample counts along with the aggregated stat at the end. Reviewed-by: Andi Kleen Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427013717.1651674-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index b0b9b60f74e5..be56f3efa413 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -708,9 +708,22 @@ static void report__output_resort(struct report *rep) ui_progress__finish(); } +static int count_sample_event(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct evsel *evsel, + struct machine *machine __maybe_unused) +{ + struct hists *hists = evsel__hists(evsel); + + hists__inc_nr_events(hists); + return 0; +} + static void stats_setup(struct report *rep) { memset(&rep->tool, 0, sizeof(rep->tool)); + rep->tool.sample = count_sample_event; rep->tool.no_warn = true; } @@ -719,6 +732,7 @@ static int stats_print(struct report *rep) struct perf_session *session = rep->session; perf_session__fprintf_nr_events(session, stdout); + perf_evlist__fprintf_nr_events(session->evlist, stdout); return 0; } -- cgit v1.2.3 From 2775de0b115a6ffab7882c45c755005ee0ac0122 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Apr 2021 18:37:15 -0700 Subject: perf report: Add --skip-empty option to suppress 0 event stat To make the output more readable, I think it's better to remove 0's in the output. Also the dummy event has no event stats so it just wasts the space. Let's use the --skip-empty option to suppress it. $ perf report --stat --skip-empty Aggregated stats: TOTAL events: 16530 MMAP events: 226 COMM events: 1596 EXIT events: 2 THROTTLE events: 121 UNTHROTTLE events: 117 FORK events: 1595 SAMPLE events: 719 MMAP2 events: 12147 CGROUP events: 2 FINISHED_ROUND events: 2 THREAD_MAP events: 1 CPU_MAP events: 1 TIME_CONV events: 1 cycles stats: SAMPLE events: 719 Reviewed-by: Andi Kleen Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427013717.1651674-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 3 +++ tools/perf/builtin-annotate.c | 4 ++-- tools/perf/builtin-report.c | 16 ++++++++++++---- tools/perf/ui/stdio/hist.c | 5 ++++- tools/perf/util/events_stats.h | 3 ++- tools/perf/util/hist.c | 6 +++++- tools/perf/util/hist.h | 3 ++- tools/perf/util/session.c | 5 +++-- tools/perf/util/session.h | 3 ++- 9 files changed, 35 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index f51f0000676e..24efc0583c93 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -571,6 +571,9 @@ include::itrace.txt[] sampled cycles 'Avg Cycles' - block average sampled cycles +--skip-empty:: + Do not print 0 results in the --stat output. + include::callchain-overhead-calculation.txt[] SEE ALSO diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 717efd78eee6..49627a7bed7c 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -404,8 +404,8 @@ static int __cmd_annotate(struct perf_annotate *ann) goto out; if (dump_trace) { - perf_session__fprintf_nr_events(session, stdout); - evlist__fprintf_nr_events(session->evlist, stdout); + perf_session__fprintf_nr_events(session, stdout, false); + evlist__fprintf_nr_events(session->evlist, stdout, false); goto out; } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index be56f3efa413..4910194acaa6 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -85,6 +85,7 @@ struct report { bool group_set; bool stitch_lbr; bool disable_order; + bool skip_empty; int max_stack; struct perf_read_values show_threads_values; struct annotation_options annotation_opts; @@ -530,6 +531,9 @@ static int evlist__tty_browse_hists(struct evlist *evlist, struct report *rep, c if (symbol_conf.event_group && !evsel__is_group_leader(pos)) continue; + if (rep->skip_empty && !hists->stats.nr_samples) + continue; + hists__fprintf_nr_sample_events(hists, rep, evname, stdout); if (rep->total_cycles_mode) { @@ -731,8 +735,8 @@ static int stats_print(struct report *rep) { struct perf_session *session = rep->session; - perf_session__fprintf_nr_events(session, stdout); - perf_evlist__fprintf_nr_events(session->evlist, stdout); + perf_session__fprintf_nr_events(session, stdout, rep->skip_empty); + evlist__fprintf_nr_events(session->evlist, stdout, rep->skip_empty); return 0; } @@ -944,8 +948,10 @@ static int __cmd_report(struct report *rep) perf_session__fprintf_dsos(session, stdout); if (dump_trace) { - perf_session__fprintf_nr_events(session, stdout); - evlist__fprintf_nr_events(session->evlist, stdout); + perf_session__fprintf_nr_events(session, stdout, + rep->skip_empty); + evlist__fprintf_nr_events(session->evlist, stdout, + rep->skip_empty); return 0; } } @@ -1313,6 +1319,8 @@ int cmd_report(int argc, const char **argv) "Sort all blocks by 'Sampled Cycles%'"), OPT_BOOLEAN(0, "disable-order", &report.disable_order, "Disable raw trace ordering"), + OPT_BOOLEAN(0, "skip-empty", &report.skip_empty, + "Do not display empty (or dummy) events in the output"), OPT_END() }; struct perf_data data = { diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 2ab2af4d4849..d9e634406175 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -897,7 +897,8 @@ out: return ret; } -size_t events_stats__fprintf(struct events_stats *stats, FILE *fp) +size_t events_stats__fprintf(struct events_stats *stats, FILE *fp, + bool skip_empty) { int i; size_t ret = 0; @@ -908,6 +909,8 @@ size_t events_stats__fprintf(struct events_stats *stats, FILE *fp) name = perf_event__name(i); if (!strcmp(name, "UNKNOWN")) continue; + if (skip_empty && !stats->nr_events[i]) + continue; ret += fprintf(fp, "%16s events: %10d\n", name, stats->nr_events[i]); } diff --git a/tools/perf/util/events_stats.h b/tools/perf/util/events_stats.h index e271c8004c89..3480bafd414b 100644 --- a/tools/perf/util/events_stats.h +++ b/tools/perf/util/events_stats.h @@ -50,6 +50,7 @@ struct hists_stats { void events_stats__inc(struct events_stats *stats, u32 type); -size_t events_stats__fprintf(struct events_stats *stats, FILE *fp); +size_t events_stats__fprintf(struct events_stats *stats, FILE *fp, + bool skip_empty); #endif /* __PERF_EVENTS_STATS_ */ diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 691a6a777d14..65fe65ba03c2 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2676,7 +2676,8 @@ void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, } } -size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp) +size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp, + bool skip_empty) { struct evsel *pos; size_t ret = 0; @@ -2684,6 +2685,9 @@ size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp) evlist__for_each_entry(evlist, pos) { struct hists *hists = evsel__hists(pos); + if (skip_empty && !hists->stats.nr_samples) + continue; + ret += fprintf(fp, "%s stats:\n", evsel__name(pos)); ret += fprintf(fp, "%16s events: %10d\n", "SAMPLE", hists->stats.nr_samples); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 6b0f708f08ac..5343b62476e6 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -202,7 +202,8 @@ void hists__inc_nr_samples(struct hists *hists, bool filtered); size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, int max_cols, float min_pcnt, FILE *fp, bool ignore_callchains); -size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp); +size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp, + bool skip_empty); void hists__filter_by_dso(struct hists *hists); void hists__filter_by_thread(struct hists *hists); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index eba3769be3f1..a6659b616e6d 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2352,7 +2352,8 @@ size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp return machines__fprintf_dsos_buildid(&session->machines, fp, skip, parm); } -size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) +size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp, + bool skip_empty) { size_t ret; const char *msg = ""; @@ -2362,7 +2363,7 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) ret = fprintf(fp, "\nAggregated stats:%s\n", msg); - ret += events_stats__fprintf(&session->evlist->stats, fp); + ret += events_stats__fprintf(&session->evlist->stats, fp, skip_empty); return ret; } diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index f76480166d38..e31ba4c92a6c 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -113,7 +113,8 @@ size_t perf_session__fprintf_dsos(struct perf_session *session, FILE *fp); size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp, bool (fn)(struct dso *dso, int parm), int parm); -size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp); +size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp, + bool skip_empty); struct evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type); -- cgit v1.2.3 From 8f08cf3330da0582e7a51bd1b999c820147e19d1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Apr 2021 18:37:16 -0700 Subject: perf report: Make --skip-empty as default so that the compact output is shown by default. Also add 'report.skip-empty' config option to override the default. Users can also use --no-skip-empty command line option to change the behavior anytime. Committer testing: $ perf report --stat Aggregated stats: TOTAL events: 19 COMM events: 2 EXIT events: 1 SAMPLE events: 8 MMAP2 events: 4 FINISHED_ROUND events: 1 THREAD_MAP events: 1 CPU_MAP events: 1 TIME_CONV events: 1 cycles:u stats: SAMPLE events: 8 $ perf config report.skip-empty=false $ perf report --stat Aggregated stats: TOTAL events: 19 MMAP events: 0 LOST events: 0 COMM events: 2 EXIT events: 1 THROTTLE events: 0 UNTHROTTLE events: 0 FORK events: 0 READ events: 0 SAMPLE events: 8 MMAP2 events: 4 AUX events: 0 ITRACE_START events: 0 LOST_SAMPLES events: 0 SWITCH events: 0 SWITCH_CPU_WIDE events: 0 NAMESPACES events: 0 KSYMBOL events: 0 BPF_EVENT events: 0 CGROUP events: 0 TEXT_POKE events: 0 ATTR events: 0 EVENT_TYPE events: 0 TRACING_DATA events: 0 BUILD_ID events: 0 FINISHED_ROUND events: 1 ID_INDEX events: 0 AUXTRACE_INFO events: 0 AUXTRACE events: 0 AUXTRACE_ERROR events: 0 THREAD_MAP events: 1 CPU_MAP events: 1 STAT_CONFIG events: 0 STAT events: 0 STAT_ROUND events: 0 EVENT_UPDATE events: 0 TIME_CONV events: 1 FEATURE events: 0 COMPRESSED events: 0 cycles:u stats: SAMPLE events: 8 $ perf config report.skip-empty report.skip-empty=false $ Reviewed-by: Andi Kleen Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427013717.1651674-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-config.txt | 5 +++++ tools/perf/builtin-report.c | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 154a1ced72b2..b0872c801866 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -123,6 +123,7 @@ Given a $HOME/.perfconfig like this: queue-size = 0 children = true group = true + skip-empty = true [llvm] dump-obj = true @@ -531,6 +532,10 @@ report.*:: 0.07% 0.00% noploop ld-2.15.so [.] strcmp 0.03% 0.00% noploop [kernel.kallsyms] [k] timerqueue_del + report.skip-empty:: + This option can change default stat behavior with empty results. + If it's set true, 'perf report --stat' will not show 0 stats. + top.*:: top.children:: Same as 'report.children'. So if it is enabled, the output of 'top' diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 4910194acaa6..36f9ccfeb38a 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -136,6 +136,11 @@ static int report__config(const char *var, const char *value, void *cb) return 0; } + if (!strcmp(var, "report.skip-empty")) { + rep->skip_empty = perf_config_bool(var, value); + return 0; + } + return 0; } @@ -1160,6 +1165,7 @@ int cmd_report(int argc, const char **argv) .pretty_printing_style = "normal", .socket_filter = -1, .annotation_opts = annotation__default_options, + .skip_empty = true, }; const struct option options[] = { OPT_STRING('i', "input", &input_name, "file", -- cgit v1.2.3 From 462f57dbf9fa1fdcdeae2e0b19a667f7f9989bdb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 26 Apr 2021 18:37:17 -0700 Subject: perf report: Print percentage of each event statistics It's sometimes useful to see how many samples vs other events in the data file with percent values. $ perf report --stat Aggregated stats: TOTAL events: 20064 MMAP events: 239 ( 1.2%) COMM events: 1518 ( 7.6%) EXIT events: 1 ( 0.0%) FORK events: 1517 ( 7.6%) SAMPLE events: 4015 (20.0%) MMAP2 events: 12769 (63.6%) FINISHED_ROUND events: 2 ( 0.0%) THREAD_MAP events: 1 ( 0.0%) CPU_MAP events: 1 ( 0.0%) TIME_CONV events: 1 ( 0.0%) cycles stats: SAMPLE events: 2475 instructions stats: SAMPLE events: 1540 Suggested-by: Andi Kleen Reviewed-by: Andi Kleen Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427013717.1651674-7-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/stdio/hist.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index d9e634406175..f36270485168 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -902,6 +902,7 @@ size_t events_stats__fprintf(struct events_stats *stats, FILE *fp, { int i; size_t ret = 0; + u32 total = stats->nr_events[0]; for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) { const char *name; @@ -912,7 +913,14 @@ size_t events_stats__fprintf(struct events_stats *stats, FILE *fp, if (skip_empty && !stats->nr_events[i]) continue; - ret += fprintf(fp, "%16s events: %10d\n", name, stats->nr_events[i]); + if (i && total) { + ret += fprintf(fp, "%16s events: %10d (%4.1f%%)\n", + name, stats->nr_events[i], + 100.0 * stats->nr_events[i] / total); + } else { + ret += fprintf(fp, "%16s events: %10d\n", + name, stats->nr_events[i]); + } } return ret; -- cgit v1.2.3 From 412736119116d0161688e9061485fbc3e25f78d5 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:14 +0800 Subject: tools headers uapi: Update tools's copy of linux/perf_event.h To get the changes in: Liang Kan's patch 55bcf6ef314ae8ba ("perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE") Kan's patch is in the tip/perf/core branch. So the next perf tool patches need this interface for hybrid support. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-2-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index ad15e40d7f5d..14332f4cf816 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -37,6 +37,21 @@ enum perf_type_id { PERF_TYPE_MAX, /* non-ABI */ }; +/* + * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA + * AA: hardware event ID + * EEEEEEEE: PMU type ID + * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB + * BB: hardware cache ID + * CC: hardware cache op ID + * DD: hardware cache op result ID + * EEEEEEEE: PMU type ID + * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. + */ +#define PERF_PMU_TYPE_SHIFT 32 +#define PERF_HW_EVENT_MASK 0xffffffff + /* * Generalized performance event event_id types, used by the * attr.event_id parameter of the sys_perf_event_open() -- cgit v1.2.3 From 6b64833b9e49fda28b0eb94d865c334b37b4662f Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:15 +0800 Subject: perf jevents: Support unit value "cpu_core" and "cpu_atom" For some Intel platforms, such as Alderlake, which is a hybrid platform and it consists of atom cpu and core cpu. Each cpu has dedicated event list. Part of events are available on core cpu, part of events are available on atom cpu. The kernel exports new cpu pmus: cpu_core and cpu_atom. The event in json is added with a new field "Unit" to indicate which pmu the event is available on. For example, one event in cache.json, { "BriefDescription": "Counts the number of load ops retired that", "CollectPEBSRecord": "2", "Counter": "0,1,2,3", "EventCode": "0xd2", "EventName": "MEM_LOAD_UOPS_RETIRED_MISC.MMIO", "PEBScounters": "0,1,2,3", "SampleAfterValue": "1000003", "UMask": "0x80", "Unit": "cpu_atom" }, The unit "cpu_atom" indicates this event is only available on "cpu_atom". In generated pmu-events.c, we can see: { .name = "mem_load_uops_retired_misc.mmio", .event = "period=1000003,umask=0x80,event=0xd2", .desc = "Counts the number of load ops retired that. Unit: cpu_atom ", .topic = "cache", .pmu = "cpu_atom", }, But if without this patch, the "uncore_" prefix is added before "cpu_atom", such as: .pmu = "uncore_cpu_atom" That would be a wrong pmu. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-3-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index 33aa3c885eaf..ed4f0bd72e5a 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -285,6 +285,8 @@ static struct map { { "imx8_ddr", "imx8_ddr" }, { "L3PMC", "amd_l3" }, { "DFPMC", "amd_df" }, + { "cpu_core", "cpu_core" }, + { "cpu_atom", "cpu_atom" }, {} }; -- cgit v1.2.3 From eab35953e67b48c763fbb0e0ffc64dd3152361ea Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:16 +0800 Subject: perf pmu: Simplify arguments of __perf_pmu__new_alias Simplify the arguments of __perf_pmu__new_alias() by passing the whole 'struct pme_event' pointer. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-4-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 286d5e415bdc..8214def7b0f0 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -306,18 +306,25 @@ static bool perf_pmu_merge_alias(struct perf_pmu_alias *newalias, } static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name, - char *desc, char *val, - char *long_desc, char *topic, - char *unit, char *perpkg, - char *metric_expr, - char *metric_name, - char *deprecated) + char *desc, char *val, struct pmu_event *pe) { struct parse_events_term *term; struct perf_pmu_alias *alias; int ret; int num; char newval[256]; + char *long_desc = NULL, *topic = NULL, *unit = NULL, *perpkg = NULL, + *metric_expr = NULL, *metric_name = NULL, *deprecated = NULL; + + if (pe) { + long_desc = (char *)pe->long_desc; + topic = (char *)pe->topic; + unit = (char *)pe->unit; + perpkg = (char *)pe->perpkg; + metric_expr = (char *)pe->metric_expr; + metric_name = (char *)pe->metric_name; + deprecated = (char *)pe->deprecated; + } alias = malloc(sizeof(*alias)); if (!alias) @@ -406,8 +413,7 @@ static int perf_pmu__new_alias(struct list_head *list, char *dir, char *name, FI /* Remove trailing newline from sysfs file */ strim(buf); - return __perf_pmu__new_alias(list, dir, name, NULL, buf, NULL, NULL, NULL, - NULL, NULL, NULL, NULL); + return __perf_pmu__new_alias(list, dir, name, NULL, buf, NULL); } static inline bool pmu_alias_info_file(char *name) @@ -798,11 +804,7 @@ new_alias: /* need type casts to override 'const' */ __perf_pmu__new_alias(head, NULL, (char *)pe->name, (char *)pe->desc, (char *)pe->event, - (char *)pe->long_desc, (char *)pe->topic, - (char *)pe->unit, (char *)pe->perpkg, - (char *)pe->metric_expr, - (char *)pe->metric_name, - (char *)pe->deprecated); + pe); } } @@ -869,13 +871,7 @@ static int pmu_add_sys_aliases_iter_fn(struct pmu_event *pe, void *data) (char *)pe->name, (char *)pe->desc, (char *)pe->event, - (char *)pe->long_desc, - (char *)pe->topic, - (char *)pe->unit, - (char *)pe->perpkg, - (char *)pe->metric_expr, - (char *)pe->metric_name, - (char *)pe->deprecated); + pe); } return 0; -- cgit v1.2.3 From 32705de7d45d0ed989517a63454c2b3e5e5ea267 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:17 +0800 Subject: perf pmu: Save pmu name On hybrid platform, one event is available on one pmu (such as, available on cpu_core or on cpu_atom). This patch saves the pmu name to the pmu field of struct perf_pmu_alias. Then next we can know the pmu which the event can be enabled on. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-5-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 10 +++++++++- tools/perf/util/pmu.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 8214def7b0f0..44225838eb03 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -283,6 +283,7 @@ void perf_pmu_free_alias(struct perf_pmu_alias *newalias) zfree(&newalias->str); zfree(&newalias->metric_expr); zfree(&newalias->metric_name); + zfree(&newalias->pmu_name); parse_events_terms__purge(&newalias->terms); free(newalias); } @@ -297,6 +298,10 @@ static bool perf_pmu_merge_alias(struct perf_pmu_alias *newalias, list_for_each_entry(a, alist, list) { if (!strcasecmp(newalias->name, a->name)) { + if (newalias->pmu_name && a->pmu_name && + !strcasecmp(newalias->pmu_name, a->pmu_name)) { + continue; + } perf_pmu_update_alias(a, newalias); perf_pmu_free_alias(newalias); return true; @@ -314,7 +319,8 @@ static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name, int num; char newval[256]; char *long_desc = NULL, *topic = NULL, *unit = NULL, *perpkg = NULL, - *metric_expr = NULL, *metric_name = NULL, *deprecated = NULL; + *metric_expr = NULL, *metric_name = NULL, *deprecated = NULL, + *pmu_name = NULL; if (pe) { long_desc = (char *)pe->long_desc; @@ -324,6 +330,7 @@ static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name, metric_expr = (char *)pe->metric_expr; metric_name = (char *)pe->metric_name; deprecated = (char *)pe->deprecated; + pmu_name = (char *)pe->pmu; } alias = malloc(sizeof(*alias)); @@ -389,6 +396,7 @@ static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name, } alias->per_pkg = perpkg && sscanf(perpkg, "%d", &num) == 1 && num == 1; alias->str = strdup(newval); + alias->pmu_name = pmu_name ? strdup(pmu_name) : NULL; if (deprecated) alias->deprecated = true; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 1f1749ba830f..4f100768c264 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -72,6 +72,7 @@ struct perf_pmu_alias { bool deprecated; char *metric_expr; char *metric_name; + char *pmu_name; }; struct perf_pmu *perf_pmu__find(const char *name); -- cgit v1.2.3 From 444624307c4e06d35de12df1cfe08a4964ac086f Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:18 +0800 Subject: perf pmu: Save detected hybrid pmus to a global pmu list We identify the cpu_core pmu and cpu_atom pmu by explicitly checking following files: For cpu_core, checks: "/sys/bus/event_source/devices/cpu_core/cpus" For cpu_atom, checks: "/sys/bus/event_source/devices/cpu_atom/cpus" If the 'cpus' file exists and it has data, the pmu exists. But in order not to hardcode the "cpu_core" and "cpu_atom", and make the code in a generic way. So if the path "/sys/bus/event_source/devices/cpu_xxx/cpus" exists, the hybrid pmu exists. All the detected hybrid pmus are linked to a global list 'perf_pmu__hybrid_pmus' and then next we just need to iterate the list to get all hybrid pmu by using perf_pmu__for_each_hybrid_pmu. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-6-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/pmu-hybrid.c | 49 ++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/pmu-hybrid.h | 18 ++++++++++++++++ tools/perf/util/pmu.c | 9 +++++++- tools/perf/util/pmu.h | 4 ++++ 5 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 tools/perf/util/pmu-hybrid.c create mode 100644 tools/perf/util/pmu-hybrid.h (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 827d216a780e..45bea9911669 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -69,6 +69,7 @@ perf-y += parse-events-bison.o perf-y += pmu.o perf-y += pmu-flex.o perf-y += pmu-bison.o +perf-y += pmu-hybrid.o perf-y += trace-event-read.o perf-y += trace-event-info.o perf-y += trace-event-scripting.o diff --git a/tools/perf/util/pmu-hybrid.c b/tools/perf/util/pmu-hybrid.c new file mode 100644 index 000000000000..8ed0e6e1776d --- /dev/null +++ b/tools/perf/util/pmu-hybrid.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fncache.h" +#include "pmu-hybrid.h" + +LIST_HEAD(perf_pmu__hybrid_pmus); + +bool perf_pmu__hybrid_mounted(const char *name) +{ + char path[PATH_MAX]; + const char *sysfs; + FILE *file; + int n, cpu; + + if (strncmp(name, "cpu_", 4)) + return false; + + sysfs = sysfs__mountpoint(); + if (!sysfs) + return false; + + snprintf(path, PATH_MAX, CPUS_TEMPLATE_CPU, sysfs, name); + if (!file_available(path)) + return false; + + file = fopen(path, "r"); + if (!file) + return false; + + n = fscanf(file, "%u", &cpu); + fclose(file); + if (n <= 0) + return false; + + return true; +} diff --git a/tools/perf/util/pmu-hybrid.h b/tools/perf/util/pmu-hybrid.h new file mode 100644 index 000000000000..35bed3714438 --- /dev/null +++ b/tools/perf/util/pmu-hybrid.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PMU_HYBRID_H +#define __PMU_HYBRID_H + +#include +#include +#include +#include +#include "pmu.h" + +extern struct list_head perf_pmu__hybrid_pmus; + +#define perf_pmu__for_each_hybrid_pmu(pmu) \ + list_for_each_entry(pmu, &perf_pmu__hybrid_pmus, hybrid_list) + +bool perf_pmu__hybrid_mounted(const char *name); + +#endif /* __PMU_HYBRID_H */ diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 44225838eb03..6e49c7b8ad71 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -25,6 +25,7 @@ #include "string2.h" #include "strbuf.h" #include "fncache.h" +#include "pmu-hybrid.h" struct perf_pmu perf_pmu__fake; @@ -613,7 +614,6 @@ static struct perf_cpu_map *__pmu_cpumask(const char *path) */ #define SYS_TEMPLATE_ID "./bus/event_source/devices/%s/identifier" #define CPUS_TEMPLATE_UNCORE "%s/bus/event_source/devices/%s/cpumask" -#define CPUS_TEMPLATE_CPU "%s/bus/event_source/devices/%s/cpus" static struct perf_cpu_map *pmu_cpumask(const char *name) { @@ -645,6 +645,9 @@ static bool pmu_is_uncore(const char *name) char path[PATH_MAX]; const char *sysfs; + if (perf_pmu__hybrid_mounted(name)) + return false; + sysfs = sysfs__mountpoint(); snprintf(path, PATH_MAX, CPUS_TEMPLATE_UNCORE, sysfs, name); return file_available(path); @@ -951,6 +954,7 @@ static struct perf_pmu *pmu_lookup(const char *name) pmu->is_uncore = pmu_is_uncore(name); if (pmu->is_uncore) pmu->id = pmu_id(name); + pmu->is_hybrid = perf_pmu__hybrid_mounted(name); pmu->max_precise = pmu_max_precise(name); pmu_add_cpu_aliases(&aliases, pmu); pmu_add_sys_aliases(&aliases, pmu); @@ -962,6 +966,9 @@ static struct perf_pmu *pmu_lookup(const char *name) list_splice(&aliases, &pmu->aliases); list_add_tail(&pmu->list, &pmus); + if (pmu->is_hybrid) + list_add_tail(&pmu->hybrid_list, &perf_pmu__hybrid_pmus); + pmu->default_config = perf_pmu__get_default_config(pmu); return pmu; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 4f100768c264..9a2f89eeab6f 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "parse-events.h" #include "pmu-events/pmu-events.h" @@ -19,6 +20,7 @@ enum { #define PERF_PMU_FORMAT_BITS 64 #define EVENT_SOURCE_DEVICE_PATH "/bus/event_source/devices/" +#define CPUS_TEMPLATE_CPU "%s/bus/event_source/devices/%s/cpus" struct perf_event_attr; @@ -34,6 +36,7 @@ struct perf_pmu { __u32 type; bool selectable; bool is_uncore; + bool is_hybrid; bool auxtrace; int max_precise; struct perf_event_attr *default_config; @@ -42,6 +45,7 @@ struct perf_pmu { struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */ struct list_head caps; /* HEAD struct perf_pmu_caps -> list */ struct list_head list; /* ELEM */ + struct list_head hybrid_list; }; extern struct perf_pmu perf_pmu__fake; -- cgit v1.2.3 From c5a26ea490a16798d973e6fa352c6b8375646bc4 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:19 +0800 Subject: perf pmu: Add hybrid helper functions The functions perf_pmu__is_hybrid and perf_pmu__find_hybrid_pmu can be used to identify the hybrid platform and return the found hybrid cpu pmu. All the detected hybrid pmus have been saved in 'perf_pmu__hybrid_pmus' list. So we just need to search this list. perf_pmu__hybrid_type_to_pmu converts the user specified string to hybrid pmu name. This is used to support the '--cputype' option in next patches. perf_pmu__has_hybrid checks the existing of hybrid pmu. Note that, we have to define it in pmu.c (make pmu-hybrid.c no more symbol dependency), otherwise perf test python would be failed. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-7-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu-hybrid.c | 40 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/pmu-hybrid.h | 4 ++++ tools/perf/util/pmu.c | 11 +++++++++++ tools/perf/util/pmu.h | 2 ++ 4 files changed, 57 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/pmu-hybrid.c b/tools/perf/util/pmu-hybrid.c index 8ed0e6e1776d..f51ccaac60ee 100644 --- a/tools/perf/util/pmu-hybrid.c +++ b/tools/perf/util/pmu-hybrid.c @@ -47,3 +47,43 @@ bool perf_pmu__hybrid_mounted(const char *name) return true; } + +struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name) +{ + struct perf_pmu *pmu; + + if (!name) + return NULL; + + perf_pmu__for_each_hybrid_pmu(pmu) { + if (!strcmp(name, pmu->name)) + return pmu; + } + + return NULL; +} + +bool perf_pmu__is_hybrid(const char *name) +{ + return perf_pmu__find_hybrid_pmu(name) != NULL; +} + +char *perf_pmu__hybrid_type_to_pmu(const char *type) +{ + char *pmu_name = NULL; + + if (asprintf(&pmu_name, "cpu_%s", type) < 0) + return NULL; + + if (perf_pmu__is_hybrid(pmu_name)) + return pmu_name; + + /* + * pmu may be not scanned, check the sysfs. + */ + if (perf_pmu__hybrid_mounted(pmu_name)) + return pmu_name; + + free(pmu_name); + return NULL; +} diff --git a/tools/perf/util/pmu-hybrid.h b/tools/perf/util/pmu-hybrid.h index 35bed3714438..d0fa7bc50a76 100644 --- a/tools/perf/util/pmu-hybrid.h +++ b/tools/perf/util/pmu-hybrid.h @@ -15,4 +15,8 @@ extern struct list_head perf_pmu__hybrid_pmus; bool perf_pmu__hybrid_mounted(const char *name); +struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name); +bool perf_pmu__is_hybrid(const char *name); +char *perf_pmu__hybrid_type_to_pmu(const char *type); + #endif /* __PMU_HYBRID_H */ diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 6e49c7b8ad71..88c8ecdc60b0 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -40,6 +40,7 @@ int perf_pmu_parse(struct list_head *list, char *name); extern FILE *perf_pmu_in; static LIST_HEAD(pmus); +static bool hybrid_scanned; /* * Parse & process all the sysfs attributes located under @@ -1861,3 +1862,13 @@ void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, "'%llx' not supported by kernel)!\n", name ?: "N/A", buf, config); } + +bool perf_pmu__has_hybrid(void) +{ + if (!hybrid_scanned) { + hybrid_scanned = true; + perf_pmu__scan(NULL); + } + + return !list_empty(&perf_pmu__hybrid_pmus); +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 9a2f89eeab6f..a790ef758171 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -132,4 +132,6 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu); void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, char *name); +bool perf_pmu__has_hybrid(void); + #endif /* __PMU_H */ -- cgit v1.2.3 From 12279429d8620fe0cb2cdc0ba68cae3cc2c826f9 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:20 +0800 Subject: perf stat: Uniquify hybrid event name It would be useful to let user know the pmu which the event belongs to. perf-stat has supported '--no-merge' option and it can print the pmu name after the event name, such as: "cycles [cpu_core]" Now this option is enabled by default for hybrid platform but change the format to: "cpu_core/cycles/" If user configs the name, we still use the user specified name. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra ink: https://lore.kernel.org/r/20210427070139.25256-8-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 4 ++++ tools/perf/util/evsel.h | 1 + tools/perf/util/parse-events.c | 3 +++ tools/perf/util/stat-display.c | 15 +++++++++++++-- 4 files changed, 21 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5ec2190e45cd..835e3696b9ce 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -69,6 +69,7 @@ #include "util/pfm.h" #include "util/bpf_counter.h" #include "util/iostat.h" +#include "util/pmu-hybrid.h" #include "asm/bug.h" #include @@ -2402,6 +2403,9 @@ int cmd_stat(int argc, const char **argv) evlist__check_cpu_maps(evsel_list); + if (perf_pmu__has_hybrid()) + stat_config.no_merge = true; + /* * Initialize thread_map with comm names, * so we could print it out on output. diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 8f66cdcb338d..ad74c83bd35b 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -116,6 +116,7 @@ struct evsel { bool merged_stat; bool reset_group; bool errored; + bool use_config_name; struct hashmap *per_pkg_mask; struct evsel *leader; struct list_head config_terms; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 46ebd269a98d..f4628c886e30 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1567,6 +1567,9 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, if (!evsel) return -ENOMEM; + if (evsel->name) + evsel->use_config_name = true; + evsel->pmu_name = name ? strdup(name) : NULL; evsel->use_uncore_alias = use_uncore_alias; evsel->percore = config_term_percore(&evsel->config_terms); diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 98664caef6af..0679129ad05e 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -18,6 +18,7 @@ #include #include "util.h" #include "iostat.h" +#include "pmu-hybrid.h" #define CNTR_NOT_SUPPORTED "" #define CNTR_NOT_COUNTED "" @@ -538,6 +539,7 @@ static void uniquify_event_name(struct evsel *counter) { char *new_name; char *config; + int ret = 0; if (counter->uniquified_name || !counter->pmu_name || !strncmp(counter->name, counter->pmu_name, @@ -552,8 +554,17 @@ static void uniquify_event_name(struct evsel *counter) counter->name = new_name; } } else { - if (asprintf(&new_name, - "%s [%s]", counter->name, counter->pmu_name) > 0) { + if (perf_pmu__has_hybrid()) { + if (!counter->use_config_name) { + ret = asprintf(&new_name, "%s/%s/", + counter->pmu_name, counter->name); + } + } else { + ret = asprintf(&new_name, "%s [%s]", + counter->name, counter->pmu_name); + } + + if (ret) { free(counter->name); counter->name = new_name; } -- cgit v1.2.3 From 9cbfa2f64c04d98ad2bbce93066e2e021d12a24b Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:21 +0800 Subject: perf parse-events: Create two hybrid hardware events Current hardware events has special perf types PERF_TYPE_HARDWARE. But it doesn't pass the PMU type in the user interface. For a hybrid system, the perf kernel doesn't know which PMU the events belong to. So now this type is extended to be PMU aware type. The PMU type ID is stored at attr.config[63:32]. PMU type ID is retrieved from sysfs. root@lkp-adl-d01:/sys/devices/cpu_atom# cat type 8 root@lkp-adl-d01:/sys/devices/cpu_core# cat type 4 When enabling a hybrid hardware event without specified pmu, such as, 'perf stat -e cycles -a', two events are created automatically. One is for atom, the other is for core. # perf stat -e cycles -a -vv -- sleep 1 Control descriptor is not initialized ------------------------------------------------------------ perf_event_attr: size 120 config 0x400000000 sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 = 3 ------------------------------------------------------------ ... ------------------------------------------------------------ perf_event_attr: size 120 config 0x400000000 sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 15 group_fd -1 flags 0x8 = 19 ------------------------------------------------------------ perf_event_attr: size 120 config 0x800000000 sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 16 group_fd -1 flags 0x8 = 20 ------------------------------------------------------------ ... ------------------------------------------------------------ perf_event_attr: size 120 config 0x800000000 sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 23 group_fd -1 flags 0x8 = 27 cycles: 0: 836272 1001525722 1001525722 cycles: 1: 628564 1001580453 1001580453 cycles: 2: 872693 1001605997 1001605997 cycles: 3: 70417 1001641369 1001641369 cycles: 4: 88593 1001726722 1001726722 cycles: 5: 470495 1001752993 1001752993 cycles: 6: 484733 1001840440 1001840440 cycles: 7: 1272477 1001593105 1001593105 cycles: 8: 209185 1001608616 1001608616 cycles: 9: 204391 1001633962 1001633962 cycles: 10: 264121 1001661745 1001661745 cycles: 11: 826104 1001689904 1001689904 cycles: 12: 89935 1001728861 1001728861 cycles: 13: 70639 1001756757 1001756757 cycles: 14: 185266 1001784810 1001784810 cycles: 15: 171094 1001825466 1001825466 cycles: 0: 129624 1001854843 1001854843 cycles: 1: 122533 1001840421 1001840421 cycles: 2: 90055 1001882506 1001882506 cycles: 3: 139607 1001896463 1001896463 cycles: 4: 141791 1001907838 1001907838 cycles: 5: 530927 1001883880 1001883880 cycles: 6: 143246 1001852529 1001852529 cycles: 7: 667769 1001872626 1001872626 cycles: 6744979 16026956922 16026956922 cycles: 1965552 8014991106 8014991106 Performance counter stats for 'system wide': 6,744,979 cpu_core/cycles/ 1,965,552 cpu_atom/cycles/ 1.001882711 seconds time elapsed 0x4 in 0x400000000 indicates the cpu_core pmu. 0x8 in 0x800000000 indicates the cpu_atom pmu. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-9-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/parse-events-hybrid.c | 100 ++++++++++++++++++++++++++++++++++ tools/perf/util/parse-events-hybrid.h | 17 ++++++ tools/perf/util/parse-events.c | 18 ++++++ tools/perf/util/parse-events.h | 5 ++ 5 files changed, 141 insertions(+) create mode 100644 tools/perf/util/parse-events-hybrid.c create mode 100644 tools/perf/util/parse-events-hybrid.h (limited to 'tools') diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 45bea9911669..34fcbf3c3bce 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -23,6 +23,7 @@ perf-y += llvm-utils.o perf-y += mmap.o perf-y += memswap.o perf-y += parse-events.o +perf-y += parse-events-hybrid.o perf-y += perf_regs.o perf-y += path.o perf-y += print_binary.o diff --git a/tools/perf/util/parse-events-hybrid.c b/tools/perf/util/parse-events-hybrid.c new file mode 100644 index 000000000000..8fd7f19a9865 --- /dev/null +++ b/tools/perf/util/parse-events-hybrid.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include "evlist.h" +#include "evsel.h" +#include "parse-events.h" +#include "parse-events-hybrid.h" +#include "debug.h" +#include "pmu.h" +#include "pmu-hybrid.h" +#include "perf.h" + +static void config_hybrid_attr(struct perf_event_attr *attr, + int type, int pmu_type) +{ + /* + * attr.config layout for type PERF_TYPE_HARDWARE and + * PERF_TYPE_HW_CACHE + * + * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA + * AA: hardware event ID + * EEEEEEEE: PMU type ID + * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB + * BB: hardware cache ID + * CC: hardware cache op ID + * DD: hardware cache op result ID + * EEEEEEEE: PMU type ID + * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. + */ + attr->type = type; + attr->config = attr->config | ((__u64)pmu_type << PERF_PMU_TYPE_SHIFT); +} + +static int create_event_hybrid(__u32 config_type, int *idx, + struct list_head *list, + struct perf_event_attr *attr, char *name, + struct list_head *config_terms, + struct perf_pmu *pmu) +{ + struct evsel *evsel; + __u32 type = attr->type; + __u64 config = attr->config; + + config_hybrid_attr(attr, config_type, pmu->type); + evsel = parse_events__add_event_hybrid(list, idx, attr, name, + pmu, config_terms); + if (evsel) + evsel->pmu_name = strdup(pmu->name); + else + return -ENOMEM; + + attr->type = type; + attr->config = config; + return 0; +} + +static int add_hw_hybrid(struct parse_events_state *parse_state, + struct list_head *list, struct perf_event_attr *attr, + char *name, struct list_head *config_terms) +{ + struct perf_pmu *pmu; + int ret; + + perf_pmu__for_each_hybrid_pmu(pmu) { + ret = create_event_hybrid(PERF_TYPE_HARDWARE, + &parse_state->idx, list, attr, name, + config_terms, pmu); + if (ret) + return ret; + } + + return 0; +} + +int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, + struct list_head *list, + struct perf_event_attr *attr, + char *name, struct list_head *config_terms, + bool *hybrid) +{ + *hybrid = false; + if (attr->type == PERF_TYPE_SOFTWARE) + return 0; + + if (!perf_pmu__has_hybrid()) + return 0; + + *hybrid = true; + if (attr->type != PERF_TYPE_RAW) { + return add_hw_hybrid(parse_state, list, attr, name, + config_terms); + } + + return -1; +} diff --git a/tools/perf/util/parse-events-hybrid.h b/tools/perf/util/parse-events-hybrid.h new file mode 100644 index 000000000000..d81a76978480 --- /dev/null +++ b/tools/perf/util/parse-events-hybrid.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_PARSE_EVENTS_HYBRID_H +#define __PERF_PARSE_EVENTS_HYBRID_H + +#include +#include +#include +#include +#include + +int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, + struct list_head *list, + struct perf_event_attr *attr, + char *name, struct list_head *config_terms, + bool *hybrid); + +#endif /* __PERF_PARSE_EVENTS_HYBRID_H */ diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f4628c886e30..9109f8b03fe7 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -37,6 +37,7 @@ #include "util/evsel_config.h" #include "util/event.h" #include "util/pfm.h" +#include "util/parse-events-hybrid.h" #include "perf.h" #define MAX_NAME_LEN 100 @@ -1419,6 +1420,8 @@ int parse_events_add_numeric(struct parse_events_state *parse_state, { struct perf_event_attr attr; LIST_HEAD(config_terms); + bool hybrid; + int ret; memset(&attr, 0, sizeof(attr)); attr.type = type; @@ -1433,6 +1436,12 @@ int parse_events_add_numeric(struct parse_events_state *parse_state, return -ENOMEM; } + ret = parse_events__add_numeric_hybrid(parse_state, list, &attr, + get_config_name(head_config), + &config_terms, &hybrid); + if (hybrid) + return ret; + return add_event(list, &parse_state->idx, &attr, get_config_name(head_config), &config_terms); } @@ -3194,3 +3203,12 @@ char *parse_events_formats_error_string(char *additional_terms) fail: return NULL; } + +struct evsel *parse_events__add_event_hybrid(struct list_head *list, int *idx, + struct perf_event_attr *attr, + char *name, struct perf_pmu *pmu, + struct list_head *config_terms) +{ + return __add_event(list, idx, attr, true, name, pmu, + config_terms, false, NULL); +} diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index e80c9b74f2f2..c4f2f96304ce 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -263,4 +263,9 @@ static inline bool is_sdt_event(char *str __maybe_unused) int perf_pmu__test_parse_init(void); +struct evsel *parse_events__add_event_hybrid(struct list_head *list, int *idx, + struct perf_event_attr *attr, + char *name, struct perf_pmu *pmu, + struct list_head *config_terms); + #endif /* __PERF_PARSE_EVENTS_H */ -- cgit v1.2.3 From 30def61f64bac5f5cfe2a3cf96bae5b889403b4c Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:22 +0800 Subject: perf parse-events: Create two hybrid cache events For cache events, they have pre-defined configs. The kernel needs to know where the cache event comes from (e.g. from cpu_core pmu or from cpu_atom pmu). But the perf type PERF_TYPE_HW_CACHE can't carry pmu information. Now the type PERF_TYPE_HW_CACHE is extended to be PMU aware type. The PMU type ID is stored at attr.config[63:32]. When enabling a hybrid cache event without specified pmu, such as, 'perf stat -e LLC-loads -a', two events are created automatically. One is for atom, the other is for core. # perf stat -e LLC-loads -a -vv -- sleep 1 Control descriptor is not initialized ------------------------------------------------------------ perf_event_attr: type 3 size 120 config 0x400000002 sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 = 3 ------------------------------------------------------------ ... ------------------------------------------------------------ perf_event_attr: type 3 size 120 config 0x400000002 sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 15 group_fd -1 flags 0x8 = 19 ------------------------------------------------------------ perf_event_attr: type 3 size 120 config 0x800000002 sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 16 group_fd -1 flags 0x8 = 20 ------------------------------------------------------------ ... ------------------------------------------------------------ perf_event_attr: type 3 size 120 config 0x800000002 sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 23 group_fd -1 flags 0x8 = 27 LLC-loads: 0: 1507 1001800280 1001800280 LLC-loads: 1: 666 1001812250 1001812250 LLC-loads: 2: 3353 1001813453 1001813453 LLC-loads: 3: 514 1001848795 1001848795 LLC-loads: 4: 627 1001952832 1001952832 LLC-loads: 5: 4399 1001451154 1001451154 LLC-loads: 6: 1240 1001481052 1001481052 LLC-loads: 7: 478 1001520348 1001520348 LLC-loads: 8: 691 1001551236 1001551236 LLC-loads: 9: 310 1001578945 1001578945 LLC-loads: 10: 1018 1001594354 1001594354 LLC-loads: 11: 3656 1001622355 1001622355 LLC-loads: 12: 882 1001661416 1001661416 LLC-loads: 13: 506 1001693963 1001693963 LLC-loads: 14: 3547 1001721013 1001721013 LLC-loads: 15: 1399 1001734818 1001734818 LLC-loads: 0: 1314 1001793826 1001793826 LLC-loads: 1: 2857 1001752764 1001752764 LLC-loads: 2: 646 1001830694 1001830694 LLC-loads: 3: 1612 1001864861 1001864861 LLC-loads: 4: 2244 1001912381 1001912381 LLC-loads: 5: 1255 1001943889 1001943889 LLC-loads: 6: 4624 1002021109 1002021109 LLC-loads: 7: 2703 1001959302 1001959302 LLC-loads: 24793 16026838264 16026838264 LLC-loads: 17255 8015078826 8015078826 Performance counter stats for 'system wide': 24,793 cpu_core/LLC-loads/ 17,255 cpu_atom/LLC-loads/ 1.001970988 seconds time elapsed 0x4 in 0x400000002 indicates the cpu_core pmu. 0x8 in 0x800000002 indicates the cpu_atom pmu. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-10-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events-hybrid.c | 23 +++++++++++++++++++++++ tools/perf/util/parse-events-hybrid.h | 5 +++++ tools/perf/util/parse-events.c | 10 +++++++++- 3 files changed, 37 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events-hybrid.c b/tools/perf/util/parse-events-hybrid.c index 8fd7f19a9865..7a7e065d2b5f 100644 --- a/tools/perf/util/parse-events-hybrid.c +++ b/tools/perf/util/parse-events-hybrid.c @@ -98,3 +98,26 @@ int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, return -1; } + +int parse_events__add_cache_hybrid(struct list_head *list, int *idx, + struct perf_event_attr *attr, char *name, + struct list_head *config_terms, + bool *hybrid) +{ + struct perf_pmu *pmu; + int ret; + + *hybrid = false; + if (!perf_pmu__has_hybrid()) + return 0; + + *hybrid = true; + perf_pmu__for_each_hybrid_pmu(pmu) { + ret = create_event_hybrid(PERF_TYPE_HW_CACHE, idx, list, + attr, name, config_terms, pmu); + if (ret) + return ret; + } + + return 0; +} diff --git a/tools/perf/util/parse-events-hybrid.h b/tools/perf/util/parse-events-hybrid.h index d81a76978480..9ad33cd0cef4 100644 --- a/tools/perf/util/parse-events-hybrid.h +++ b/tools/perf/util/parse-events-hybrid.h @@ -14,4 +14,9 @@ int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, char *name, struct list_head *config_terms, bool *hybrid); +int parse_events__add_cache_hybrid(struct list_head *list, int *idx, + struct perf_event_attr *attr, char *name, + struct list_head *config_terms, + bool *hybrid); + #endif /* __PERF_PARSE_EVENTS_HYBRID_H */ diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 9109f8b03fe7..c228bf5c7d88 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -460,7 +460,8 @@ int parse_events_add_cache(struct list_head *list, int *idx, char name[MAX_NAME_LEN], *config_name; int cache_type = -1, cache_op = -1, cache_result = -1; char *op_result[2] = { op_result1, op_result2 }; - int i, n; + int i, n, ret; + bool hybrid; /* * No fallback - if we cannot get a clear cache type @@ -520,6 +521,13 @@ int parse_events_add_cache(struct list_head *list, int *idx, if (get_config_terms(head_config, &config_terms)) return -ENOMEM; } + + ret = parse_events__add_cache_hybrid(list, idx, &attr, + config_name ? : name, &config_terms, + &hybrid); + if (hybrid) + return ret; + return add_event(list, idx, &attr, config_name ? : name, &config_terms); } -- cgit v1.2.3 From 94da591b1c7913880957c3477f6abff563783b33 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:23 +0800 Subject: perf parse-events: Create two hybrid raw events On hybrid platform, same raw event is possible to be available on both cpu_core pmu and cpu_atom pmu. It's supported to create two raw events for one event encoding. For raw events, the attr.type is PMU type. # perf stat -e r3c -a -vv -- sleep 1 Control descriptor is not initialized ------------------------------------------------------------ perf_event_attr: type 4 size 120 config 0x3c sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 = 3 ------------------------------------------------------------ ... ------------------------------------------------------------ perf_event_attr: type 4 size 120 config 0x3c sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 15 group_fd -1 flags 0x8 = 19 ------------------------------------------------------------ perf_event_attr: type 8 size 120 config 0x3c sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 16 group_fd -1 flags 0x8 = 20 ------------------------------------------------------------ ... ------------------------------------------------------------ perf_event_attr: type 8 size 120 config 0x3c sample_type IDENTIFIER read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 23 group_fd -1 flags 0x8 = 27 r3c: 0: 434449 1001412521 1001412521 r3c: 1: 173162 1001482031 1001482031 r3c: 2: 231710 1001524974 1001524974 r3c: 3: 110012 1001563523 1001563523 r3c: 4: 191517 1001593221 1001593221 r3c: 5: 956458 1001628147 1001628147 r3c: 6: 416969 1001715626 1001715626 r3c: 7: 1047527 1001596650 1001596650 r3c: 8: 103877 1001633520 1001633520 r3c: 9: 70571 1001637898 1001637898 r3c: 10: 550284 1001714398 1001714398 r3c: 11: 1257274 1001738349 1001738349 r3c: 12: 107797 1001801432 1001801432 r3c: 13: 67471 1001836281 1001836281 r3c: 14: 286782 1001923161 1001923161 r3c: 15: 815509 1001952550 1001952550 r3c: 0: 95994 1002071117 1002071117 r3c: 1: 105570 1002142438 1002142438 r3c: 2: 115921 1002189147 1002189147 r3c: 3: 72747 1002238133 1002238133 r3c: 4: 103519 1002276753 1002276753 r3c: 5: 121382 1002315131 1002315131 r3c: 6: 80298 1002248050 1002248050 r3c: 7: 466790 1002278221 1002278221 r3c: 6821369 16026754282 16026754282 r3c: 1162221 8017758990 8017758990 Performance counter stats for 'system wide': 6,821,369 cpu_core/r3c/ 1,162,221 cpu_atom/r3c/ 1.002289965 seconds time elapsed Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-11-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events-hybrid.c | 38 ++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events-hybrid.c b/tools/perf/util/parse-events-hybrid.c index 7a7e065d2b5f..e27b747080e1 100644 --- a/tools/perf/util/parse-events-hybrid.c +++ b/tools/perf/util/parse-events-hybrid.c @@ -77,6 +77,41 @@ static int add_hw_hybrid(struct parse_events_state *parse_state, return 0; } +static int create_raw_event_hybrid(int *idx, struct list_head *list, + struct perf_event_attr *attr, char *name, + struct list_head *config_terms, + struct perf_pmu *pmu) +{ + struct evsel *evsel; + + attr->type = pmu->type; + evsel = parse_events__add_event_hybrid(list, idx, attr, name, + pmu, config_terms); + if (evsel) + evsel->pmu_name = strdup(pmu->name); + else + return -ENOMEM; + + return 0; +} + +static int add_raw_hybrid(struct parse_events_state *parse_state, + struct list_head *list, struct perf_event_attr *attr, + char *name, struct list_head *config_terms) +{ + struct perf_pmu *pmu; + int ret; + + perf_pmu__for_each_hybrid_pmu(pmu) { + ret = create_raw_event_hybrid(&parse_state->idx, list, attr, + name, config_terms, pmu); + if (ret) + return ret; + } + + return 0; +} + int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, struct list_head *list, struct perf_event_attr *attr, @@ -96,7 +131,8 @@ int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, config_terms); } - return -1; + return add_raw_hybrid(parse_state, list, attr, name, + config_terms); } int parse_events__add_cache_hybrid(struct list_head *list, int *idx, -- cgit v1.2.3 From c93afadc924dbec51a38c4f6f0d07a8adfddd339 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:24 +0800 Subject: perf parse-events: Compare with hybrid pmu name On hybrid platform, user may want to enable event only on one pmu. Following syntax will be supported: cpu_core// cpu_atom// For hardware event, hardware cache event and raw event, two events are created by default. We pass the specified pmu name in parse_state and it would be checked before event creation. So next only the event with the specified pmu would be created. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-12-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events-hybrid.c | 21 ++++++++++++++++++++- tools/perf/util/parse-events-hybrid.h | 3 ++- tools/perf/util/parse-events.c | 5 +++-- tools/perf/util/parse-events.h | 4 +++- tools/perf/util/parse-events.y | 9 ++++++--- 5 files changed, 34 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events-hybrid.c b/tools/perf/util/parse-events-hybrid.c index e27b747080e1..10160ab126f9 100644 --- a/tools/perf/util/parse-events-hybrid.c +++ b/tools/perf/util/parse-events-hybrid.c @@ -59,6 +59,15 @@ static int create_event_hybrid(__u32 config_type, int *idx, return 0; } +static int pmu_cmp(struct parse_events_state *parse_state, + struct perf_pmu *pmu) +{ + if (!parse_state->hybrid_pmu_name) + return 0; + + return strcmp(parse_state->hybrid_pmu_name, pmu->name); +} + static int add_hw_hybrid(struct parse_events_state *parse_state, struct list_head *list, struct perf_event_attr *attr, char *name, struct list_head *config_terms) @@ -67,6 +76,9 @@ static int add_hw_hybrid(struct parse_events_state *parse_state, int ret; perf_pmu__for_each_hybrid_pmu(pmu) { + if (pmu_cmp(parse_state, pmu)) + continue; + ret = create_event_hybrid(PERF_TYPE_HARDWARE, &parse_state->idx, list, attr, name, config_terms, pmu); @@ -103,6 +115,9 @@ static int add_raw_hybrid(struct parse_events_state *parse_state, int ret; perf_pmu__for_each_hybrid_pmu(pmu) { + if (pmu_cmp(parse_state, pmu)) + continue; + ret = create_raw_event_hybrid(&parse_state->idx, list, attr, name, config_terms, pmu); if (ret) @@ -138,7 +153,8 @@ int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, int parse_events__add_cache_hybrid(struct list_head *list, int *idx, struct perf_event_attr *attr, char *name, struct list_head *config_terms, - bool *hybrid) + bool *hybrid, + struct parse_events_state *parse_state) { struct perf_pmu *pmu; int ret; @@ -149,6 +165,9 @@ int parse_events__add_cache_hybrid(struct list_head *list, int *idx, *hybrid = true; perf_pmu__for_each_hybrid_pmu(pmu) { + if (pmu_cmp(parse_state, pmu)) + continue; + ret = create_event_hybrid(PERF_TYPE_HW_CACHE, idx, list, attr, name, config_terms, pmu); if (ret) diff --git a/tools/perf/util/parse-events-hybrid.h b/tools/perf/util/parse-events-hybrid.h index 9ad33cd0cef4..f33bd67aa851 100644 --- a/tools/perf/util/parse-events-hybrid.h +++ b/tools/perf/util/parse-events-hybrid.h @@ -17,6 +17,7 @@ int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state, int parse_events__add_cache_hybrid(struct list_head *list, int *idx, struct perf_event_attr *attr, char *name, struct list_head *config_terms, - bool *hybrid); + bool *hybrid, + struct parse_events_state *parse_state); #endif /* __PERF_PARSE_EVENTS_HYBRID_H */ diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index c228bf5c7d88..6236513e6369 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -453,7 +453,8 @@ static int config_attr(struct perf_event_attr *attr, int parse_events_add_cache(struct list_head *list, int *idx, char *type, char *op_result1, char *op_result2, struct parse_events_error *err, - struct list_head *head_config) + struct list_head *head_config, + struct parse_events_state *parse_state) { struct perf_event_attr attr; LIST_HEAD(config_terms); @@ -524,7 +525,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, ret = parse_events__add_cache_hybrid(list, idx, &attr, config_name ? : name, &config_terms, - &hybrid); + &hybrid, parse_state); if (hybrid) return ret; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index c4f2f96304ce..bf6e41aa9b6a 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -138,6 +138,7 @@ struct parse_events_state { struct list_head *terms; int stoken; struct perf_pmu *fake_pmu; + char *hybrid_pmu_name; }; void parse_events__handle_error(struct parse_events_error *err, int idx, @@ -188,7 +189,8 @@ int parse_events_add_tool(struct parse_events_state *parse_state, int parse_events_add_cache(struct list_head *list, int *idx, char *type, char *op_result1, char *op_result2, struct parse_events_error *error, - struct list_head *head_config); + struct list_head *head_config, + struct parse_events_state *parse_state); int parse_events_add_breakpoint(struct list_head *list, int *idx, u64 addr, char *type, u64 len); int parse_events_add_pmu(struct parse_events_state *parse_state, diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index d57ac86ce7ca..aba12a4d488e 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -454,7 +454,8 @@ PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT opt_e list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_cache(list, &parse_state->idx, $1, $3, $5, error, $6); + err = parse_events_add_cache(list, &parse_state->idx, $1, $3, $5, error, $6, + parse_state); parse_events_terms__delete($6); free($1); free($3); @@ -475,7 +476,8 @@ PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT opt_event_config list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_cache(list, &parse_state->idx, $1, $3, NULL, error, $4); + err = parse_events_add_cache(list, &parse_state->idx, $1, $3, NULL, error, $4, + parse_state); parse_events_terms__delete($4); free($1); free($3); @@ -495,7 +497,8 @@ PE_NAME_CACHE_TYPE opt_event_config list = alloc_list(); ABORT_ON(!list); - err = parse_events_add_cache(list, &parse_state->idx, $1, NULL, NULL, error, $2); + err = parse_events_add_cache(list, &parse_state->idx, $1, NULL, NULL, error, $2, + parse_state); parse_events_terms__delete($2); free($1); if (err) { -- cgit v1.2.3 From 5e4edd1f73b5d59905aeb0fe43ab74301c39a5c1 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:25 +0800 Subject: perf parse-events: Support event inside hybrid pmu On hybrid platform, user may want to enable events on one pmu. Following syntax are supported: cpu_core// cpu_atom// But the syntax doesn't work for cache event. Before: # perf stat -e cpu_core/LLC-loads/ -a -- sleep 1 event syntax error: 'cpu_core/LLC-loads/' \___ unknown term 'LLC-loads' for pmu 'cpu_core' Cache events are a bit complex. We can't create aliases for them. We use another solution. For example, if we use "cpu_core/LLC-loads/", in parse_events_add_pmu(), term->config is "LLC-loads". Then we create a new parser to scan "LLC-loads". The parse_events_add_cache() would be called during parsing. The parse_state->hybrid_pmu_name is used to identify the pmu where the event should be enabled on. After: # perf stat -e cpu_core/LLC-loads/ -a -- sleep 1 Performance counter stats for 'system wide': 24,593 cpu_core/LLC-loads/ 1.003911601 seconds time elapsed Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-13-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 63 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 6236513e6369..4dad14265b81 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -38,6 +38,7 @@ #include "util/event.h" #include "util/pfm.h" #include "util/parse-events-hybrid.h" +#include "util/pmu-hybrid.h" #include "perf.h" #define MAX_NAME_LEN 100 @@ -48,6 +49,9 @@ extern int parse_events_debug; int parse_events_parse(void *parse_state, void *scanner); static int get_config_terms(struct list_head *head_config, struct list_head *head_terms __maybe_unused); +static int parse_events__with_hybrid_pmu(struct parse_events_state *parse_state, + const char *str, char *pmu_name, + struct list_head *list); static struct perf_pmu_event_symbol *perf_pmu_events_list; /* @@ -1474,6 +1478,33 @@ static bool config_term_percore(struct list_head *config_terms) return false; } +static int parse_events__inside_hybrid_pmu(struct parse_events_state *parse_state, + struct list_head *list, char *name, + struct list_head *head_config) +{ + struct parse_events_term *term; + int ret = -1; + + if (parse_state->fake_pmu || !head_config || list_empty(head_config) || + !perf_pmu__is_hybrid(name)) { + return -1; + } + + /* + * More than one term in list. + */ + if (head_config->next && head_config->next->next != head_config) + return -1; + + term = list_first_entry(head_config, struct parse_events_term, list); + if (term && term->config && strcmp(term->config, "event")) { + ret = parse_events__with_hybrid_pmu(parse_state, term->config, + name, list); + } + + return ret; +} + int parse_events_add_pmu(struct parse_events_state *parse_state, struct list_head *list, char *name, struct list_head *head_config, @@ -1567,6 +1598,11 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, if (pmu->default_config && get_config_chgs(pmu, head_config, &config_terms)) return -ENOMEM; + if (!parse_events__inside_hybrid_pmu(parse_state, list, name, + head_config)) { + return 0; + } + if (!parse_state->fake_pmu && perf_pmu__config(pmu, &attr, head_config, parse_state->error)) { struct evsel_config_term *pos, *tmp; @@ -2189,6 +2225,33 @@ int parse_events_terms(struct list_head *terms, const char *str) return ret; } +static int parse_events__with_hybrid_pmu(struct parse_events_state *parse_state, + const char *str, char *pmu_name, + struct list_head *list) +{ + struct parse_events_state ps = { + .list = LIST_HEAD_INIT(ps.list), + .stoken = PE_START_EVENTS, + .hybrid_pmu_name = pmu_name, + .idx = parse_state->idx, + }; + int ret; + + ret = parse_events__scanner(str, &ps); + perf_pmu__parse_cleanup(); + + if (!ret) { + if (!list_empty(&ps.list)) { + list_splice(&ps.list, list); + parse_state->idx = ps.idx; + return 0; + } else + return -1; + } + + return ret; +} + int __parse_events(struct evlist *evlist, const char *str, struct parse_events_error *err, struct perf_pmu *fake_pmu) { -- cgit v1.2.3 From b53a0755d5c2d19b13db897d6faf4969e03e45ae Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:26 +0800 Subject: perf record: Create two hybrid 'cycles' events by default When evlist is empty, for example no '-e' specified in perf record, one default 'cycles' event is added to evlist. While on hybrid platform, it needs to create two default 'cycles' events. One is for cpu_core, the other is for cpu_atom. This patch actually calls evsel__new_cycles() two times to create two 'cycles' events. # ./perf record -vv -a -- sleep 1 ... ------------------------------------------------------------ perf_event_attr: size 120 config 0x400000000 { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|ID|CPU|PERIOD read_format ID disabled 1 inherit 1 freq 1 precise_ip 3 sample_id_all 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 = 5 sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 = 6 sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8 = 7 sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8 = 9 sys_perf_event_open: pid -1 cpu 4 group_fd -1 flags 0x8 = 10 sys_perf_event_open: pid -1 cpu 5 group_fd -1 flags 0x8 = 11 sys_perf_event_open: pid -1 cpu 6 group_fd -1 flags 0x8 = 12 sys_perf_event_open: pid -1 cpu 7 group_fd -1 flags 0x8 = 13 sys_perf_event_open: pid -1 cpu 8 group_fd -1 flags 0x8 = 14 sys_perf_event_open: pid -1 cpu 9 group_fd -1 flags 0x8 = 15 sys_perf_event_open: pid -1 cpu 10 group_fd -1 flags 0x8 = 16 sys_perf_event_open: pid -1 cpu 11 group_fd -1 flags 0x8 = 17 sys_perf_event_open: pid -1 cpu 12 group_fd -1 flags 0x8 = 18 sys_perf_event_open: pid -1 cpu 13 group_fd -1 flags 0x8 = 19 sys_perf_event_open: pid -1 cpu 14 group_fd -1 flags 0x8 = 20 sys_perf_event_open: pid -1 cpu 15 group_fd -1 flags 0x8 = 21 ------------------------------------------------------------ perf_event_attr: size 120 config 0x800000000 { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|ID|CPU|PERIOD read_format ID disabled 1 inherit 1 freq 1 precise_ip 3 sample_id_all 1 exclude_guest 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 16 group_fd -1 flags 0x8 = 22 sys_perf_event_open: pid -1 cpu 17 group_fd -1 flags 0x8 = 23 sys_perf_event_open: pid -1 cpu 18 group_fd -1 flags 0x8 = 24 sys_perf_event_open: pid -1 cpu 19 group_fd -1 flags 0x8 = 25 sys_perf_event_open: pid -1 cpu 20 group_fd -1 flags 0x8 = 26 sys_perf_event_open: pid -1 cpu 21 group_fd -1 flags 0x8 = 27 sys_perf_event_open: pid -1 cpu 22 group_fd -1 flags 0x8 = 28 sys_perf_event_open: pid -1 cpu 23 group_fd -1 flags 0x8 = 29 ------------------------------------------------------------ We have to create evlist-hybrid.c otherwise due to the symbol dependency the perf test python would be failed. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-14-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 19 +++++++++++++++---- tools/perf/util/Build | 1 + tools/perf/util/evlist-hybrid.c | 41 +++++++++++++++++++++++++++++++++++++++++ tools/perf/util/evlist-hybrid.h | 12 ++++++++++++ tools/perf/util/evlist.c | 5 ++++- tools/perf/util/evsel.c | 6 +++--- tools/perf/util/evsel.h | 2 +- 7 files changed, 77 insertions(+), 9 deletions(-) create mode 100644 tools/perf/util/evlist-hybrid.c create mode 100644 tools/perf/util/evlist-hybrid.h (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 5fb9665a2ec2..6af46c6a4fd8 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -47,6 +47,8 @@ #include "util/util.h" #include "util/pfm.h" #include "util/clockid.h" +#include "util/pmu-hybrid.h" +#include "util/evlist-hybrid.h" #include "asm/bug.h" #include "perf.h" @@ -2790,10 +2792,19 @@ int cmd_record(int argc, const char **argv) if (record.opts.overwrite) record.opts.tail_synthesize = true; - if (rec->evlist->core.nr_entries == 0 && - __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) { - pr_err("Not enough memory for event selector list\n"); - goto out; + if (rec->evlist->core.nr_entries == 0) { + if (perf_pmu__has_hybrid()) { + err = evlist__add_default_hybrid(rec->evlist, + !record.opts.no_samples); + } else { + err = __evlist__add_default(rec->evlist, + !record.opts.no_samples); + } + + if (err < 0) { + pr_err("Not enough memory for event selector list\n"); + goto out; + } } if (rec->opts.target.tid && !rec->opts.no_inherit_set) diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 34fcbf3c3bce..8c0d9f368ebc 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -10,6 +10,7 @@ perf-y += db-export.o perf-y += env.o perf-y += event.o perf-y += evlist.o +perf-y += evlist-hybrid.o perf-y += sideband_evlist.o perf-y += evsel.o perf-y += evsel_fprintf.o diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c new file mode 100644 index 000000000000..e11998526f2e --- /dev/null +++ b/tools/perf/util/evlist-hybrid.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include "cpumap.h" +#include "evlist.h" +#include "evsel.h" +#include "../perf.h" +#include "util/pmu-hybrid.h" +#include "util/evlist-hybrid.h" +#include +#include +#include +#include +#include +#include +#include + +int evlist__add_default_hybrid(struct evlist *evlist, bool precise) +{ + struct evsel *evsel; + struct perf_pmu *pmu; + __u64 config; + struct perf_cpu_map *cpus; + + perf_pmu__for_each_hybrid_pmu(pmu) { + config = PERF_COUNT_HW_CPU_CYCLES | + ((__u64)pmu->type << PERF_PMU_TYPE_SHIFT); + evsel = evsel__new_cycles(precise, PERF_TYPE_HARDWARE, + config); + if (!evsel) + return -ENOMEM; + + cpus = perf_cpu_map__get(pmu->cpus); + evsel->core.cpus = cpus; + evsel->core.own_cpus = perf_cpu_map__get(cpus); + evsel->pmu_name = strdup(pmu->name); + evlist__add(evlist, evsel); + } + + return 0; +} diff --git a/tools/perf/util/evlist-hybrid.h b/tools/perf/util/evlist-hybrid.h new file mode 100644 index 000000000000..e25861649d8f --- /dev/null +++ b/tools/perf/util/evlist-hybrid.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_EVLIST_HYBRID_H +#define __PERF_EVLIST_HYBRID_H + +#include +#include +#include "evlist.h" +#include + +int evlist__add_default_hybrid(struct evlist *evlist, bool precise); + +#endif /* __PERF_EVLIST_HYBRID_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index e71041c89010..6e5c41528c7d 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -26,6 +26,7 @@ #include "util/string2.h" #include "util/perf_api_probe.h" #include "util/evsel_fprintf.h" +#include "util/evlist-hybrid.h" #include #include #include @@ -248,8 +249,10 @@ void evlist__set_leader(struct evlist *evlist) int __evlist__add_default(struct evlist *evlist, bool precise) { - struct evsel *evsel = evsel__new_cycles(precise); + struct evsel *evsel; + evsel = evsel__new_cycles(precise, PERF_TYPE_HARDWARE, + PERF_COUNT_HW_CPU_CYCLES); if (evsel == NULL) return -ENOMEM; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 080ddcfefbcd..0fc9555a2ac4 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -295,11 +295,11 @@ static bool perf_event_can_profile_kernel(void) return perf_event_paranoid_check(1); } -struct evsel *evsel__new_cycles(bool precise) +struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config) { struct perf_event_attr attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, + .type = type, + .config = config, .exclude_kernel = !perf_event_can_profile_kernel(), }; struct evsel *evsel; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index ad74c83bd35b..a9cc9563ac59 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -221,7 +221,7 @@ static inline struct evsel *evsel__newtp(const char *sys, const char *name) return evsel__newtp_idx(sys, name, 0); } -struct evsel *evsel__new_cycles(bool precise); +struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config); struct tep_event *event_format__new(const char *sys, const char *name); -- cgit v1.2.3 From ac2dc29edd21f9ec011863336ab1c7c9fe77a1d3 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:27 +0800 Subject: perf stat: Add default hybrid events Previously if '-e' is not specified in perf stat, some software events and hardware events are added to evlist by default. Before: # perf stat -a -- sleep 1 Performance counter stats for 'system wide': 24,044.40 msec cpu-clock # 23.946 CPUs utilized 99 context-switches # 4.117 /sec 24 cpu-migrations # 0.998 /sec 3 page-faults # 0.125 /sec 7,000,244 cycles # 0.000 GHz 2,955,024 instructions # 0.42 insn per cycle 608,941 branches # 25.326 K/sec 31,991 branch-misses # 5.25% of all branches 1.004106859 seconds time elapsed Among the events, cycles, instructions, branches and branch-misses are hardware events. One hybrid platform, two hardware events are created for one hardware event. cpu_core/cycles/, cpu_atom/cycles/, cpu_core/instructions/, cpu_atom/instructions/, cpu_core/branches/, cpu_atom/branches/, cpu_core/branch-misses/, cpu_atom/branch-misses/ These events would be added to evlist on hybrid platform. Since parse_events() has been supported to create two hardware events for one event on hybrid platform, so we just use parse_events(evlist, "cycles,instructions,branches,branch-misses") to create the default events and add them to evlist. After: # perf stat -a -- sleep 1 Performance counter stats for 'system wide': 24,043.99 msec cpu-clock # 23.991 CPUs utilized 139 context-switches # 5.781 /sec 25 cpu-migrations # 1.040 /sec 6 page-faults # 0.250 /sec 10,381,751 cpu_core/cycles/ # 431.782 K/sec 1,264,216 cpu_atom/cycles/ # 52.579 K/sec 3,406,958 cpu_core/instructions/ # 141.697 K/sec 414,588 cpu_atom/instructions/ # 17.243 K/sec 705,149 cpu_core/branches/ # 29.327 K/sec 82,358 cpu_atom/branches/ # 3.425 K/sec 40,821 cpu_core/branch-misses/ # 1.698 K/sec 9,086 cpu_atom/branch-misses/ # 377.891 /sec 1.002228863 seconds time elapsed We can see two events are created for one hardware event. One TODO is, the shadow stats looks a bit different, now it's just 'M/sec'. The perf_stat__update_shadow_stats and perf_stat__print_shadow_stats need to be improved in future if we want to get the original shadow stats. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-15-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 835e3696b9ce..42e60764ad8d 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1639,6 +1639,12 @@ static int add_default_attributes(void) { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, +}; + struct perf_event_attr default_sw_attrs[] = { + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, }; /* @@ -1876,6 +1882,28 @@ setup_metrics: } if (!evsel_list->core.nr_entries) { + if (perf_pmu__has_hybrid()) { + const char *hybrid_str = "cycles,instructions,branches,branch-misses"; + + if (target__has_cpu(&target)) + default_sw_attrs[0].config = PERF_COUNT_SW_CPU_CLOCK; + + if (evlist__add_default_attrs(evsel_list, + default_sw_attrs) < 0) { + return -1; + } + + err = parse_events(evsel_list, hybrid_str, &errinfo); + if (err) { + fprintf(stderr, + "Cannot set up hybrid events %s: %d\n", + hybrid_str, err); + parse_events_print_error(&errinfo, hybrid_str); + return -1; + } + return err; + } + if (target__has_cpu(&target)) default_attrs0[0].config = PERF_COUNT_SW_CPU_CLOCK; -- cgit v1.2.3 From 92637cc7295510f4b3cb945cafcaec97c82e42f2 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:28 +0800 Subject: perf stat: Filter out unmatched aggregation for hybrid event perf-stat has supported some aggregation modes, such as --per-core, --per-socket and etc. While for hybrid event, it may only available on part of cpus. So for --per-core, we need to filter out the unavailable cores, for --per-socket, filter out the unavailable sockets, and so on. Before: # perf stat --per-core -e cpu_core/cycles/ -a -- sleep 1 Performance counter stats for 'system wide': S0-D0-C0 2 479,530 cpu_core/cycles/ S0-D0-C4 2 175,007 cpu_core/cycles/ S0-D0-C8 2 166,240 cpu_core/cycles/ S0-D0-C12 2 704,673 cpu_core/cycles/ S0-D0-C16 2 865,835 cpu_core/cycles/ S0-D0-C20 2 2,958,461 cpu_core/cycles/ S0-D0-C24 2 163,988 cpu_core/cycles/ S0-D0-C28 2 164,729 cpu_core/cycles/ S0-D0-C32 0 cpu_core/cycles/ S0-D0-C33 0 cpu_core/cycles/ S0-D0-C34 0 cpu_core/cycles/ S0-D0-C35 0 cpu_core/cycles/ S0-D0-C36 0 cpu_core/cycles/ S0-D0-C37 0 cpu_core/cycles/ S0-D0-C38 0 cpu_core/cycles/ S0-D0-C39 0 cpu_core/cycles/ 1.003597211 seconds time elapsed After: # perf stat --per-core -e cpu_core/cycles/ -a -- sleep 1 Performance counter stats for 'system wide': S0-D0-C0 2 210,428 cpu_core/cycles/ S0-D0-C4 2 444,830 cpu_core/cycles/ S0-D0-C8 2 435,241 cpu_core/cycles/ S0-D0-C12 2 423,976 cpu_core/cycles/ S0-D0-C16 2 859,350 cpu_core/cycles/ S0-D0-C20 2 1,559,589 cpu_core/cycles/ S0-D0-C24 2 163,924 cpu_core/cycles/ S0-D0-C28 2 376,610 cpu_core/cycles/ 1.003621290 seconds time elapsed Signed-off-by: Jin Yao Co-developed-by: Jiri Olsa Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-16-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-display.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 0679129ad05e..a76fff5e7d83 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -667,6 +667,9 @@ static void print_counter_aggrdata(struct perf_stat_config *config, if (!collect_data(config, counter, aggr_cb, &ad)) return; + if (perf_pmu__has_hybrid() && ad.ena == 0) + return; + nr = ad.nr; ena = ad.ena; run = ad.run; -- cgit v1.2.3 From 660e533e87ff4e66434f90fca987b929d4eb0059 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:29 +0800 Subject: perf stat: Warn group events from different hybrid PMU If a group has events which are from different hybrid PMUs, shows a warning: "WARNING: events in group from different hybrid PMUs!" This is to remind the user not to put the core event and atom event into one group. Next, just disable grouping. # perf stat -e "{cpu_core/cycles/,cpu_atom/cycles/}" -a -- sleep 1 WARNING: events in group from different hybrid PMUs! WARNING: grouped events cpus do not match, disabling group: anon group { cpu_core/cycles/, cpu_atom/cycles/ } Performance counter stats for 'system wide': 5,438,125 cpu_core/cycles/ 3,914,586 cpu_atom/cycles/ 1.004250966 seconds time elapsed Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-17-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 4 ++++ tools/perf/util/evlist-hybrid.c | 47 ++++++++++++++++++++++++++++++++++++++ tools/perf/util/evlist-hybrid.h | 2 ++ tools/perf/util/evsel.c | 6 +++++ tools/perf/util/evsel.h | 1 + tools/perf/util/python-ext-sources | 2 ++ 6 files changed, 62 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 42e60764ad8d..5a830ae09418 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -48,6 +48,7 @@ #include "util/pmu.h" #include "util/event.h" #include "util/evlist.h" +#include "util/evlist-hybrid.h" #include "util/evsel.h" #include "util/debug.h" #include "util/color.h" @@ -243,6 +244,9 @@ static void evlist__check_cpu_maps(struct evlist *evlist) struct evsel *evsel, *pos, *leader; char buf[1024]; + if (evlist__has_hybrid(evlist)) + evlist__warn_hybrid_group(evlist); + evlist__for_each_entry(evlist, evsel) { leader = evsel->leader; diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c index e11998526f2e..db3f5fbdebe1 100644 --- a/tools/perf/util/evlist-hybrid.c +++ b/tools/perf/util/evlist-hybrid.c @@ -7,6 +7,7 @@ #include "../perf.h" #include "util/pmu-hybrid.h" #include "util/evlist-hybrid.h" +#include "debug.h" #include #include #include @@ -39,3 +40,49 @@ int evlist__add_default_hybrid(struct evlist *evlist, bool precise) return 0; } + +static bool group_hybrid_conflict(struct evsel *leader) +{ + struct evsel *pos, *prev = NULL; + + for_each_group_evsel(pos, leader) { + if (!evsel__is_hybrid(pos)) + continue; + + if (prev && strcmp(prev->pmu_name, pos->pmu_name)) + return true; + + prev = pos; + } + + return false; +} + +void evlist__warn_hybrid_group(struct evlist *evlist) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + if (evsel__is_group_leader(evsel) && + evsel->core.nr_members > 1 && + group_hybrid_conflict(evsel)) { + pr_warning("WARNING: events in group from " + "different hybrid PMUs!\n"); + return; + } + } +} + +bool evlist__has_hybrid(struct evlist *evlist) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + if (evsel->pmu_name && + perf_pmu__is_hybrid(evsel->pmu_name)) { + return true; + } + } + + return false; +} diff --git a/tools/perf/util/evlist-hybrid.h b/tools/perf/util/evlist-hybrid.h index e25861649d8f..19f74b4c340a 100644 --- a/tools/perf/util/evlist-hybrid.h +++ b/tools/perf/util/evlist-hybrid.h @@ -8,5 +8,7 @@ #include int evlist__add_default_hybrid(struct evlist *evlist, bool precise); +void evlist__warn_hybrid_group(struct evlist *evlist); +bool evlist__has_hybrid(struct evlist *evlist); #endif /* __PERF_EVLIST_HYBRID_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 0fc9555a2ac4..4a3cd1b5bb33 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -47,6 +47,7 @@ #include "memswap.h" #include "util.h" #include "hashmap.h" +#include "pmu-hybrid.h" #include "../perf-sys.h" #include "util/parse-branch-options.h" #include @@ -2819,3 +2820,8 @@ void evsel__zero_per_pkg(struct evsel *evsel) hashmap__clear(evsel->per_pkg_mask); } } + +bool evsel__is_hybrid(struct evsel *evsel) +{ + return evsel->pmu_name && perf_pmu__is_hybrid(evsel->pmu_name); +} diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index a9cc9563ac59..75cf5dbfe208 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -462,4 +462,5 @@ struct perf_env *evsel__env(struct evsel *evsel); int evsel__store_ids(struct evsel *evsel, struct evlist *evlist); void evsel__zero_per_pkg(struct evsel *evsel); +bool evsel__is_hybrid(struct evsel *evsel); #endif /* __PERF_EVSEL_H */ diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index 845dd46e3c61..d7c976671e3a 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -37,3 +37,5 @@ util/units.c util/affinity.c util/rwsem.c util/hashmap.c +util/pmu-hybrid.c +util/fncache.c -- cgit v1.2.3 From 91c0f5ec812f38f5e900b5557254baf563c4a2e3 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:30 +0800 Subject: perf record: Uniquify hybrid event name For perf-record, it would be useful to tell user the pmu which the event belongs to. For example, # perf record -a -- sleep 1 # perf report # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 106 of event 'cpu_core/cycles/' # Event count (approx.): 22043448 # # Overhead Command Shared Object Symbol # ........ ............ ....................... ............................ # ... Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-18-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6af46c6a4fd8..3337b5f93336 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1605,6 +1605,32 @@ static void hit_auxtrace_snapshot_trigger(struct record *rec) } } +static void record__uniquify_name(struct record *rec) +{ + struct evsel *pos; + struct evlist *evlist = rec->evlist; + char *new_name; + int ret; + + if (!perf_pmu__has_hybrid()) + return; + + evlist__for_each_entry(evlist, pos) { + if (!evsel__is_hybrid(pos)) + continue; + + if (strchr(pos->name, '/')) + continue; + + ret = asprintf(&new_name, "%s/%s/", + pos->pmu_name, pos->name); + if (ret) { + free(pos->name); + pos->name = new_name; + } + } +} + static int __cmd_record(struct record *rec, int argc, const char **argv) { int err; @@ -1709,6 +1735,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) if (data->is_pipe && rec->evlist->core.nr_entries == 1) rec->opts.sample_id = true; + record__uniquify_name(rec); + if (record__open(rec) != 0) { err = -1; goto out_child; -- cgit v1.2.3 From 2541cb63ac0c3dfbbe363dd09a16dfdd4096fc88 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:31 +0800 Subject: perf tests: Add hybrid cases for 'Parse event definition strings' test Add basic hybrid test cases for 'Parse event definition strings' test. # perf test 6 6: Parse event definition strings : Ok Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-19-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 171 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 026c54743311..0f113b2b36a3 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1512,6 +1512,124 @@ static int test__all_tracepoints(struct evlist *evlist) return test__checkevent_tracepoint_multi(evlist); } +static int test__hybrid_hw_event_with_pmu(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config); + return 0; +} + +static int test__hybrid_hw_group_event(struct evlist *evlist) +{ + struct evsel *evsel, *leader; + + evsel = leader = evlist__first(evlist); + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); + + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0xc0 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); + return 0; +} + +static int test__hybrid_sw_hw_group_event(struct evlist *evlist) +{ + struct evsel *evsel, *leader; + + evsel = leader = evlist__first(evlist); + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); + + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); + return 0; +} + +static int test__hybrid_hw_sw_group_event(struct evlist *evlist) +{ + struct evsel *evsel, *leader; + + evsel = leader = evlist__first(evlist); + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); + + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); + return 0; +} + +static int test__hybrid_group_modifier1(struct evlist *evlist) +{ + struct evsel *evsel, *leader; + + evsel = leader = evlist__first(evlist); + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); + TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel); + + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0xc0 == evsel->core.attr.config); + TEST_ASSERT_VAL("wrong leader", evsel->leader == leader); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel); + return 0; +} + +static int test__hybrid_raw1(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config); + + /* The type of second event is randome value */ + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config); + return 0; +} + +static int test__hybrid_raw2(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config); + return 0; +} + +static int test__hybrid_cache_event(struct evlist *evlist) +{ + struct evsel *evsel = evlist__first(evlist); + + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x2 == (evsel->core.attr.config & 0xffffffff)); + + evsel = evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type); + TEST_ASSERT_VAL("wrong config", 0x10002 == (evsel->core.attr.config & 0xffffffff)); + return 0; +} + struct evlist_test { const char *name; __u32 type; @@ -1868,6 +1986,54 @@ static struct terms_test test__terms[] = { }, }; +static struct evlist_test test__hybrid_events[] = { + { + .name = "cpu_core/cpu-cycles/", + .check = test__hybrid_hw_event_with_pmu, + .id = 0, + }, + { + .name = "{cpu_core/cpu-cycles/,cpu_core/instructions/}", + .check = test__hybrid_hw_group_event, + .id = 1, + }, + { + .name = "{cpu-clock,cpu_core/cpu-cycles/}", + .check = test__hybrid_sw_hw_group_event, + .id = 2, + }, + { + .name = "{cpu_core/cpu-cycles/,cpu-clock}", + .check = test__hybrid_hw_sw_group_event, + .id = 3, + }, + { + .name = "{cpu_core/cpu-cycles/k,cpu_core/instructions/u}", + .check = test__hybrid_group_modifier1, + .id = 4, + }, + { + .name = "r1a", + .check = test__hybrid_raw1, + .id = 5, + }, + { + .name = "cpu_core/r1a/", + .check = test__hybrid_raw2, + .id = 6, + }, + { + .name = "cpu_core/config=10,config1,config2=3,period=1000/u", + .check = test__checkevent_pmu, + .id = 7, + }, + { + .name = "cpu_core/LLC-loads/,cpu_atom/LLC-load-misses/", + .check = test__hybrid_cache_event, + .id = 8, + }, +}; + static int test_event(struct evlist_test *e) { struct parse_events_error err; @@ -2035,6 +2201,11 @@ do { \ ret2 = ret1; \ } while (0) + if (perf_pmu__has_hybrid()) { + TEST_EVENTS(test__hybrid_events); + return ret2; + } + TEST_EVENTS(test__events); if (test_pmu()) -- cgit v1.2.3 From afff9f312e37c64a789aad0fab1ec597404a500f Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:32 +0800 Subject: perf tests: Add hybrid cases for 'Roundtrip evsel->name' test Since for one hw event, two hybrid events are created. For example, evsel->idx evsel__name(evsel) 0 cycles 1 cycles 2 instructions 3 instructions ... So for comparing the evsel name on hybrid, the evsel->idx needs to be divided by 2. # ./perf test 14 14: Roundtrip evsel->name : Ok Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-20-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/evsel-roundtrip-name.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c index f7f3e5b4c180..b74cf80d1f10 100644 --- a/tools/perf/tests/evsel-roundtrip-name.c +++ b/tools/perf/tests/evsel-roundtrip-name.c @@ -4,6 +4,7 @@ #include "parse-events.h" #include "tests.h" #include "debug.h" +#include "pmu.h" #include #include @@ -62,7 +63,8 @@ static int perf_evsel__roundtrip_cache_name_test(void) return ret; } -static int __perf_evsel__name_array_test(const char *names[], int nr_names) +static int __perf_evsel__name_array_test(const char *names[], int nr_names, + int distance) { int i, err; struct evsel *evsel; @@ -82,9 +84,9 @@ static int __perf_evsel__name_array_test(const char *names[], int nr_names) err = 0; evlist__for_each_entry(evlist, evsel) { - if (strcmp(evsel__name(evsel), names[evsel->idx])) { + if (strcmp(evsel__name(evsel), names[evsel->idx / distance])) { --err; - pr_debug("%s != %s\n", evsel__name(evsel), names[evsel->idx]); + pr_debug("%s != %s\n", evsel__name(evsel), names[evsel->idx / distance]); } } @@ -93,18 +95,21 @@ out_delete_evlist: return err; } -#define perf_evsel__name_array_test(names) \ - __perf_evsel__name_array_test(names, ARRAY_SIZE(names)) +#define perf_evsel__name_array_test(names, distance) \ + __perf_evsel__name_array_test(names, ARRAY_SIZE(names), distance) int test__perf_evsel__roundtrip_name_test(struct test *test __maybe_unused, int subtest __maybe_unused) { int err = 0, ret = 0; - err = perf_evsel__name_array_test(evsel__hw_names); + if (perf_pmu__has_hybrid()) + return perf_evsel__name_array_test(evsel__hw_names, 2); + + err = perf_evsel__name_array_test(evsel__hw_names, 1); if (err) ret = err; - err = __perf_evsel__name_array_test(evsel__sw_names, PERF_COUNT_SW_DUMMY + 1); + err = __perf_evsel__name_array_test(evsel__sw_names, PERF_COUNT_SW_DUMMY + 1, 1); if (err) ret = err; -- cgit v1.2.3 From f15da0b1fb7bdff4891218f648d374cfffeb24fa Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:33 +0800 Subject: perf tests: Skip 'Setup struct perf_event_attr' test for hybrid For hybrid, the attr.type consists of pmu type id + original type. There will be much changes for this test. Now we temporarily skip this test case and TODO in future. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-21-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index dd39ce9b0277..9b40a25376ae 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -34,6 +34,7 @@ #include "event.h" #include "util.h" #include "tests.h" +#include "pmu.h" #define ENV "PERF_TEST_ATTR" @@ -184,6 +185,9 @@ int test__attr(struct test *test __maybe_unused, int subtest __maybe_unused) char path_dir[PATH_MAX]; char *exec_path; + if (perf_pmu__has_hybrid()) + return TEST_SKIP; + /* First try development tree tests. */ if (!lstat("./tests", &st)) return run_dir("./tests", "./perf"); -- cgit v1.2.3 From 43eb05d066795bdfea58a6a0cea77bbaa1a09b30 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:34 +0800 Subject: perf tests: Support 'Track with sched_switch' test for hybrid Since for "cycles:u' on hybrid platform, it creates two "cycles". So the number of events in evlist is not expected in next test steps. Now we just use one event "cpu_core/cycles:u/" for hybrid. # ./perf test 35 35: Track with sched_switch : Ok Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-22-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/switch-tracking.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index 3ebaa758df77..62c0ec21aaa8 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -18,6 +18,7 @@ #include "record.h" #include "tests.h" #include "util/mmap.h" +#include "pmu.h" static int spin_sleep(void) { @@ -371,7 +372,10 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ cpu_clocks_evsel = evlist__last(evlist); /* Second event */ - err = parse_events(evlist, "cycles:u", NULL); + if (perf_pmu__has_hybrid()) + err = parse_events(evlist, "cpu_core/cycles/u", NULL); + else + err = parse_events(evlist, "cycles:u", NULL); if (err) { pr_debug("Failed to parse event cycles:u\n"); goto out_err; -- cgit v1.2.3 From 6081e876edd3f5d23273385730e482eca0afb2c8 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:35 +0800 Subject: perf tests: Support 'Parse and process metrics' test for hybrid Some events are not supported. Only pick up some cases for hybrid. # ./perf test 68 68: Parse and process metrics : Ok Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-23-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-metric.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c index 4968c4106254..4f6f4904e852 100644 --- a/tools/perf/tests/parse-metric.c +++ b/tools/perf/tests/parse-metric.c @@ -11,6 +11,7 @@ #include "debug.h" #include "expr.h" #include "stat.h" +#include "pmu.h" static struct pmu_event pme_test[] = { { @@ -372,10 +373,13 @@ int test__parse_metric(struct test *test __maybe_unused, int subtest __maybe_unu { TEST_ASSERT_VAL("IPC failed", test_ipc() == 0); TEST_ASSERT_VAL("frontend failed", test_frontend() == 0); - TEST_ASSERT_VAL("cache_miss_cycles failed", test_cache_miss_cycles() == 0); TEST_ASSERT_VAL("DCache_L2 failed", test_dcache_l2() == 0); TEST_ASSERT_VAL("recursion fail failed", test_recursion_fail() == 0); - TEST_ASSERT_VAL("test metric group", test_metric_group() == 0); TEST_ASSERT_VAL("Memory bandwidth", test_memory_bandwidth() == 0); + + if (!perf_pmu__has_hybrid()) { + TEST_ASSERT_VAL("cache_miss_cycles failed", test_cache_miss_cycles() == 0); + TEST_ASSERT_VAL("test metric group", test_metric_group() == 0); + } return 0; } -- cgit v1.2.3 From c102038892f73cf70f8c50e4fafb45d6e5465129 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:36 +0800 Subject: perf tests: Support 'Session topology' test for hybrid Force to create one event "cpu_core/cycles/" by default, otherwise in evlist__valid_sample_type, the checking of 'if (evlist->core.nr_entries == 1)' would be failed. # ./perf test 41 41: Session topology : Ok Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-24-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/topology.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 050489807a47..ec4e3b21b831 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -8,6 +8,7 @@ #include "session.h" #include "evlist.h" #include "debug.h" +#include "pmu.h" #include #define TEMPL "/tmp/perf-test-XXXXXX" @@ -40,8 +41,16 @@ static int session_write_header(char *path) session = perf_session__new(&data, false, NULL); TEST_ASSERT_VAL("can't get session", !IS_ERR(session)); - session->evlist = evlist__new_default(); - TEST_ASSERT_VAL("can't get evlist", session->evlist); + if (!perf_pmu__has_hybrid()) { + session->evlist = evlist__new_default(); + TEST_ASSERT_VAL("can't get evlist", session->evlist); + } else { + struct parse_events_error err; + + session->evlist = evlist__new(); + TEST_ASSERT_VAL("can't get evlist", session->evlist); + parse_events(session->evlist, "cpu_core/cycles/", &err); + } perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY); perf_header__set_feat(&session->header, HEADER_NRCPUS); -- cgit v1.2.3 From d9da6f70eb23511007cc6ed0aba02d9f61b3d6cf Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:37 +0800 Subject: perf tests: Support 'Convert perf time to TSC' test for hybrid Since for "cycles:u' on hybrid platform, it creates two "cycles". So the second evsel in evlist also needs initialization. With this patch, # ./perf test 71 71: Convert perf time to TSC : Ok Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-25-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/perf-time-to-tsc.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c index 680c3cffb128..85d75b9b25a1 100644 --- a/tools/perf/tests/perf-time-to-tsc.c +++ b/tools/perf/tests/perf-time-to-tsc.c @@ -20,6 +20,7 @@ #include "tsc.h" #include "mmap.h" #include "tests.h" +#include "pmu.h" #define CHECK__(x) { \ while ((x) < 0) { \ @@ -88,6 +89,17 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe evsel->core.attr.disabled = 1; evsel->core.attr.enable_on_exec = 0; + /* + * For hybrid "cycles:u", it creates two events. + * Init the second evsel here. + */ + if (perf_pmu__has_hybrid()) { + evsel = evsel__next(evsel); + evsel->core.attr.comm = 1; + evsel->core.attr.disabled = 1; + evsel->core.attr.enable_on_exec = 0; + } + CHECK__(evlist__open(evlist)); CHECK__(evlist__mmap(evlist, UINT_MAX)); -- cgit v1.2.3 From a37f3b885610f89c3f2285756eb3f386288c3d41 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:38 +0800 Subject: perf tests: Skip 'perf stat metrics (shadow stat) test' for hybrid Currently we don't support shadow stat for hybrid. root@ssp-pwrt-002:~# ./perf stat -e cycles,instructions -a -- sleep 1 Performance counter stats for 'system wide': 12,883,109,591 cpu_core/cycles/ 6,405,163,221 cpu_atom/cycles/ 555,553,778 cpu_core/instructions/ 841,158,734 cpu_atom/instructions/ 1.002644773 seconds time elapsed Now there is no shadow stat 'insn per cycle' reported. We will support it later and now just skip the 'perf stat metrics (shadow stat) test'. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-26-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat+shadow_stat.sh | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/tests/shell/stat+shadow_stat.sh b/tools/perf/tests/shell/stat+shadow_stat.sh index ebebd3596cf9..e6e35fc6c882 100755 --- a/tools/perf/tests/shell/stat+shadow_stat.sh +++ b/tools/perf/tests/shell/stat+shadow_stat.sh @@ -7,6 +7,9 @@ set -e # skip if system-wide mode is forbidden perf stat -a true > /dev/null 2>&1 || exit 2 +# skip if on hybrid platform +perf stat -a -e cycles sleep 1 2>&1 | grep -e cpu_core && exit 2 + test_global_aggr() { perf stat -a --no-big-num -e cycles,instructions sleep 1 2>&1 | \ -- cgit v1.2.3 From 2750ce1d4df2e70630d76bc53da160ca43a80d22 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:39 +0800 Subject: perf Documentation: Document intel-hybrid support Add some words and examples to help understanding of Intel hybrid perf support. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-27-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/intel-hybrid.txt | 214 ++++++++++++++++++++++++++++++ tools/perf/Documentation/perf-record.txt | 1 + tools/perf/Documentation/perf-stat.txt | 2 + 3 files changed, 217 insertions(+) create mode 100644 tools/perf/Documentation/intel-hybrid.txt (limited to 'tools') diff --git a/tools/perf/Documentation/intel-hybrid.txt b/tools/perf/Documentation/intel-hybrid.txt new file mode 100644 index 000000000000..07f0aa3bf682 --- /dev/null +++ b/tools/perf/Documentation/intel-hybrid.txt @@ -0,0 +1,214 @@ +Intel hybrid support +-------------------- +Support for Intel hybrid events within perf tools. + +For some Intel platforms, such as AlderLake, which is hybrid platform and +it consists of atom cpu and core cpu. Each cpu has dedicated event list. +Part of events are available on core cpu, part of events are available +on atom cpu and even part of events are available on both. + +Kernel exports two new cpu pmus via sysfs: +/sys/devices/cpu_core +/sys/devices/cpu_atom + +The 'cpus' files are created under the directories. For example, + +cat /sys/devices/cpu_core/cpus +0-15 + +cat /sys/devices/cpu_atom/cpus +16-23 + +It indicates cpu0-cpu15 are core cpus and cpu16-cpu23 are atom cpus. + +Quickstart + +List hybrid event +----------------- + +As before, use perf-list to list the symbolic event. + +perf list + +inst_retired.any + [Fixed Counter: Counts the number of instructions retired. Unit: cpu_atom] +inst_retired.any + [Number of instructions retired. Fixed Counter - architectural event. Unit: cpu_core] + +The 'Unit: xxx' is added to brief description to indicate which pmu +the event is belong to. Same event name but with different pmu can +be supported. + +Enable hybrid event with a specific pmu +--------------------------------------- + +To enable a core only event or atom only event, following syntax is supported: + + cpu_core// +or + cpu_atom// + +For example, count the 'cycles' event on core cpus. + + perf stat -e cpu_core/cycles/ + +Create two events for one hardware event automatically +------------------------------------------------------ + +When creating one event and the event is available on both atom and core, +two events are created automatically. One is for atom, the other is for +core. Most of hardware events and cache events are available on both +cpu_core and cpu_atom. + +For hardware events, they have pre-defined configs (e.g. 0 for cycles). +But on hybrid platform, kernel needs to know where the event comes from +(from atom or from core). The original perf event type PERF_TYPE_HARDWARE +can't carry pmu information. So now this type is extended to be PMU aware +type. The PMU type ID is stored at attr.config[63:32]. + +PMU type ID is retrieved from sysfs. +/sys/devices/cpu_atom/type +/sys/devices/cpu_core/type + +The new attr.config layout for PERF_TYPE_HARDWARE: + +PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA + AA: hardware event ID + EEEEEEEE: PMU type ID + +Cache event is similar. The type PERF_TYPE_HW_CACHE is extended to be +PMU aware type. The PMU type ID is stored at attr.config[63:32]. + +The new attr.config layout for PERF_TYPE_HW_CACHE: + +PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB + BB: hardware cache ID + CC: hardware cache op ID + DD: hardware cache op result ID + EEEEEEEE: PMU type ID + +When enabling a hardware event without specified pmu, such as, +perf stat -e cycles -a (use system-wide in this example), two events +are created automatically. + + ------------------------------------------------------------ + perf_event_attr: + size 120 + config 0x400000000 + sample_type IDENTIFIER + read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING + disabled 1 + inherit 1 + exclude_guest 1 + ------------------------------------------------------------ + +and + + ------------------------------------------------------------ + perf_event_attr: + size 120 + config 0x800000000 + sample_type IDENTIFIER + read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING + disabled 1 + inherit 1 + exclude_guest 1 + ------------------------------------------------------------ + +type 0 is PERF_TYPE_HARDWARE. +0x4 in 0x400000000 indicates it's cpu_core pmu. +0x8 in 0x800000000 indicates it's cpu_atom pmu (atom pmu type id is random). + +The kernel creates 'cycles' (0x400000000) on cpu0-cpu15 (core cpus), +and create 'cycles' (0x800000000) on cpu16-cpu23 (atom cpus). + +For perf-stat result, it displays two events: + + Performance counter stats for 'system wide': + + 6,744,979 cpu_core/cycles/ + 1,965,552 cpu_atom/cycles/ + +The first 'cycles' is core event, the second 'cycles' is atom event. + +Thread mode example: +-------------------- + +perf-stat reports the scaled counts for hybrid event and with a percentage +displayed. The percentage is the event's running time/enabling time. + +One example, 'triad_loop' runs on cpu16 (atom core), while we can see the +scaled value for core cycles is 160,444,092 and the percentage is 0.47%. + +perf stat -e cycles -- taskset -c 16 ./triad_loop + +As previous, two events are created. + +------------------------------------------------------------ +perf_event_attr: + size 120 + config 0x400000000 + sample_type IDENTIFIER + read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING + disabled 1 + inherit 1 + enable_on_exec 1 + exclude_guest 1 +------------------------------------------------------------ + +and + +------------------------------------------------------------ +perf_event_attr: + size 120 + config 0x800000000 + sample_type IDENTIFIER + read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING + disabled 1 + inherit 1 + enable_on_exec 1 + exclude_guest 1 +------------------------------------------------------------ + + Performance counter stats for 'taskset -c 16 ./triad_loop': + + 233,066,666 cpu_core/cycles/ (0.43%) + 604,097,080 cpu_atom/cycles/ (99.57%) + +perf-record: +------------ + +If there is no '-e' specified in perf record, on hybrid platform, +it creates two default 'cycles' and adds them to event list. One +is for core, the other is for atom. + +perf-stat: +---------- + +If there is no '-e' specified in perf stat, on hybrid platform, +besides of software events, following events are created and +added to event list in order. + +cpu_core/cycles/, +cpu_atom/cycles/, +cpu_core/instructions/, +cpu_atom/instructions/, +cpu_core/branches/, +cpu_atom/branches/, +cpu_core/branch-misses/, +cpu_atom/branch-misses/ + +Of course, both perf-stat and perf-record support to enable +hybrid event with a specific pmu. + +e.g. +perf stat -e cpu_core/cycles/ +perf stat -e cpu_atom/cycles/ +perf stat -e cpu_core/r1a/ +perf stat -e cpu_atom/L1-icache-loads/ +perf stat -e cpu_core/cycles/,cpu_atom/instructions/ +perf stat -e '{cpu_core/cycles/,cpu_core/instructions/}' + +But '{cpu_core/cycles/,cpu_atom/instructions/}' will return +warning and disable grouping, because the pmus in group are +not matched (cpu_core vs. cpu_atom). diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index f3161c9673e9..d71bac847936 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -695,6 +695,7 @@ measurements: wait -n ${perf_pid} exit $? +include::intel-hybrid.txt[] SEE ALSO -------- diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index f10e24da23e9..45c2467e4eb2 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -552,6 +552,8 @@ The fields are in this order: Additional metrics may be printed with all earlier fields being empty. +include::intel-hybrid.txt[] + SEE ALSO -------- linkperf:perf-top[1], linkperf:perf-list[1] -- cgit v1.2.3 From 56d32d4cac645bac05fa70d935fa5040e3ab6bb3 Mon Sep 17 00:00:00 2001 From: Michael Petlan Date: Wed, 28 Apr 2021 11:20:23 +0200 Subject: perf tools: Enable libtraceevent dynamic linking Currently we support only static linking with kernel's libtraceevent (tools/lib/traceevent). This patch adds libtraceevent package detection and support to link perf with it dynamically. The libtraceevent package status is displayed with: $ make VF=1 LIBTRACEEVENT_DYNAMIC=1 ... ... libtraceevent: [ on ] Default behavior remains the same (static linking). Committer testing: $ make LIBTRACEEVENT_DYNAMIC=1 VF=1 O=/tmp/build/perf -C tools/perf install-bin |& grep traceevent Makefile.config:1090: *** Error: No libtraceevent devel library found, please install libtraceevent-devel. Stop. $ Signed-off-by: Michael Petlan Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa LPU-Reference: 20210428092023.4009-1-mpetlan@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 1 + tools/build/feature/Makefile | 4 ++++ tools/build/feature/test-libtraceevent.c | 12 ++++++++++++ tools/perf/Makefile.config | 9 +++++++++ tools/perf/Makefile.perf | 8 ++++++-- 5 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 tools/build/feature/test-libtraceevent.c (limited to 'tools') diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 74e255d58d8d..b3221a43fa65 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -52,6 +52,7 @@ FEATURE_TESTS_BASIC := \ libpython-version \ libslang \ libslang-include-subdir \ + libtraceevent \ libcrypto \ libunwind \ pthread-attr-setaffinity-np \ diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 3e55edb3ea54..ec203e28407f 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -36,6 +36,7 @@ FILES= \ test-libpython-version.bin \ test-libslang.bin \ test-libslang-include-subdir.bin \ + test-libtraceevent.bin \ test-libcrypto.bin \ test-libunwind.bin \ test-libunwind-debug-frame.bin \ @@ -196,6 +197,9 @@ $(OUTPUT)test-libslang.bin: $(OUTPUT)test-libslang-include-subdir.bin: $(BUILD) -lslang +$(OUTPUT)test-libtraceevent.bin: + $(BUILD) -ltraceevent + $(OUTPUT)test-libcrypto.bin: $(BUILD) -lcrypto diff --git a/tools/build/feature/test-libtraceevent.c b/tools/build/feature/test-libtraceevent.c new file mode 100644 index 000000000000..416b11ffd4b4 --- /dev/null +++ b/tools/build/feature/test-libtraceevent.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +int main(void) +{ + int rv = 0; + struct trace_seq s; + trace_seq_init(&s); + rv += !(s.state == TRACE_SEQ__GOOD); + trace_seq_destroy(&s); + return rv; +} diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 3514fe956ed1..99c73a7b6a26 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -1079,6 +1079,15 @@ ifdef LIBPFM4 endif endif +ifdef LIBTRACEEVENT_DYNAMIC + $(call feature_check,libtraceevent) + ifeq ($(feature-libtraceevent), 1) + EXTLIBS += -ltraceevent + else + dummy := $(error Error: No libtraceevent devel library found, please install libtraceevent-devel); + endif +endif + # Among the variables below, these: # perfexecdir # perf_include_dir diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 6240fbb1646e..e47f04e5b51e 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -128,6 +128,8 @@ include ../scripts/utilities.mak # # Define BUILD_BPF_SKEL to enable BPF skeletons # +# Define LIBTRACEEVENT_DYNAMIC to enable libtraceevent dynamic linking +# # As per kernel Makefile, avoid funny character set dependencies unexport LC_ALL @@ -310,7 +312,6 @@ endif LIBTRACEEVENT = $(TE_PATH)libtraceevent.a export LIBTRACEEVENT - LIBTRACEEVENT_DYNAMIC_LIST = $(PLUGINS_PATH)libtraceevent-dynamic-list # @@ -375,12 +376,15 @@ endif export PERL_PATH -PERFLIBS = $(LIBAPI) $(LIBTRACEEVENT) $(LIBSUBCMD) $(LIBPERF) +PERFLIBS = $(LIBAPI) $(LIBSUBCMD) $(LIBPERF) ifndef NO_LIBBPF ifndef LIBBPF_DYNAMIC PERFLIBS += $(LIBBPF) endif endif +ifndef LIBTRACEEVENT_DYNAMIC + PERFLIBS += $(LIBTRACEEVENT) +endif # We choose to avoid "if .. else if .. else .. endif endif" # because maintaining the nesting to match is a pain. If -- cgit v1.2.3 From e1d380ea8b00db4bb14d1f513000d4b62aa9d3f0 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 28 Apr 2021 20:09:12 +0800 Subject: perf tools: Change fields type in perf_record_time_conv C standard claims "An object declared as type _Bool is large enough to store the values 0 and 1", bool type size can be 1 byte or larger than 1 byte. Thus it's uncertian for bool type size with different compilers. This patch changes the bool type in structure perf_record_time_conv to __u8 type, and pads extra bytes for 8-byte alignment; this can give reliable structure size. Fixes: d110162cafc8 ("perf tsc: Support cap_user_time_short for event TIME_CONV") Suggested-by: Adrian Hunter Signed-off-by: Leo Yan Acked-by: Adrian Hunter Cc: Alexander Shishkin Cc: Gustavo A. R. Silva Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steve MacLean Cc: Yonatan Goldschmidt Link: https://lore.kernel.org/r/20210428120915.7123-2-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/perf/event.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index d82054225fcc..48583e441d9b 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -346,8 +346,9 @@ struct perf_record_time_conv { __u64 time_zero; __u64 time_cycles; __u64 time_mask; - bool cap_user_time_zero; - bool cap_user_time_short; + __u8 cap_user_time_zero; + __u8 cap_user_time_short; + __u8 reserved[6]; /* For alignment */ }; struct perf_record_header_feature { -- cgit v1.2.3 From aa616f5a8a2d22a179d5502ebd85045af66fa656 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 28 Apr 2021 20:09:13 +0800 Subject: perf jit: Let convert_timestamp() to be backwards-compatible Commit d110162cafc80dad ("perf tsc: Support cap_user_time_short for event TIME_CONV") supports the extended parameters for event TIME_CONV, but it broke the backwards compatibility, so any perf data file with old event format fails to convert timestamp. This patch introduces a helper event_contains() to check if an event contains a specific member or not. For the backwards-compatibility, if the event size confirms the extended parameters are supported in the event TIME_CONV, then copies these parameters. Committer notes: To make this compiler backwards compatible add this patch: - struct perf_tsc_conversion tc = { 0 }; + struct perf_tsc_conversion tc = { .time_shift = 0, }; Fixes: d110162cafc8 ("perf tsc: Support cap_user_time_short for event TIME_CONV") Signed-off-by: Leo Yan Acked-by: Adrian Hunter Cc: Alexander Shishkin Cc: Gustavo A. R. Silva Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steve MacLean Cc: Yonatan Goldschmidt Link: https://lore.kernel.org/r/20210428120915.7123-3-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/include/perf/event.h | 2 ++ tools/perf/util/jitdump.c | 30 ++++++++++++++++++++---------- 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index 48583e441d9b..4d0c02ba3f7d 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -8,6 +8,8 @@ #include #include /* pid_t */ +#define event_contains(obj, mem) ((obj).header.size > offsetof(typeof(obj), mem)) + struct perf_record_mmap { struct perf_event_header header; __u32 pid, tid; diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 9760d8e7b386..917a9c707371 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -396,21 +396,31 @@ static pid_t jr_entry_tid(struct jit_buf_desc *jd, union jr_entry *jr) static uint64_t convert_timestamp(struct jit_buf_desc *jd, uint64_t timestamp) { - struct perf_tsc_conversion tc; + struct perf_tsc_conversion tc = { .time_shift = 0, }; + struct perf_record_time_conv *time_conv = &jd->session->time_conv; if (!jd->use_arch_timestamp) return timestamp; - tc.time_shift = jd->session->time_conv.time_shift; - tc.time_mult = jd->session->time_conv.time_mult; - tc.time_zero = jd->session->time_conv.time_zero; - tc.time_cycles = jd->session->time_conv.time_cycles; - tc.time_mask = jd->session->time_conv.time_mask; - tc.cap_user_time_zero = jd->session->time_conv.cap_user_time_zero; - tc.cap_user_time_short = jd->session->time_conv.cap_user_time_short; + tc.time_shift = time_conv->time_shift; + tc.time_mult = time_conv->time_mult; + tc.time_zero = time_conv->time_zero; - if (!tc.cap_user_time_zero) - return 0; + /* + * The event TIME_CONV was extended for the fields from "time_cycles" + * when supported cap_user_time_short, for backward compatibility, + * checks the event size and assigns these extended fields if these + * fields are contained in the event. + */ + if (event_contains(*time_conv, time_cycles)) { + tc.time_cycles = time_conv->time_cycles; + tc.time_mask = time_conv->time_mask; + tc.cap_user_time_zero = time_conv->cap_user_time_zero; + tc.cap_user_time_short = time_conv->cap_user_time_short; + + if (!tc.cap_user_time_zero) + return 0; + } return tsc_to_perf_time(timestamp, &tc); } -- cgit v1.2.3 From 050ffc449008eeeafc187dec337d9cf1518f89bc Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 28 Apr 2021 20:09:14 +0800 Subject: perf session: Add swap operation for event TIME_CONV Since commit d110162cafc8 ("perf tsc: Support cap_user_time_short for event TIME_CONV"), the event PERF_RECORD_TIME_CONV has extended the data structure for clock parameters. To be backwards-compatible, this patch adds a dedicated swap operation for the event PERF_RECORD_TIME_CONV, based on checking if the event contains field "time_cycles", it can support both for the old and new event formats. Fixes: d110162cafc8 ("perf tsc: Support cap_user_time_short for event TIME_CONV") Signed-off-by: Leo Yan Acked-by: Adrian Hunter Cc: Alexander Shishkin Cc: Gustavo A. R. Silva Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steve MacLean Cc: Yonatan Goldschmidt Link: https://lore.kernel.org/r/20210428120915.7123-4-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index a6659b616e6d..cc954b1cfa86 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -949,6 +949,19 @@ static void perf_event__stat_round_swap(union perf_event *event, event->stat_round.time = bswap_64(event->stat_round.time); } +static void perf_event__time_conv_swap(union perf_event *event, + bool sample_id_all __maybe_unused) +{ + event->time_conv.time_shift = bswap_64(event->time_conv.time_shift); + event->time_conv.time_mult = bswap_64(event->time_conv.time_mult); + event->time_conv.time_zero = bswap_64(event->time_conv.time_zero); + + if (event_contains(event->time_conv, time_cycles)) { + event->time_conv.time_cycles = bswap_64(event->time_conv.time_cycles); + event->time_conv.time_mask = bswap_64(event->time_conv.time_mask); + } +} + typedef void (*perf_event__swap_op)(union perf_event *event, bool sample_id_all); @@ -985,7 +998,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_STAT] = perf_event__stat_swap, [PERF_RECORD_STAT_ROUND] = perf_event__stat_round_swap, [PERF_RECORD_EVENT_UPDATE] = perf_event__event_update_swap, - [PERF_RECORD_TIME_CONV] = perf_event__all64_swap, + [PERF_RECORD_TIME_CONV] = perf_event__time_conv_swap, [PERF_RECORD_HEADER_MAX] = NULL, }; -- cgit v1.2.3 From 81e70d7ee4ae13d60800958bca9d3c7675de16c9 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 28 Apr 2021 20:09:15 +0800 Subject: perf session: Dump PERF_RECORD_TIME_CONV event Now perf tool uses the common stub function process_event_op2_stub() for dumping TIME_CONV event, thus it doesn't output the clock parameters contained in the event. This patch adds the callback function for dumping the hardware clock parameters in TIME_CONV event. Before: # perf report -D 0x978 [0x38]: event: 79 . . ... raw event: size 56 bytes . 0000: 4f 00 00 00 00 00 38 00 15 00 00 00 00 00 00 00 O.....8......... . 0010: 00 00 40 01 00 00 00 00 86 89 0b bf df ff ff ff ..@........ . 0020: d1 c1 b2 39 03 00 00 00 ff ff ff ff ff ff ff 00 9..... . 0030: 01 01 00 00 00 00 00 00 ........ 0 0 0x978 [0x38]: PERF_RECORD_TIME_CONV : unhandled! [...] After: # perf report -D 0x978 [0x38]: event: 79 . . ... raw event: size 56 bytes . 0000: 4f 00 00 00 00 00 38 00 15 00 00 00 00 00 00 00 O.....8......... . 0010: 00 00 40 01 00 00 00 00 86 89 0b bf df ff ff ff ..@........ . 0020: d1 c1 b2 39 03 00 00 00 ff ff ff ff ff ff ff 00 9..... . 0030: 01 01 00 00 00 00 00 00 ........ 0 0 0x978 [0x38]: PERF_RECORD_TIME_CONV ... Time Shift 21 ... Time Muliplier 20971520 ... Time Zero 18446743935180835206 ... Time Cycles 13852918225 ... Time Mask 0xffffffffffffff ... Cap Time Zero 1 ... Cap Time Short 1 : unhandled! [...] Signed-off-by: Leo Yan Acked-by: Adrian Hunter Cc: Alexander Shishkin Cc: Gustavo A. R. Silva Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steve MacLean Cc: Yonatan Goldschmidt Link: https://lore.kernel.org/r/20210428120915.7123-5-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 13 ++++++++++++- tools/perf/util/tsc.c | 30 ++++++++++++++++++++++++++++++ tools/perf/util/tsc.h | 4 ++++ 3 files changed, 46 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index cc954b1cfa86..a12cf4f0e97a 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -29,6 +29,7 @@ #include "thread-stack.h" #include "sample-raw.h" #include "stat.h" +#include "tsc.h" #include "ui/progress.h" #include "../perf.h" #include "arch/common.h" @@ -451,6 +452,16 @@ static int process_stat_round_stub(struct perf_session *perf_session __maybe_unu return 0; } +static int process_event_time_conv_stub(struct perf_session *perf_session __maybe_unused, + union perf_event *event) +{ + if (dump_trace) + perf_event__fprintf_time_conv(event, stdout); + + dump_printf(": unhandled!\n"); + return 0; +} + static int perf_session__process_compressed_event_stub(struct perf_session *session __maybe_unused, union perf_event *event __maybe_unused, u64 file_offset __maybe_unused) @@ -532,7 +543,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool) if (tool->stat_round == NULL) tool->stat_round = process_stat_round_stub; if (tool->time_conv == NULL) - tool->time_conv = process_event_op2_stub; + tool->time_conv = process_event_time_conv_stub; if (tool->feature == NULL) tool->feature = process_event_op2_stub; if (tool->compressed == NULL) diff --git a/tools/perf/util/tsc.c b/tools/perf/util/tsc.c index 62b4c75c966c..f19791d46e99 100644 --- a/tools/perf/util/tsc.c +++ b/tools/perf/util/tsc.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include +#include #include #include @@ -110,3 +112,31 @@ u64 __weak rdtsc(void) { return 0; } + +size_t perf_event__fprintf_time_conv(union perf_event *event, FILE *fp) +{ + struct perf_record_time_conv *tc = (struct perf_record_time_conv *)event; + size_t ret; + + ret = fprintf(fp, "\n... Time Shift %" PRI_lu64 "\n", tc->time_shift); + ret += fprintf(fp, "... Time Muliplier %" PRI_lu64 "\n", tc->time_mult); + ret += fprintf(fp, "... Time Zero %" PRI_lu64 "\n", tc->time_zero); + + /* + * The event TIME_CONV was extended for the fields from "time_cycles" + * when supported cap_user_time_short, for backward compatibility, + * prints the extended fields only if they are contained in the event. + */ + if (event_contains(*tc, time_cycles)) { + ret += fprintf(fp, "... Time Cycles %" PRI_lu64 "\n", + tc->time_cycles); + ret += fprintf(fp, "... Time Mask %#" PRI_lx64 "\n", + tc->time_mask); + ret += fprintf(fp, "... Cap Time Zero %" PRId32 "\n", + tc->cap_user_time_zero); + ret += fprintf(fp, "... Cap Time Short %" PRId32 "\n", + tc->cap_user_time_short); + } + + return ret; +} diff --git a/tools/perf/util/tsc.h b/tools/perf/util/tsc.h index 72a15419f3b3..7d83a31732a7 100644 --- a/tools/perf/util/tsc.h +++ b/tools/perf/util/tsc.h @@ -4,6 +4,8 @@ #include +#include "event.h" + struct perf_tsc_conversion { u16 time_shift; u32 time_mult; @@ -24,4 +26,6 @@ u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc); u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc); u64 rdtsc(void); +size_t perf_event__fprintf_time_conv(union perf_event *event, FILE *fp); + #endif // __PERF_TSC_H -- cgit v1.2.3 From fbed59f844912f377b83cc25594c692b5f6ebae2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Apr 2021 15:27:32 -0300 Subject: perf build: Regenerate the FEATURE_DUMP file after extra feature checks Feature detection is done in tools/build/Makefile.feature, we may exit there with some features not detected and then, in tools/perf/Makefile.config try adding extra libraries to link and then do extra feature checks to see if we now find the feature. This is the case with the disassembler-four-args that checks if the diassembler() function in libopcodes (binutils) has a signature with one or with four arguments, as this is not ABI and they changed it at some point. This is not a problem when doing normal builds, for instance: $ make -C tools/perf O=/tmp/build/perf As we don't use what is in FEATURE-DUMP at that point, but is a problem if we pass FEATURE_DUMP=/previously-detected-features as we do in 'make -C tools/perf build-test' to reuse the feature detection in the many build combinations we test there. When that is done feature-disassembler-four-args will be set to 0, but opensuse 15.1 has the four arguments function signature in disassembler(). The build thus fails. Fix it by rewriting the FEATURE-DUMP file at the end of tools/perf/Makefile.config to register features we retested in that make file. Signed-off-by: Jiri Olsa Reported-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 99c73a7b6a26..94e31fa75f42 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -1224,3 +1224,9 @@ $(call detected_var,LIBDIR) $(call detected_var,GTK_CFLAGS) $(call detected_var,PERL_EMBED_CCOPTS) $(call detected_var,PYTHON_EMBED_CCOPTS) + +# re-generate FEATURE-DUMP as we may have called feature_check, found out +# extra libraries to add to LDFLAGS of some other test and then redo those +# tests, see the block about libbfd, disassembler-four-args, for instance. +$(shell rm -f $(FEATURE_DUMP_FILENAME)) +$(foreach feat,$(FEATURE_TESTS),$(shell echo "$(call feature_assign,$(feat))" >> $(FEATURE_DUMP_FILENAME))) -- cgit v1.2.3 From 19177bc3da7e52bc7fb7e603556f98f06e074092 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 26 Apr 2021 17:01:24 -0300 Subject: tools build: Allow deferring printing the results of feature detection By setting FEATURE_DISPLAY_DEFERRED=1 a tool may ask for the printout of the detected features in tools/build/Makefile.feature to be done later adter extra feature checks are done that are tool specific. The perf tool will do it via its tools/perf/Makefile.config, as it performs such extra feature checks. Acked-by: Jiri Olsa Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index b3221a43fa65..04a8e3db8a54 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -240,17 +240,24 @@ ifeq ($(VF),1) feature_verbose := 1 endif -ifeq ($(feature_display),1) - $(info ) - $(info Auto-detecting system features:) - $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),)) - ifneq ($(feature_verbose),1) +feature_display_entries = $(eval $(feature_display_entries_code)) +define feature_display_entries_code + ifeq ($(feature_display),1) $(info ) + $(info Auto-detecting system features:) + $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),)) + ifneq ($(feature_verbose),1) + $(info ) + endif endif -endif -ifeq ($(feature_verbose),1) - TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS)) - $(foreach feat,$(TMP),$(call feature_print_status,$(feat),)) - $(info ) + ifeq ($(feature_verbose),1) + TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS)) + $(foreach feat,$(TMP),$(call feature_print_status,$(feat),)) + $(info ) + endif +endef + +ifeq ($(FEATURE_DISPLAY_DEFERRED),) + $(call feature_display_entries) endif -- cgit v1.2.3 From c6e3bf437184d41d885ba679eab0ddd43f95db56 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 26 Apr 2021 15:47:39 -0300 Subject: perf build: Defer printing detected features to the end of all feature checks We were doing it in tools/build/Makefile.feature, after running the feature checks, but then in tools/perf/Makefile.config we can call more feature checks when we notice that some feature check failed, like when libbfd wasn't detected and we add libraries to the LDFLAGS of its feature check to try again, etc. Acked-by: Jiri Olsa Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 94e31fa75f42..0d6619064a83 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -299,6 +299,9 @@ ifneq ($(TCMALLOC),) endif ifeq ($(FEATURES_DUMP),) +# We will display at the end of this Makefile.config, using $(call feature_display_entries) +# As we may retry some feature detection here, see the disassembler-four-args case, for instance + FEATURE_DISPLAY_DEFERRED := 1 include $(srctree)/tools/build/Makefile.feature else include $(FEATURES_DUMP) @@ -1230,3 +1233,7 @@ $(call detected_var,PYTHON_EMBED_CCOPTS) # tests, see the block about libbfd, disassembler-four-args, for instance. $(shell rm -f $(FEATURE_DUMP_FILENAME)) $(foreach feat,$(FEATURE_TESTS),$(shell echo "$(call feature_assign,$(feat))" >> $(FEATURE_DUMP_FILENAME))) + +ifeq ($(feature_display),1) + $(call feature_display_entries) +endif -- cgit v1.2.3 From f80f88f0e2f2ef9cd805fad1bbf676b0ecd4b55c Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Wed, 28 Apr 2021 17:25:01 +0200 Subject: selftests/bpf: Fix the snprintf test The BPF program for the snprintf selftest runs on all syscall entries. On busy multicore systems this can cause concurrency issues. For example it was observed that sometimes the userspace part of the test reads " 4 0000" instead of " 4 000" (extra '0' at the end) which seems to happen just before snprintf on another core sets end[-1] = '\0'. This patch adds a pid filter to the test to ensure that no bpf_snprintf() will write over the test's output buffers while the userspace reads the values. Fixes: c2e39c6bdc7e ("selftests/bpf: Add a series of tests for bpf_snprintf") Reported-by: Andrii Nakryiko Signed-off-by: Florent Revest Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210428152501.1024509-1-revest@chromium.org --- tools/testing/selftests/bpf/prog_tests/snprintf.c | 2 ++ tools/testing/selftests/bpf/progs/test_snprintf.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf.c b/tools/testing/selftests/bpf/prog_tests/snprintf.c index a958c22aec75..dffbcaa1ec98 100644 --- a/tools/testing/selftests/bpf/prog_tests/snprintf.c +++ b/tools/testing/selftests/bpf/prog_tests/snprintf.c @@ -43,6 +43,8 @@ void test_snprintf_positive(void) if (!ASSERT_OK_PTR(skel, "skel_open")) return; + skel->bss->pid = getpid(); + if (!ASSERT_OK(test_snprintf__attach(skel), "skel_attach")) goto cleanup; diff --git a/tools/testing/selftests/bpf/progs/test_snprintf.c b/tools/testing/selftests/bpf/progs/test_snprintf.c index 951a0301c553..e35129bea0a0 100644 --- a/tools/testing/selftests/bpf/progs/test_snprintf.c +++ b/tools/testing/selftests/bpf/progs/test_snprintf.c @@ -5,6 +5,8 @@ #include #include +__u32 pid = 0; + char num_out[64] = {}; long num_ret = 0; @@ -42,6 +44,9 @@ int handler(const void *ctx) static const char str1[] = "str1"; static const char longstr[] = "longstr"; + if ((int)bpf_get_current_pid_tgid() != pid) + return 0; + /* Integer types */ num_ret = BPF_SNPRINTF(num_out, sizeof(num_out), "%d %u %x %li %llu %lX", -- cgit v1.2.3 From 4bbcc5a41c5449f6a67edb3fbc2dccae9c6724db Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Apr 2021 22:56:33 -0700 Subject: kselftests: cgroup: update kmem test for new vmstat implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With memcg having switched to rstat, memory.stat output is precise. Update the cgroup selftest to reflect the expectations and error tolerances of the new implementation. Also add newly tracked types of memory to the memory.stat side of the equation, since they're included in memory.current and could throw false positives. Link: https://lkml.kernel.org/r/20210209163304.77088-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Michal Koutný Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/cgroup/test_kmem.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 0941aa16157e..22b31ebb3513 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -19,12 +19,12 @@ /* - * Memory cgroup charging and vmstat data aggregation is performed using - * percpu batches 32 pages big (look at MEMCG_CHARGE_BATCH). So the maximum - * discrepancy between charge and vmstat entries is number of cpus multiplied - * by 32 pages multiplied by 2. + * Memory cgroup charging is performed using percpu batches 32 pages + * big (look at MEMCG_CHARGE_BATCH), whereas memory.stat is exact. So + * the maximum discrepancy between charge and vmstat entries is number + * of cpus multiplied by 32 pages. */ -#define MAX_VMSTAT_ERROR (4096 * 32 * 2 * get_nprocs()) +#define MAX_VMSTAT_ERROR (4096 * 32 * get_nprocs()) static int alloc_dcache(const char *cgroup, void *arg) @@ -162,7 +162,7 @@ static int cg_run_in_subcgroups(const char *parent, */ static int test_kmem_memcg_deletion(const char *root) { - long current, slab, anon, file, kernel_stack, sum; + long current, slab, anon, file, kernel_stack, pagetables, percpu, sock, sum; int ret = KSFT_FAIL; char *parent; @@ -184,11 +184,14 @@ static int test_kmem_memcg_deletion(const char *root) anon = cg_read_key_long(parent, "memory.stat", "anon "); file = cg_read_key_long(parent, "memory.stat", "file "); kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack "); + pagetables = cg_read_key_long(parent, "memory.stat", "pagetables "); + percpu = cg_read_key_long(parent, "memory.stat", "percpu "); + sock = cg_read_key_long(parent, "memory.stat", "sock "); if (current < 0 || slab < 0 || anon < 0 || file < 0 || - kernel_stack < 0) + kernel_stack < 0 || pagetables < 0 || percpu < 0 || sock < 0) goto cleanup; - sum = slab + anon + file + kernel_stack; + sum = slab + anon + file + kernel_stack + pagetables + percpu + sock; if (abs(sum - current) < MAX_VMSTAT_ERROR) { ret = KSFT_PASS; } else { @@ -198,6 +201,9 @@ static int test_kmem_memcg_deletion(const char *root) printf("anon = %ld\n", anon); printf("file = %ld\n", file); printf("kernel_stack = %ld\n", kernel_stack); + printf("pagetables = %ld\n", pagetables); + printf("percpu = %ld\n", percpu); + printf("sock = %ld\n", sock); } cleanup: -- cgit v1.2.3 From 8593100444e93861fb5c867bf8cc104543259714 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Thu, 29 Apr 2021 22:57:52 -0700 Subject: selftests: add a MREMAP_DONTUNMAP selftest for shmem This test extends the current mremap tests to validate that the MREMAP_DONTUNMAP operation can be performed on shmem mappings. Link: https://lkml.kernel.org/r/20210323182520.2712101-3-bgeffon@google.com Signed-off-by: Brian Geffon Cc: Axel Rasmussen Cc: Lokesh Gidra Cc: Mike Rapoport Cc: Peter Xu Cc: Hugh Dickins Cc: "Michael S . Tsirkin" Cc: Brian Geffon Cc: Andy Lutomirski Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Sonny Rao Cc: Minchan Kim Cc: "Kirill A . Shutemov" Cc: Dmitry Safonov Cc: Michael Kerrisk Cc: Alejandro Colomar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/mremap_dontunmap.c | 52 +++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c index 3a7b5ef0b0c6..f01dc4a85b0b 100644 --- a/tools/testing/selftests/vm/mremap_dontunmap.c +++ b/tools/testing/selftests/vm/mremap_dontunmap.c @@ -127,6 +127,57 @@ static void mremap_dontunmap_simple() "unable to unmap source mapping"); } +// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected. +static void mremap_dontunmap_simple_shmem() +{ + unsigned long num_pages = 5; + + int mem_fd = memfd_create("memfd", MFD_CLOEXEC); + BUG_ON(mem_fd < 0, "memfd_create"); + + BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0, + "ftruncate"); + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, mem_fd, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + BUG_ON(close(mem_fd) < 0, "close"); + + memset(source_mapping, 'a', num_pages * page_size); + + // Try to just move the whole mapping anywhere (not fixed). + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + if (dest_mapping == MAP_FAILED && errno == EINVAL) { + // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem. + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); + return; + } + + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // Validate that the pages have been moved, we know they were moved if + // the dest_mapping contains a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + + // Because the region is backed by shmem, we will actually see the same + // memory at the source location still. + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 'a') != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + // This test validates MREMAP_DONTUNMAP will move page tables to a specific // destination using MREMAP_FIXED, also while validating that the source // remains intact. @@ -300,6 +351,7 @@ int main(void) BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page."); mremap_dontunmap_simple(); + mremap_dontunmap_simple_shmem(); mremap_dontunmap_simple_fixed(); mremap_dontunmap_partial_mapping(); mremap_dontunmap_partial_mapping_overwrite(); -- cgit v1.2.3 From 7bc4ca3ea956669b4e14ee03108c6623a136edfa Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 29 Apr 2021 22:59:16 -0700 Subject: vm/test_vmalloc.sh: adapt for updated driver interface A 'single_cpu_test' parameter is odd and it does not exist anymore. Instead there was introduced a 'nr_threads' one. If it is not set it behaves as the former parameter. That is why update a "stress mode" according to this change specifying number of workers which are equal to number of CPUs. Also update an output of help message based on a new interface. Link: https://lkml.kernel.org/r/20210402202237.20334-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Shuah Khan Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/test_vmalloc.sh | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh index 06d2bb109f06..d73b846736f1 100755 --- a/tools/testing/selftests/vm/test_vmalloc.sh +++ b/tools/testing/selftests/vm/test_vmalloc.sh @@ -11,6 +11,7 @@ TEST_NAME="vmalloc" DRIVER="test_${TEST_NAME}" +NUM_CPUS=`grep -c ^processor /proc/cpuinfo` # 1 if fails exitcode=1 @@ -22,9 +23,9 @@ ksft_skip=4 # Static templates for performance, stressing and smoke tests. # Also it is possible to pass any supported parameters manualy. # -PERF_PARAM="single_cpu_test=1 sequential_test_order=1 test_repeat_count=3" -SMOKE_PARAM="single_cpu_test=1 test_loop_count=10000 test_repeat_count=10" -STRESS_PARAM="test_repeat_count=20" +PERF_PARAM="sequential_test_order=1 test_repeat_count=3" +SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10" +STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20" check_test_requirements() { @@ -58,8 +59,8 @@ run_perfformance_check() run_stability_check() { - echo "Run stability tests. In order to stress vmalloc subsystem we run" - echo "all available test cases on all available CPUs simultaneously." + echo "Run stability tests. In order to stress vmalloc subsystem all" + echo "available test cases are run by NUM_CPUS workers simultaneously." echo "It will take time, so be patient." modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 @@ -92,17 +93,17 @@ usage() echo "# Shows help message" echo "./${DRIVER}.sh" echo - echo "# Runs 1 test(id_1), repeats it 5 times on all online CPUs" - echo "./${DRIVER}.sh run_test_mask=1 test_repeat_count=5" + echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers" + echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5" echo echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with " echo "sequential order" - echo -n "./${DRIVER}.sh single_cpu_test=1 sequential_test_order=1 " + echo -n "./${DRIVER}.sh sequential_test_order=1 " echo "run_test_mask=23" echo - echo -n "# Runs all tests on all online CPUs, shuffled order, repeats " + echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats " echo "20 times" - echo "./${DRIVER}.sh test_repeat_count=20" + echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20" echo echo "# Performance analysis" echo "./${DRIVER}.sh performance" -- cgit v1.2.3 From 77a88274dc1a2cf3a775161d9a3242bc798ee680 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 30 Apr 2021 10:56:27 +0900 Subject: kbuild: replace LANG=C with LC_ALL=C LANG gives a weak default to each LC_* in case it is not explicitly defined. LC_ALL, if set, overrides all other LC_* variables. LANG < LC_CTYPE, LC_COLLATE, LC_MONETARY, LC_NUMERIC, ... < LC_ALL This is why documentation such as [1] suggests to set LC_ALL in build scripts to get the deterministic result. LANG=C is not strong enough to override LC_* that may be set by end users. [1]: https://reproducible-builds.org/docs/locales/ Signed-off-by: Masahiro Yamada Acked-by: Michael Ellerman (powerpc) Reviewed-by: Matthias Maennich Acked-by: Matthieu Baerts (mptcp) Reviewed-by: Greg Kroah-Hartman --- arch/powerpc/boot/wrapper | 2 +- scripts/nsdeps | 2 +- scripts/recordmcount.pl | 2 +- scripts/setlocalversion | 2 +- scripts/tags.sh | 2 +- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 2 +- usr/gen_initramfs.sh | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 41fa0a8715e3..cdb796b76e2e 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -191,7 +191,7 @@ if [ -z "$kernel" ]; then kernel=vmlinux fi -LANG=C elfformat="`${CROSS}objdump -p "$kernel" | grep 'file format' | awk '{print $4}'`" +LC_ALL=C elfformat="`${CROSS}objdump -p "$kernel" | grep 'file format' | awk '{print $4}'`" case "$elfformat" in elf64-powerpcle) format=elf64lppc ;; elf64-powerpc) format=elf32ppc ;; diff --git a/scripts/nsdeps b/scripts/nsdeps index e8ce2a4d704a..04c4b96e95ec 100644 --- a/scripts/nsdeps +++ b/scripts/nsdeps @@ -44,7 +44,7 @@ generate_deps() { for source_file in $mod_source_files; do sed '/MODULE_IMPORT_NS/Q' $source_file > ${source_file}.tmp offset=$(wc -l ${source_file}.tmp | awk '{print $1;}') - cat $source_file | grep MODULE_IMPORT_NS | LANG=C sort -u >> ${source_file}.tmp + cat $source_file | grep MODULE_IMPORT_NS | LC_ALL=C sort -u >> ${source_file}.tmp tail -n +$((offset +1)) ${source_file} | grep -v MODULE_IMPORT_NS >> ${source_file}.tmp if ! diff -q ${source_file} ${source_file}.tmp; then mv ${source_file}.tmp ${source_file} diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 867860ea57da..0a7fc9507d6f 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -497,7 +497,7 @@ sub update_funcs # # Step 2: find the sections and mcount call sites # -open(IN, "LANG=C $objdump -hdr $inputfile|") || die "error running $objdump"; +open(IN, "LC_ALL=C $objdump -hdr $inputfile|") || die "error running $objdump"; my $text; diff --git a/scripts/setlocalversion b/scripts/setlocalversion index bb709eda96cd..db941f6d9591 100755 --- a/scripts/setlocalversion +++ b/scripts/setlocalversion @@ -126,7 +126,7 @@ scm_version() fi # Check for svn and a svn repo. - if rev=$(LANG= LC_ALL= LC_MESSAGES=C svn info 2>/dev/null | grep '^Last Changed Rev'); then + if rev=$(LC_ALL=C svn info 2>/dev/null | grep '^Last Changed Rev'); then rev=$(echo $rev | awk '{print $NF}') printf -- '-svn%s' "$rev" diff --git a/scripts/tags.sh b/scripts/tags.sh index fd96734deff1..db8ba411860a 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -326,5 +326,5 @@ esac # Remove structure forward declarations. if [ -n "$remove_structs" ]; then - LANG=C sed -i -e '/^\([a-zA-Z_][a-zA-Z0-9_]*\)\t.*\t\/\^struct \1;.*\$\/;"\tx$/d' $1 + LC_ALL=C sed -i -e '/^\([a-zA-Z_][a-zA-Z0-9_]*\)\t.*\t\/\^struct \1;.*\$\/;"\tx$/d' $1 fi diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 9236609731b1..3c4cb72ed8a4 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -274,7 +274,7 @@ check_mptcp_disabled() ip netns exec ${disabled_ns} sysctl -q net.mptcp.enabled=0 local err=0 - LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \ + LC_ALL=C ip netns exec ${disabled_ns} ./mptcp_connect -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \ grep -q "^socket: Protocol not available$" && err=1 ip netns delete ${disabled_ns} diff --git a/usr/gen_initramfs.sh b/usr/gen_initramfs.sh index 8ae831657e5d..63476bb70b41 100755 --- a/usr/gen_initramfs.sh +++ b/usr/gen_initramfs.sh @@ -147,7 +147,7 @@ dir_filelist() { header "$1" srcdir=$(echo "$1" | sed -e 's://*:/:g') - dirlist=$(find "${srcdir}" -printf "%p %m %U %G\n" | LANG=C sort) + dirlist=$(find "${srcdir}" -printf "%p %m %U %G\n" | LC_ALL=C sort) # If $dirlist is only one line, then the directory is empty if [ "$(echo "${dirlist}" | wc -l)" -gt 1 ]; then -- cgit v1.2.3 From 2a30f9440640c418bcfbea9b2b344d268b58e0a2 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 29 Apr 2021 13:05:10 +0000 Subject: libbpf: Fix signed overflow in ringbuf_process_ring One of our benchmarks running in (Google-internal) CI pushes data through the ringbuf faster htan than userspace is able to consume it. In this case it seems we're actually able to get >INT_MAX entries in a single ring_buffer__consume() call. ASAN detected that cnt overflows in this case. Fix by using 64-bit counter internally and then capping the result to INT_MAX before converting to the int return type. Do the same for the ring_buffer__poll(). Fixes: bf99c936f947 (libbpf: Add BPF ring buffer support) Signed-off-by: Brendan Jackman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210429130510.1621665-1-jackmanb@google.com --- tools/lib/bpf/ringbuf.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index e7a8d847161f..1d80ad4e0de8 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -202,9 +202,11 @@ static inline int roundup_len(__u32 len) return (len + 7) / 8 * 8; } -static int ringbuf_process_ring(struct ring* r) +static int64_t ringbuf_process_ring(struct ring* r) { - int *len_ptr, len, err, cnt = 0; + int *len_ptr, len, err; + /* 64-bit to avoid overflow in case of extreme application behavior */ + int64_t cnt = 0; unsigned long cons_pos, prod_pos; bool got_new_data; void *sample; @@ -244,12 +246,14 @@ done: } /* Consume available ring buffer(s) data without event polling. - * Returns number of records consumed across all registered ring buffers, or - * negative number if any of the callbacks return error. + * Returns number of records consumed across all registered ring buffers (or + * INT_MAX, whichever is less), or negative number if any of the callbacks + * return error. */ int ring_buffer__consume(struct ring_buffer *rb) { - int i, err, res = 0; + int64_t err, res = 0; + int i; for (i = 0; i < rb->ring_cnt; i++) { struct ring *ring = &rb->rings[i]; @@ -259,18 +263,24 @@ int ring_buffer__consume(struct ring_buffer *rb) return err; res += err; } + if (res > INT_MAX) + return INT_MAX; return res; } /* Poll for available data and consume records, if any are available. - * Returns number of records consumed, or negative number, if any of the - * registered callbacks returned error. + * Returns number of records consumed (or INT_MAX, whichever is less), or + * negative number, if any of the registered callbacks returned error. */ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) { - int i, cnt, err, res = 0; + int i, cnt; + int64_t err, res = 0; cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms); + if (cnt < 0) + return -errno; + for (i = 0; i < cnt; i++) { __u32 ring_id = rb->events[i].data.fd; struct ring *ring = &rb->rings[ring_id]; @@ -280,7 +290,9 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) return err; res += err; } - return cnt < 0 ? -errno : res; + if (res > INT_MAX) + return INT_MAX; + return res; } /* Get an fd that can be used to sleep until data is available in the ring(s) */ -- cgit v1.2.3 From da2e56634b262fddfa40b2cfedd24de841418cd3 Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:26 -0700 Subject: ktest: Minor cleanup with uninitialized variable $build_options Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/ktest.pl | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index 4e2450964517..18fd4fd117dd 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -2594,6 +2594,9 @@ sub build { # Run old config regardless, to enforce min configurations make_oldconfig; + if (not defined($build_options)){ + $build_options = ""; + } my $build_ret = run_command "$make $build_options", $buildlog; if (defined($post_build)) { -- cgit v1.2.3 From 2676eb4bfc546dc490d2abd155877a580c74c294 Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:27 -0700 Subject: ktest: Add example config for using VMware VMs This duplicates the KVM/Qemu config with specific notes for how to use it with VMware VMs on Workstation, Player, or Fusion. The main thing to be aware of is how the serial port is exposed which is a unix pipe, and will need something like ncat to get into ktest's monitoring Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/examples/vmware.conf | 137 +++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 tools/testing/ktest/examples/vmware.conf (limited to 'tools') diff --git a/tools/testing/ktest/examples/vmware.conf b/tools/testing/ktest/examples/vmware.conf new file mode 100644 index 000000000000..61958163d242 --- /dev/null +++ b/tools/testing/ktest/examples/vmware.conf @@ -0,0 +1,137 @@ +# +# This config is an example usage of ktest.pl with a vmware guest +# +# VMware Setup: +# ------------- +# - Edit the Virtual Machine ("Edit virtual machine settings") +# - Add a Serial Port +# - You almost certainly want it set "Connect at power on" +# - Select "Use socket (named pipe)" +# - Select a name that you'll recognize, like 'ktestserialpipe' +# - From: Server +# - To: A Virtual Machine +# - Save +# - Make sure you note the name, it will be in the base directory of the +# virtual machine (where the "disks" are stored. The default +# is /var/lib/vmware// +# +# - Make note of the path to the VM +# +# +# The guest is called 'Guest' and this would be something that +# could be run on the host to test a virtual machine target. + +MACHINE = Guest + +# Name of the serial pipe you set in the VMware settings +VMWARE_SERIAL_NAME = + +# Define a variable of the name of the VM +# Noting this needs to be the name of the kmx file, and usually, the +# name of the directory that it's in. If the directory and name +# differ change the VMWARE_VM_DIR accordingly. +# Please ommit the .kmx extension +VMWARE_VM_NAME = + +# VM dir name. This is usually the same as the virtual machine's name, +# but not always the case. Change if they differ +VMWARE_VM_DIR = ${VMWARE_VM_NAME} + +# Base directory that the Virtual machine is contained in +# /var/lib/vmware is the default on Linux +VMWARE_VM_BASE_DIR = /var/lib/vmware/${VMWARE_VM_DIR} + +# Use ncat to read the unix pipe. Anything that can read the Unix Pipe +# and output it's contents to stdout will work +CONSOLE = /usr/bin/ncat -U ${VMWARE_VM_BASE_DIR}/${VMWARE_SERIAL_NAME} + +# Define what version of Workstation you are using +# This is used by vmrun to use the appropriate appripriate pieces to +# test this. In all likelihood you want 'ws' or 'player' +# Valid options: +# ws - Workstation (Windows or Linux host) +# fusion - Fusion (Mac host) +# player - Using VMware Player (Windows or Linux host) +# Note: vmrun has to run directly on the host machine +VMWARE_HOST_TYPE = ws + +# VMware provides `vmrun` to allow you to do certain things to the virtual machine +# This should hard reset the VM and force a boot +VMWARE_POWER_CYCLE = /usr/bin/vmrun -T ${VMWARE_HOST_TYPE} reset ${VMWARE_VM_BASE_DIR}/${VMWARE_VM_NAME}.kmx nogui + +#*************************************# +# This part is the same as test.conf # +#*************************************# + +# The include files will set up the type of test to run. Just set TEST to +# which test you want to run. +# +# TESTS = patchcheck, randconfig, boot, test, config-bisect, bisect, min-config +# +# See the include/*.conf files that define these tests +# +TEST := patchcheck + +# Some tests may have more than one test to run. Define MULTI := 1 to run +# the extra tests. +MULTI := 0 + +# In case you want to differentiate which type of system you are testing +BITS := 64 + +# REBOOT = none, error, fail, empty +# See include/defaults.conf +REBOOT := empty + + +# The defaults file will set up various settings that can be used by all +# machine configs. +INCLUDE include/defaults.conf + + +#*************************************# +# Now we are different from test.conf # +#*************************************# + + +# The example here assumes that Guest is running a Fedora release +# that uses dracut for its initfs. The POST_INSTALL will be executed +# after the install of the kernel and modules are complete. +# +POST_INSTALL = ${SSH} /sbin/dracut -f /boot/initramfs-test.img $KERNEL_VERSION + +# Guests sometimes get stuck on reboot. We wait 3 seconds after running +# the reboot command and then do a full power-cycle of the guest. +# This forces the guest to restart. +# +POWERCYCLE_AFTER_REBOOT = 3 + +# We do the same after the halt command, but this time we wait 20 seconds. +POWEROFF_AFTER_HALT = 20 + + +# As the defaults.conf file has a POWER_CYCLE option already defined, +# and options can not be defined in the same section more than once +# (all DEFAULTS sections are considered the same). We use the +# DEFAULTS OVERRIDE to tell ktest.pl to ignore the previous defined +# options, for the options set in the OVERRIDE section. +# +DEFAULTS OVERRIDE + +# Instead of using the default POWER_CYCLE option defined in +# defaults.conf, we use virsh to cycle it. To do so, we destroy +# the guest, wait 5 seconds, and then start it up again. +# Crude, but effective. +# +POWER_CYCLE = ${VMWARE_POWER_CYCLE} + + +DEFAULTS + +# The following files each handle a different test case. +# Having them included allows you to set up more than one machine and share +# the same tests. +INCLUDE include/patchcheck.conf +INCLUDE include/tests.conf +INCLUDE include/bisect.conf +INCLUDE include/min-config.conf -- cgit v1.2.3 From becdd17b5acc79267cf4dba65e07e96e11cc9b57 Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:28 -0700 Subject: ktest: Adding editor hints to improve consistency Emacs and Vi(m) have different styles of dealing with perl syntax which can lead to slightly inconsistent indentation, and makes the code slightly harder to read. Emacs assumes a more perl recommended standard of 4 spaces (1 column) or tab (two column) indentation. Vi(m) tends to favor just normal spaces or tabs depending on what was being used. This gives the basic hinting to Emacs and Vim to do what is expected to be basically consistent. Emacs: - Explicitly flip into perl mode, cperl would require more adjustments Vi(m): - Set softtabs=4 which will flip it over to doing indentation the way you would expect from Emacs Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/ktest.pl | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index 18fd4fd117dd..14a753b86445 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -4520,3 +4520,12 @@ if (defined($opt{"LOG_FILE"})) { } exit 0; + +## +# The following are here to standardize tabs/spaces/etc across the most likely editors +### + +# Local Variables: +# mode: perl +# End: +# vim: softtabstop=4 -- cgit v1.2.3 From 12d4cddda2043466a5af8fc0c49e49f24f1d4c59 Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:29 -0700 Subject: ktest: Fixing indentation to match expected pattern This is a followup to "ktest: Adding editor hints to improve consistency" to actually adjust the existing indentation to match the, now, expected pattern (first column 4 spaces, 2nd tab, 3rd tab + 4 spaces, etc). This should, at least help, keep things consistent going forward now. Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/ktest.pl | 186 +++++++++++++++++++++---------------------- 1 file changed, 92 insertions(+), 94 deletions(-) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index 14a753b86445..633c715173ff 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -760,7 +760,7 @@ sub process_variables { # remove the space added in the beginning $retval =~ s/ //; - return "$retval" + return "$retval"; } sub set_value { @@ -1095,7 +1095,7 @@ sub __read_config { } } } - + if ( ! -r $file ) { die "$name: $.: Can't read file $file\n$_"; } @@ -1186,13 +1186,13 @@ sub __read_config { } sub get_test_case { - print "What test case would you like to run?\n"; - print " (build, install or boot)\n"; - print " Other tests are available but require editing ktest.conf\n"; - print " (see tools/testing/ktest/sample.conf)\n"; - my $ans = ; - chomp $ans; - $default{"TEST_TYPE"} = $ans; + print "What test case would you like to run?\n"; + print " (build, install or boot)\n"; + print " Other tests are available but require editing ktest.conf\n"; + print " (see tools/testing/ktest/sample.conf)\n"; + my $ans = ; + chomp $ans; + $default{"TEST_TYPE"} = $ans; } sub read_config { @@ -1519,13 +1519,13 @@ sub dodie { close O; close L; } - send_email("KTEST: critical failure for test $i [$name]", - "Your test started at $script_start_time has failed with:\n@_\n", $log_file); + send_email("KTEST: critical failure for test $i [$name]", + "Your test started at $script_start_time has failed with:\n@_\n", $log_file); } if ($monitor_cnt) { - # restore terminal settings - system("stty $stty_orig"); + # restore terminal settings + system("stty $stty_orig"); } if (defined($post_test)) { @@ -1709,81 +1709,81 @@ sub wait_for_monitor { } sub save_logs { - my ($result, $basedir) = @_; - my @t = localtime; - my $date = sprintf "%04d%02d%02d%02d%02d%02d", - 1900+$t[5],$t[4],$t[3],$t[2],$t[1],$t[0]; + my ($result, $basedir) = @_; + my @t = localtime; + my $date = sprintf "%04d%02d%02d%02d%02d%02d", + 1900+$t[5],$t[4],$t[3],$t[2],$t[1],$t[0]; - my $type = $build_type; - if ($type =~ /useconfig/) { - $type = "useconfig"; - } + my $type = $build_type; + if ($type =~ /useconfig/) { + $type = "useconfig"; + } - my $dir = "$machine-$test_type-$type-$result-$date"; + my $dir = "$machine-$test_type-$type-$result-$date"; - $dir = "$basedir/$dir"; + $dir = "$basedir/$dir"; - if (!-d $dir) { - mkpath($dir) or - dodie "can't create $dir"; - } + if (!-d $dir) { + mkpath($dir) or + dodie "can't create $dir"; + } - my %files = ( - "config" => $output_config, - "buildlog" => $buildlog, - "dmesg" => $dmesg, - "testlog" => $testlog, - ); + my %files = ( + "config" => $output_config, + "buildlog" => $buildlog, + "dmesg" => $dmesg, + "testlog" => $testlog, + ); - while (my ($name, $source) = each(%files)) { - if (-f "$source") { - cp "$source", "$dir/$name" or - dodie "failed to copy $source"; - } + while (my ($name, $source) = each(%files)) { + if (-f "$source") { + cp "$source", "$dir/$name" or + dodie "failed to copy $source"; } + } - doprint "*** Saved info to $dir ***\n"; + doprint "*** Saved info to $dir ***\n"; } sub fail { - if ($die_on_failure) { - dodie @_; - } + if ($die_on_failure) { + dodie @_; + } - doprint "FAILED\n"; + doprint "FAILED\n"; - my $i = $iteration; + my $i = $iteration; - # no need to reboot for just building. - if (!do_not_reboot) { - doprint "REBOOTING\n"; - reboot_to_good $sleep_time; - } + # no need to reboot for just building. + if (!do_not_reboot) { + doprint "REBOOTING\n"; + reboot_to_good $sleep_time; + } - my $name = ""; + my $name = ""; - if (defined($test_name)) { - $name = " ($test_name)"; - } + if (defined($test_name)) { + $name = " ($test_name)"; + } - print_times; + print_times; - doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; - doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; - doprint "KTEST RESULT: TEST $i$name Failed: ", @_, "\n"; - doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; - doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; + doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; + doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; + doprint "KTEST RESULT: TEST $i$name Failed: ", @_, "\n"; + doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; + doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; - if (defined($store_failures)) { - save_logs "fail", $store_failures; - } + if (defined($store_failures)) { + save_logs "fail", $store_failures; + } - if (defined($post_test)) { - run_command $post_test; - } + if (defined($post_test)) { + run_command $post_test; + } - return 1; + return 1; } sub run_command { @@ -1969,9 +1969,9 @@ sub get_grub_index { $target = '^menuentry.*' . $grub_menu_qt; $skip = '^menuentry\s|^submenu\s'; } elsif ($reboot_type eq "grub2bls") { - $command = $grub_bls_get; - $target = '^title=.*' . $grub_menu_qt; - $skip = '^title='; + $command = $grub_bls_get; + $target = '^title=.*' . $grub_menu_qt; + $skip = '^title='; } else { return; } @@ -2394,7 +2394,7 @@ sub check_buildlog { while () { if (/$check_build_re/) { my $warning = process_warning_line $_; - + $warnings_list{$warning} = 1; } } @@ -2659,7 +2659,7 @@ sub success { doprint "*******************************************\n"; if (defined($store_successes)) { - save_logs "success", $store_successes; + save_logs "success", $store_successes; } if ($i != $opt{"NUM_TESTS"} && !do_not_reboot) { @@ -3246,13 +3246,13 @@ sub run_config_bisect { $ret = run_config_bisect_test $config_bisect_type; if ($ret) { - doprint "NEW GOOD CONFIG ($pass)\n"; + doprint "NEW GOOD CONFIG ($pass)\n"; system("cp $output_config $tmpdir/good_config.tmp.$pass"); $pass++; # Return 3 for good config return 3; } else { - doprint "NEW BAD CONFIG ($pass)\n"; + doprint "NEW BAD CONFIG ($pass)\n"; system("cp $output_config $tmpdir/bad_config.tmp.$pass"); $pass++; # Return 4 for bad config @@ -3371,7 +3371,7 @@ sub config_bisect { } while ($ret == 3 || $ret == 4); if ($ret == 2) { - config_bisect_end "$good_config.tmp", "$bad_config.tmp"; + config_bisect_end "$good_config.tmp", "$bad_config.tmp"; } return $ret if ($ret < 0); @@ -3551,7 +3551,6 @@ sub read_kconfig { my $cont = 0; my $line; - if (! -f $kconfig) { doprint "file $kconfig does not exist, skipping\n"; return; @@ -3660,7 +3659,7 @@ sub read_depends { if (! -f $kconfig && $arch =~ /\d$/) { my $orig = $arch; - # some subarchs have numbers, truncate them + # some subarchs have numbers, truncate them $arch =~ s/\d*$//; $kconfig = "$builddir/arch/$arch/Kconfig"; if (! -f $kconfig) { @@ -3855,7 +3854,7 @@ sub make_min_config { foreach my $config (@config_keys) { my $kconfig = chomp_config $config; if (!defined $depcount{$kconfig}) { - $depcount{$kconfig} = 0; + $depcount{$kconfig} = 0; } } @@ -3957,13 +3956,13 @@ sub make_min_config { my $failed = 0; build "oldconfig" or $failed = 1; if (!$failed) { - start_monitor_and_install or $failed = 1; + start_monitor_and_install or $failed = 1; - if ($type eq "test" && !$failed) { - do_run_test or $failed = 1; - } + if ($type eq "test" && !$failed) { + do_run_test or $failed = 1; + } - end_monitor; + end_monitor; } $in_bisect = 0; @@ -4277,8 +4276,8 @@ sub send_email { sub cancel_test { if ($email_when_canceled) { my $name = get_test_name; - send_email("KTEST: Your [$name] test was cancelled", - "Your test started at $script_start_time was cancelled: sig int"); + send_email("KTEST: Your [$name] test was cancelled", + "Your test started at $script_start_time was cancelled: sig int"); } die "\nCaught Sig Int, test interrupted: $!\n" } @@ -4326,15 +4325,15 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) { # The first test may override the PRE_KTEST option if ($i == 1) { - if (defined($pre_ktest)) { - doprint "\n"; - run_command $pre_ktest; - } - if ($email_when_started) { + if (defined($pre_ktest)) { + doprint "\n"; + run_command $pre_ktest; + } + if ($email_when_started) { my $name = get_test_name; - send_email("KTEST: Your [$name] test was started", - "Your test was started on $script_start_time"); - } + send_email("KTEST: Your [$name] test was started", + "Your test was started on $script_start_time"); + } } # Any test can override the POST_KTEST option @@ -4506,12 +4505,11 @@ if ($opt{"POWEROFF_ON_SUCCESS"}) { run_command $switch_to_good; } - doprint "\n $successes of $opt{NUM_TESTS} tests were successful\n\n"; if ($email_when_finished) { send_email("KTEST: Your test has finished!", - "$successes of $opt{NUM_TESTS} tests started at $script_start_time were successful!"); + "$successes of $opt{NUM_TESTS} tests started at $script_start_time were successful!"); } if (defined($opt{"LOG_FILE"})) { -- cgit v1.2.3 From c043ccbfc6d83fa21512f842c5d2ba4060cee5fe Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:30 -0700 Subject: ktest: Further consistency cleanups This cleans up some additional whitespace pieces that to be more consistent, as well as moving a curly brace around, and some 'or' statements to match the rest of the file (usually or goes at the end of the line vs. at the beginning) Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/ktest.pl | 85 +++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 49 deletions(-) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index 633c715173ff..a3c6ad64c479 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -24,7 +24,7 @@ my %evals; #default opts my %default = ( - "MAILER" => "sendmail", # default mailer + "MAILER" => "sendmail", # default mailer "EMAIL_ON_ERROR" => 1, "EMAIL_WHEN_FINISHED" => 1, "EMAIL_WHEN_CANCELED" => 0, @@ -36,15 +36,15 @@ my %default = ( "CLOSE_CONSOLE_SIGNAL" => "INT", "TIMEOUT" => 120, "TMP_DIR" => "/tmp/ktest/\${MACHINE}", - "SLEEP_TIME" => 60, # sleep time between tests + "SLEEP_TIME" => 60, # sleep time between tests "BUILD_NOCLEAN" => 0, "REBOOT_ON_ERROR" => 0, "POWEROFF_ON_ERROR" => 0, "REBOOT_ON_SUCCESS" => 1, "POWEROFF_ON_SUCCESS" => 0, "BUILD_OPTIONS" => "", - "BISECT_SLEEP_TIME" => 60, # sleep time between bisects - "PATCHCHECK_SLEEP_TIME" => 60, # sleep time between patch checks + "BISECT_SLEEP_TIME" => 60, # sleep time between bisects + "PATCHCHECK_SLEEP_TIME" => 60, # sleep time between patch checks "CLEAR_LOG" => 0, "BISECT_MANUAL" => 0, "BISECT_SKIP" => 1, @@ -537,7 +537,7 @@ sub read_prompt { my $ans; for (;;) { - if ($cancel) { + if ($cancel) { print "$prompt [y/n/C] "; } else { print "$prompt [Y/n] "; @@ -863,7 +863,6 @@ sub value_defined { defined($opt{$2}); } -my $d = 0; sub process_expression { my ($name, $val) = @_; @@ -978,7 +977,6 @@ sub __read_config { $override = 0; if ($type eq "TEST_START") { - if ($num_tests_set) { die "$name: $.: Can not specify both NUM_TESTS and TEST_START\n"; } @@ -1048,7 +1046,6 @@ sub __read_config { $test_num = $old_test_num; $repeat = $old_repeat; } - } elsif (/^\s*ELSE\b(.*)$/) { if (!$if) { die "$name: $.: ELSE found with out matching IF section\n$_"; @@ -1471,7 +1468,6 @@ sub get_test_name() { } sub dodie { - # avoid recursion return if ($in_die); $in_die = 1; @@ -1481,10 +1477,8 @@ sub dodie { doprint "CRITICAL FAILURE... [TEST $i] ", @_, "\n"; if ($reboot_on_error && !do_not_reboot) { - doprint "REBOOTING\n"; reboot_to_good; - } elsif ($poweroff_on_error && defined($power_off)) { doprint "POWERING OFF\n"; `$power_off`; @@ -1519,8 +1513,9 @@ sub dodie { close O; close L; } + send_email("KTEST: critical failure for test $i [$name]", - "Your test started at $script_start_time has failed with:\n@_\n", $log_file); + "Your test started at $script_start_time has failed with:\n@_\n", $log_file); } if ($monitor_cnt) { @@ -1915,8 +1910,8 @@ sub _get_grub_index { my ($command, $target, $skip) = @_; return if (defined($grub_number) && defined($last_grub_menu) && - $last_grub_menu eq $grub_menu && defined($last_machine) && - $last_machine eq $machine); + $last_grub_menu eq $grub_menu && defined($last_machine) && + $last_machine eq $machine); doprint "Find $reboot_type menu ... "; $grub_number = -1; @@ -1924,8 +1919,8 @@ sub _get_grub_index { my $ssh_grub = $ssh_exec; $ssh_grub =~ s,\$SSH_COMMAND,$command,g; - open(IN, "$ssh_grub |") - or dodie "unable to execute $command"; + open(IN, "$ssh_grub |") or + dodie "unable to execute $command"; my $found = 0; @@ -1979,8 +1974,7 @@ sub get_grub_index { _get_grub_index($command, $target, $skip); } -sub wait_for_input -{ +sub wait_for_input { my ($fp, $time) = @_; my $start_time; my $rin; @@ -2096,7 +2090,6 @@ sub monitor { my $version_found = 0; while (!$done) { - if ($bug && defined($stop_after_failure) && $stop_after_failure >= 0) { my $time = $stop_after_failure - (time - $failure_start); @@ -2571,7 +2564,6 @@ sub build { run_command "mv $outputdir/config_temp $output_config" or dodie "moving config_temp"; } - } elsif (!$noclean) { unlink "$output_config"; run_command "$make mrproper" or @@ -2652,11 +2644,12 @@ sub success { print_times; - doprint "\n\n*******************************************\n"; - doprint "*******************************************\n"; - doprint "KTEST RESULT: TEST $i$name SUCCESS!!!! **\n"; - doprint "*******************************************\n"; - doprint "*******************************************\n"; + doprint "\n\n"; + doprint "*******************************************\n"; + doprint "*******************************************\n"; + doprint "KTEST RESULT: TEST $i$name SUCCESS!!!! **\n"; + doprint "*******************************************\n"; + doprint "*******************************************\n"; if (defined($store_successes)) { save_logs "success", $store_successes; @@ -3034,7 +3027,6 @@ sub bisect { } if ($do_check) { - # get current HEAD my $head = get_sha1("HEAD"); @@ -3074,13 +3066,11 @@ sub bisect { run_command "git bisect replay $replay" or dodie "failed to run replay"; } else { - run_command "git bisect good $good" or dodie "could not set bisect good to $good"; run_git_bisect "git bisect bad $bad" or dodie "could not set bisect bad to $bad"; - } if (defined($start)) { @@ -3133,8 +3123,8 @@ sub assign_configs { doprint "Reading configs from $config\n"; - open (IN, $config) - or dodie "Failed to read $config"; + open (IN, $config) or + dodie "Failed to read $config"; while () { chomp; @@ -3287,10 +3277,11 @@ sub config_bisect { if (!defined($config_bisect_exec)) { # First check the location that ktest.pl ran - my @locations = ( "$pwd/config-bisect.pl", - "$dirname/config-bisect.pl", - "$builddir/tools/testing/ktest/config-bisect.pl", - undef ); + my @locations = ( + "$pwd/config-bisect.pl", + "$dirname/config-bisect.pl", + "$builddir/tools/testing/ktest/config-bisect.pl", + undef ); foreach my $loc (@locations) { doprint "loc = $loc\n"; $config_bisect_exec = $loc; @@ -3632,8 +3623,8 @@ sub read_kconfig { sub read_depends { # find out which arch this is by the kconfig file - open (IN, $output_config) - or dodie "Failed to read $output_config"; + open (IN, $output_config) or + dodie "Failed to read $output_config"; my $arch; while () { if (m,Linux/(\S+)\s+\S+\s+Kernel Configuration,) { @@ -3708,7 +3699,6 @@ sub get_depends { my @configs; while ($dep =~ /[$valid]/) { - if ($dep =~ /^[^$valid]*([$valid]+)/) { my $conf = "CONFIG_" . $1; @@ -3889,7 +3879,6 @@ sub make_min_config { my $take_two = 0; while (!$done) { - my $config; my $found; @@ -3900,7 +3889,7 @@ sub make_min_config { # Sort keys by who is most dependent on @test_configs = sort { $depcount{chomp_config($b)} <=> $depcount{chomp_config($a)} } - @test_configs ; + @test_configs ; # Put configs that did not modify the config at the end. my $reset = 1; @@ -3976,8 +3965,8 @@ sub make_min_config { # update new ignore configs if (defined($ignore_config)) { - open (OUT, ">$temp_config") - or dodie "Can't write to $temp_config"; + open (OUT, ">$temp_config") or + dodie "Can't write to $temp_config"; foreach my $config (keys %save_configs) { print OUT "$save_configs{$config}\n"; } @@ -4004,8 +3993,8 @@ sub make_min_config { } # Save off all the current mandatory configs - open (OUT, ">$temp_config") - or dodie "Can't write to $temp_config"; + open (OUT, ">$temp_config") or + dodie "Can't write to $temp_config"; foreach my $config (keys %keep_configs) { print OUT "$keep_configs{$config}\n"; } @@ -4043,7 +4032,6 @@ sub make_warnings_file { open(IN, $buildlog) or dodie "Can't open $buildlog"; while () { - # Some compilers use UTF-8 extended for quotes # for distcc heterogeneous systems, this causes issues s/$utf8_quote/'/g; @@ -4263,7 +4251,6 @@ sub do_send_mail { } sub send_email { - if (defined($mailto)) { if (!defined($mailer)) { doprint "No email sent: email or mailer not specified in config.\n"; @@ -4277,7 +4264,7 @@ sub cancel_test { if ($email_when_canceled) { my $name = get_test_name; send_email("KTEST: Your [$name] test was cancelled", - "Your test started at $script_start_time was cancelled: sig int"); + "Your test started at $script_start_time was cancelled: sig int"); } die "\nCaught Sig Int, test interrupted: $!\n" } @@ -4332,7 +4319,7 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) { if ($email_when_started) { my $name = get_test_name; send_email("KTEST: Your [$name] test was started", - "Your test was started on $script_start_time"); + "Your test was started on $script_start_time"); } } @@ -4411,7 +4398,7 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) { my $ret = run_command $pre_test; if (!$ret && defined($pre_test_die) && $pre_test_die) { - dodie "failed to pre_test\n"; + dodie "failed to pre_test\n"; } } @@ -4509,7 +4496,7 @@ doprint "\n $successes of $opt{NUM_TESTS} tests were successful\n\n"; if ($email_when_finished) { send_email("KTEST: Your test has finished!", - "$successes of $opt{NUM_TESTS} tests started at $script_start_time were successful!"); + "$successes of $opt{NUM_TESTS} tests started at $script_start_time were successful!"); } if (defined($opt{"LOG_FILE"})) { -- cgit v1.2.3 From 6a0f3652952c7bba83af66c115a311d4a2164ebb Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:31 -0700 Subject: ktest: Re-arrange the code blocks for better discoverability Perl, as with most scripting languages, is fairly flexible in how / where you can define things, and it will (for the most part) do what you would expect it to do. This however can lead to situations, like with ktest, where things get muddled over time. This pushes the variable definitions back up to the top, followed by functions, with the main script executables down at the bottom, INSTEAD of being somewhat mish-mashed together in certain places. This mostly has the advantage of making it more obvious where things are initially defined, what functions are there, and ACTUALLY where the main script starts executing, and should make this a little more approachable. Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/ktest.pl | 296 ++++++++++++++++++++++--------------------- 1 file changed, 154 insertions(+), 142 deletions(-) (limited to 'tools') diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index a3c6ad64c479..09d1578f9d66 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -512,6 +512,69 @@ $config_help{"REBOOT_SCRIPT"} = << "EOF" EOF ; +# used with process_expression() +my $d = 0; + +# defined before get_test_name() +my $in_die = 0; + +# defined before process_warning_line() +my $check_build_re = ".*:.*(warning|error|Error):.*"; +my $utf8_quote = "\\x{e2}\\x{80}(\\x{98}|\\x{99})"; + +# defined before child_finished() +my $child_done; + +# config_ignore holds the configs that were set (or unset) for +# a good config and we will ignore these configs for the rest +# of a config bisect. These configs stay as they were. +my %config_ignore; + +# config_set holds what all configs were set as. +my %config_set; + +# config_off holds the set of configs that the bad config had disabled. +# We need to record them and set them in the .config when running +# olddefconfig, because olddefconfig keeps the defaults. +my %config_off; + +# config_off_tmp holds a set of configs to turn off for now +my @config_off_tmp; + +# config_list is the set of configs that are being tested +my %config_list; +my %null_config; + +my %dependency; + +# found above run_config_bisect() +my $pass = 1; + +# found above add_dep() + +my %depends; +my %depcount; +my $iflevel = 0; +my @ifdeps; + +# prevent recursion +my %read_kconfigs; + +# found above test_this_config() +my %min_configs; +my %keep_configs; +my %save_configs; +my %processed_configs; +my %nochange_config; + +# +# These are first defined here, main function later on +# +sub run_command; +sub start_monitor; +sub end_monitor; +sub wait_for_monitor; + sub _logit { if (defined($opt{"LOG_FILE"})) { print LOG @_; @@ -1365,11 +1428,6 @@ sub eval_option { return $option; } -sub run_command; -sub start_monitor; -sub end_monitor; -sub wait_for_monitor; - sub reboot { my ($time) = @_; my $powercycle = 0; @@ -1454,8 +1512,6 @@ sub do_not_reboot { ($test_type eq "config_bisect" && $opt{"CONFIG_BISECT_TYPE[$i]"} eq "build"); } -my $in_die = 0; - sub get_test_name() { my $name; @@ -2342,9 +2398,6 @@ sub start_monitor_and_install { return monitor; } -my $check_build_re = ".*:.*(warning|error|Error):.*"; -my $utf8_quote = "\\x{e2}\\x{80}(\\x{98}|\\x{99})"; - sub process_warning_line { my ($line) = @_; @@ -2694,8 +2747,6 @@ sub child_run_test { exit $run_command_status; } -my $child_done; - sub child_finished { $child_done = 1; } @@ -3096,28 +3147,6 @@ sub bisect { success $i; } -# config_ignore holds the configs that were set (or unset) for -# a good config and we will ignore these configs for the rest -# of a config bisect. These configs stay as they were. -my %config_ignore; - -# config_set holds what all configs were set as. -my %config_set; - -# config_off holds the set of configs that the bad config had disabled. -# We need to record them and set them in the .config when running -# olddefconfig, because olddefconfig keeps the defaults. -my %config_off; - -# config_off_tmp holds a set of configs to turn off for now -my @config_off_tmp; - -# config_list is the set of configs that are being tested -my %config_list; -my %null_config; - -my %dependency; - sub assign_configs { my ($hash, $config) = @_; @@ -3212,8 +3241,6 @@ sub config_bisect_end { doprint "***************************************\n\n"; } -my $pass = 1; - sub run_config_bisect { my ($good, $bad, $last_result) = @_; my $reset = ""; @@ -3505,14 +3532,6 @@ sub patchcheck { return 1; } -my %depends; -my %depcount; -my $iflevel = 0; -my @ifdeps; - -# prevent recursion -my %read_kconfigs; - sub add_dep { # $config depends on $dep my ($config, $dep) = @_; @@ -3713,12 +3732,6 @@ sub get_depends { return @configs; } -my %min_configs; -my %keep_configs; -my %save_configs; -my %processed_configs; -my %nochange_config; - sub test_this_config { my ($config) = @_; @@ -4047,98 +4060,6 @@ sub make_warnings_file { success $i; } -$#ARGV < 1 or die "ktest.pl version: $VERSION\n usage: ktest.pl [config-file]\n"; - -if ($#ARGV == 0) { - $ktest_config = $ARGV[0]; - if (! -f $ktest_config) { - print "$ktest_config does not exist.\n"; - if (!read_yn "Create it?") { - exit 0; - } - } -} - -if (! -f $ktest_config) { - $newconfig = 1; - get_test_case; - open(OUT, ">$ktest_config") or die "Can not create $ktest_config"; - print OUT << "EOF" -# Generated by ktest.pl -# - -# PWD is a ktest.pl variable that will result in the process working -# directory that ktest.pl is executed in. - -# THIS_DIR is automatically assigned the PWD of the path that generated -# the config file. It is best to use this variable when assigning other -# directory paths within this directory. This allows you to easily -# move the test cases to other locations or to other machines. -# -THIS_DIR := $variable{"PWD"} - -# Define each test with TEST_START -# The config options below it will override the defaults -TEST_START -TEST_TYPE = $default{"TEST_TYPE"} - -DEFAULTS -EOF -; - close(OUT); -} -read_config $ktest_config; - -if (defined($opt{"LOG_FILE"})) { - $opt{"LOG_FILE"} = eval_option("LOG_FILE", $opt{"LOG_FILE"}, -1); -} - -# Append any configs entered in manually to the config file. -my @new_configs = keys %entered_configs; -if ($#new_configs >= 0) { - print "\nAppending entered in configs to $ktest_config\n"; - open(OUT, ">>$ktest_config") or die "Can not append to $ktest_config"; - foreach my $config (@new_configs) { - print OUT "$config = $entered_configs{$config}\n"; - $opt{$config} = process_variables($entered_configs{$config}); - } -} - -if (defined($opt{"LOG_FILE"})) { - if ($opt{"CLEAR_LOG"}) { - unlink $opt{"LOG_FILE"}; - } - open(LOG, ">> $opt{LOG_FILE}") or die "Can't write to $opt{LOG_FILE}"; - LOG->autoflush(1); -} - -doprint "\n\nSTARTING AUTOMATED TESTS\n\n"; - -for (my $i = 0, my $repeat = 1; $i <= $opt{"NUM_TESTS"}; $i += $repeat) { - - if (!$i) { - doprint "DEFAULT OPTIONS:\n"; - } else { - doprint "\nTEST $i OPTIONS"; - if (defined($repeat_tests{$i})) { - $repeat = $repeat_tests{$i}; - doprint " ITERATE $repeat"; - } - doprint "\n"; - } - - foreach my $option (sort keys %opt) { - - if ($option =~ /\[(\d+)\]$/) { - next if ($i != $1); - } else { - next if ($i); - } - - doprint "$option = $opt{$option}\n"; - } -} - sub option_defined { my ($option) = @_; @@ -4269,6 +4190,97 @@ sub cancel_test { die "\nCaught Sig Int, test interrupted: $!\n" } +$#ARGV < 1 or die "ktest.pl version: $VERSION\n usage: ktest.pl [config-file]\n"; + +if ($#ARGV == 0) { + $ktest_config = $ARGV[0]; + if (! -f $ktest_config) { + print "$ktest_config does not exist.\n"; + if (!read_yn "Create it?") { + exit 0; + } + } +} + +if (! -f $ktest_config) { + $newconfig = 1; + get_test_case; + open(OUT, ">$ktest_config") or die "Can not create $ktest_config"; + print OUT << "EOF" +# Generated by ktest.pl +# + +# PWD is a ktest.pl variable that will result in the process working +# directory that ktest.pl is executed in. + +# THIS_DIR is automatically assigned the PWD of the path that generated +# the config file. It is best to use this variable when assigning other +# directory paths within this directory. This allows you to easily +# move the test cases to other locations or to other machines. +# +THIS_DIR := $variable{"PWD"} + +# Define each test with TEST_START +# The config options below it will override the defaults +TEST_START +TEST_TYPE = $default{"TEST_TYPE"} + +DEFAULTS +EOF +; + close(OUT); +} +read_config $ktest_config; + +if (defined($opt{"LOG_FILE"})) { + $opt{"LOG_FILE"} = eval_option("LOG_FILE", $opt{"LOG_FILE"}, -1); +} + +# Append any configs entered in manually to the config file. +my @new_configs = keys %entered_configs; +if ($#new_configs >= 0) { + print "\nAppending entered in configs to $ktest_config\n"; + open(OUT, ">>$ktest_config") or die "Can not append to $ktest_config"; + foreach my $config (@new_configs) { + print OUT "$config = $entered_configs{$config}\n"; + $opt{$config} = process_variables($entered_configs{$config}); + } +} + +if (defined($opt{"LOG_FILE"})) { + if ($opt{"CLEAR_LOG"}) { + unlink $opt{"LOG_FILE"}; + } + open(LOG, ">> $opt{LOG_FILE}") or die "Can't write to $opt{LOG_FILE}"; + LOG->autoflush(1); +} + +doprint "\n\nSTARTING AUTOMATED TESTS\n\n"; + +for (my $i = 0, my $repeat = 1; $i <= $opt{"NUM_TESTS"}; $i += $repeat) { + + if (!$i) { + doprint "DEFAULT OPTIONS:\n"; + } else { + doprint "\nTEST $i OPTIONS"; + if (defined($repeat_tests{$i})) { + $repeat = $repeat_tests{$i}; + doprint " ITERATE $repeat"; + } + doprint "\n"; + } + + foreach my $option (sort keys %opt) { + if ($option =~ /\[(\d+)\]$/) { + next if ($i != $1); + } else { + next if ($i); + } + + doprint "$option = $opt{$option}\n"; + } +} + $SIG{INT} = qw(cancel_test); # First we need to do is the builds -- cgit v1.2.3 From 2af4f9b8596afbbd7667a18fa71d117bac227dea Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 30 Jan 2021 23:43:10 -0500 Subject: tools/power turbostat: add built-in-counter for IPC -- Instructions per Cycle Use linux-perf to access the hardware instructions-retired counter. This is necessary because the counter is not enabled by default, and also the counter is prone to roll-over -- both of which perf manages. It is not necessary to use perf for the cycle counter, because turbostat already needs to collect delta-aperf to calcuate frequency. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 84 +++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a7c4f0772e53..b82295eaa744 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -33,10 +33,13 @@ #include #include #include +#include +#include char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; +int *fd_instr_count_percpu; struct timeval interval_tv = {5, 0}; struct timespec interval_ts = {5, 0}; unsigned int num_iterations; @@ -75,6 +78,7 @@ char *output_buffer, *outp; unsigned int do_rapl; unsigned int do_dts; unsigned int do_ptm; +unsigned int do_ipc; unsigned long long gfx_cur_rc6_ms; unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; @@ -173,6 +177,7 @@ struct thread_data { unsigned long long aperf; unsigned long long mperf; unsigned long long c1; + unsigned long long instr_count; unsigned long long irq_count; unsigned int smi_count; unsigned int cpu_id; @@ -490,6 +495,39 @@ int get_msr_fd(int cpu) return fd; } +static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) +{ + return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); +} + +static int perf_instr_count_open(int cpu_num) +{ + struct perf_event_attr pea; + int fd; + + memset(&pea, 0, sizeof(struct perf_event_attr)); + pea.type = PERF_TYPE_HARDWARE; + pea.size = sizeof(struct perf_event_attr); + pea.config = PERF_COUNT_HW_INSTRUCTIONS; + + /* counter for cpu_num, including user + kernel and all processes */ + fd = perf_event_open(&pea, -1, cpu_num, -1, 0); + if (fd == -1) + err(-1, "cpu%d: perf instruction counter\n", cpu_num); + + return fd; +} + +int get_instr_count_fd(int cpu) +{ + if (fd_instr_count_percpu[cpu]) + return fd_instr_count_percpu[cpu]; + + fd_instr_count_percpu[cpu] = perf_instr_count_open(cpu); + + return fd_instr_count_percpu[cpu]; +} + int get_msr(int cpu, off_t offset, unsigned long long *msr) { ssize_t retval; @@ -561,6 +599,7 @@ struct msr_counter bic[] = { { 0x0, "X2APIC" }, { 0x0, "Die" }, { 0x0, "GFXAMHz" }, + { 0x0, "IPC" }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -616,6 +655,7 @@ struct msr_counter bic[] = { #define BIC_X2APIC (1ULL << 49) #define BIC_Die (1ULL << 50) #define BIC_GFXACTMHz (1ULL << 51) +#define BIC_IPC (1ULL << 52) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -627,6 +667,7 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= COUNTER_NAME) #define BIC_PRESENT(COUNTER_BIT) (bic_present |= COUNTER_BIT) #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) +#define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) #define MAX_DEFERRED 16 @@ -764,6 +805,9 @@ void print_header(char *delim) if (DO_BIC(BIC_TSC_MHz)) outp += sprintf(outp, "%sTSC_MHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_IPC)) + outp += sprintf(outp, "%sIPC", (printed++ ? delim : "")); + if (DO_BIC(BIC_IRQ)) { if (sums_need_wide_columns) outp += sprintf(outp, "%s IRQ", (printed++ ? delim : "")); @@ -926,6 +970,9 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "mperf: %016llX\n", t->mperf); outp += sprintf(outp, "c1: %016llX\n", t->c1); + if (DO_BIC(BIC_IPC)) + outp += sprintf(outp, "IPC: %lld\n", t->instr_count); + if (DO_BIC(BIC_IRQ)) outp += sprintf(outp, "IRQ: %lld\n", t->irq_count); if (DO_BIC(BIC_SMI)) @@ -1105,6 +1152,9 @@ int format_counters(struct thread_data *t, struct core_data *c, if (DO_BIC(BIC_TSC_MHz)) outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc/units/interval_float); + if (DO_BIC(BIC_IPC)) + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 1.0 * t->instr_count / t->aperf); + /* IRQ */ if (DO_BIC(BIC_IRQ)) { if (sums_need_wide_columns) @@ -1482,6 +1532,9 @@ delta_thread(struct thread_data *new, struct thread_data *old, old->mperf = 1; /* divide by 0 protection */ } + if (DO_BIC(BIC_IPC)) + old->instr_count = new->instr_count - old->instr_count; + if (DO_BIC(BIC_IRQ)) old->irq_count = new->irq_count - old->irq_count; @@ -1536,6 +1589,8 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data t->mperf = 0; t->c1 = 0; + t->instr_count = 0; + t->irq_count = 0; t->smi_count = 0; @@ -1611,6 +1666,8 @@ int sum_counters(struct thread_data *t, struct core_data *c, average.threads.mperf += t->mperf; average.threads.c1 += t->c1; + average.threads.instr_count += t->instr_count; + average.threads.irq_count += t->irq_count; average.threads.smi_count += t->smi_count; @@ -1707,6 +1764,7 @@ void compute_average(struct thread_data *t, struct core_data *c, average.threads.tsc /= topo.num_cpus; average.threads.aperf /= topo.num_cpus; average.threads.mperf /= topo.num_cpus; + average.threads.instr_count /= topo.num_cpus; average.threads.c1 /= topo.num_cpus; if (average.threads.irq_count > 9999999) @@ -1989,6 +2047,10 @@ retry: t->mperf = t->mperf * aperf_mperf_multiplier; } + if (DO_BIC(BIC_IPC)) + if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long)) + return -4; + if (DO_BIC(BIC_IRQ)) t->irq_count = irqs_per_cpu[cpu]; if (DO_BIC(BIC_SMI)) { @@ -5031,6 +5093,26 @@ void print_dev_latency(void) close(fd); } + +/* + * Linux-perf manages the the HW instructions-retired counter + * by enabling when requested, and hiding rollover + */ +void linux_perf_init(void) +{ + if (!BIC_IS_ENABLED(BIC_IPC)) + return; + + if (access("/proc/sys/kernel/perf_event_paranoid", F_OK)) + return; + + fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); + if (fd_instr_count_percpu == NULL) + err(-1, "calloc fd_instr_count_percpu"); + + BIC_PRESENT(BIC_IPC); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx; @@ -5642,6 +5724,7 @@ void turbostat_init() check_dev_msr(); check_permissions(); process_cpuid(); + linux_perf_init(); if (!quiet) @@ -6087,6 +6170,7 @@ void cmdline(int argc, char **argv) {"debug", no_argument, 0, 'd'}, /* internal, not documented */ {"enable", required_argument, 0, 'e'}, {"interval", required_argument, 0, 'i'}, + {"IPC", no_argument, 0, 'I'}, {"num_iterations", required_argument, 0, 'n'}, {"help", no_argument, 0, 'h'}, {"hide", required_argument, 0, 'H'}, // meh, -h taken by --help -- cgit v1.2.3 From ed0757b83a00d1799c249073d688b018b82d0093 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 4 Feb 2021 14:44:12 -0500 Subject: tools/power turbostat: print microcode patch level (also available via "grep microcode /proc/cpuinfo") Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b82295eaa744..e1bc7937b1ec 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5118,6 +5118,7 @@ void process_cpuid() unsigned int eax, ebx, ecx, edx; unsigned int fms, family, model, stepping, ecx_flags, edx_flags; unsigned int has_turbo; + unsigned long long ucode_patch = 0; eax = ebx = ecx = edx = 0; @@ -5131,8 +5132,8 @@ void process_cpuid() hygon_genuine = 1; if (!quiet) - fprintf(outf, "CPUID(0): %.4s%.4s%.4s ", - (char *)&ebx, (char *)&edx, (char *)&ecx); + fprintf(outf, "CPUID(0): %.4s%.4s%.4s 0x%x CPUID levels\n", + (char *)&ebx, (char *)&edx, (char *)&ecx, max_level); __cpuid(1, fms, ebx, ecx, edx); family = (fms >> 8) & 0xf; @@ -5145,6 +5146,9 @@ void process_cpuid() ecx_flags = ecx; edx_flags = edx; + if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) + warnx("get_msr(UCODE)\n"); + /* * check max extended function levels of CPUID. * This is needed to check for invariant TSC. @@ -5154,8 +5158,9 @@ void process_cpuid() __cpuid(0x80000000, max_extended_level, ebx, ecx, edx); if (!quiet) { - fprintf(outf, "0x%x CPUID levels; 0x%x xlevels; family:model:stepping 0x%x:%x:%x (%d:%d:%d)\n", - max_level, max_extended_level, family, model, stepping, family, model, stepping); + fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n", + family, model, stepping, family, model, stepping, (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level); fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n", ecx_flags & (1 << 0) ? "SSE3" : "-", ecx_flags & (1 << 3) ? "MONITOR" : "-", -- cgit v1.2.3 From 5683460b85a8a14c5eec10e363635ad4660eb961 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 3 Feb 2021 16:19:59 +0800 Subject: tools/power turbostat: Support Alder Lake Mobile Share the code between Alder Lake Mobile and Alder Lake Desktop. Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e1bc7937b1ec..a4745825047f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5056,6 +5056,7 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_ROCKETLAKE: case INTEL_FAM6_LAKEFIELD: case INTEL_FAM6_ALDERLAKE: + case INTEL_FAM6_ALDERLAKE_L: return INTEL_FAM6_CANNONLAKE_L; case INTEL_FAM6_ATOM_TREMONT_L: -- cgit v1.2.3 From 6c5c656006cf314196faea7bd76eebbfa0941cd1 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 3 Feb 2021 16:26:32 +0800 Subject: tools/power turbostat: Support Ice Lake D Ice Lake D is low-end server version of Ice Lake X, reuse the code accordingly. Tested-by: Wendy Wang Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a4745825047f..d27e899328f9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5063,6 +5063,7 @@ unsigned int intel_model_duplicates(unsigned int model) return INTEL_FAM6_ATOM_TREMONT; case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_ICELAKE_D: case INTEL_FAM6_SAPPHIRERAPIDS_X: return INTEL_FAM6_SKYLAKE_X; } -- cgit v1.2.3 From b2b94be787bf47eedd5890a249f3318bf9f1f1d5 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 11 Mar 2021 18:36:35 -0500 Subject: Revert "tools/power turbostat: adjust for temperature offset" This reverts commit 6ff7cb371c4bea3dba03a56d774da925e78a5087. Apparently the TCC offset should not be used to adjust what temperature we show the user after all. (on most systems, TCC offset is 0, FWIW) Fixes: 6ff7cb371c4b Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 62 +++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 29 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d27e899328f9..98a0a731da8a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4879,33 +4879,12 @@ double discover_bclk(unsigned int family, unsigned int model) * below this value, including the Digital Thermal Sensor (DTS), * Package Thermal Management Sensor (PTM), and thermal event thresholds. */ -int read_tcc_activation_temp() +int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; - unsigned int tcc, target_c, offset_c; - - /* Temperature Target MSR is Nehalem and newer only */ - if (!do_nhm_platform_info) - return 0; - - if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) - return 0; - - target_c = (msr >> 16) & 0xFF; - - offset_c = (msr >> 24) & 0xF; - - tcc = target_c - offset_c; - - if (!quiet) - fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - base_cpu, msr, tcc, target_c, offset_c); - - return tcc; -} + unsigned int target_c_local; + int cpu; -int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) -{ /* tcc_activation_temp is used only for dts or ptm */ if (!(do_dts || do_ptm)) return 0; @@ -4914,18 +4893,43 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) return 0; + cpu = t->cpu_id; + if (cpu_migrate(cpu)) { + fprintf(outf, "Could not migrate to CPU %d\n", cpu); + return -1; + } + if (tcc_activation_temp_override != 0) { tcc_activation_temp = tcc_activation_temp_override; - fprintf(outf, "Using cmdline TCC Target (%d C)\n", tcc_activation_temp); + fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", + cpu, tcc_activation_temp); return 0; } - tcc_activation_temp = read_tcc_activation_temp(); - if (tcc_activation_temp) - return 0; + /* Temperature Target MSR is Nehalem and newer only */ + if (!do_nhm_platform_info) + goto guess; + + if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) + goto guess; + + target_c_local = (msr >> 16) & 0xFF; + + if (!quiet) + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", + cpu, msr, target_c_local); + + if (!target_c_local) + goto guess; + + tcc_activation_temp = target_c_local; + + return 0; +guess: tcc_activation_temp = TJMAX_DEFAULT; - fprintf(outf, "Guessing tjMax %d C, Please use -T to specify\n", tcc_activation_temp); + fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", + cpu, tcc_activation_temp); return 0; } -- cgit v1.2.3 From abdc75ab53b7fd2ef42c79e88cf0caf2d007c4f2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 11 Mar 2021 10:05:13 +0800 Subject: tools/power turbostat: Fix DRAM Energy Unit on SKX SKX uses fixed DRAM Energy Unit, just like HSX and BDX. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 98a0a731da8a..5afe85efca5c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4272,6 +4272,7 @@ rapl_dram_energy_units_probe(int model, double rapl_energy_units) switch (model) { case INTEL_FAM6_HASWELL_X: /* HSX */ case INTEL_FAM6_BROADWELL_X: /* BDX */ + case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ return (rapl_dram_energy_units = 15.3 / 1000000); default: -- cgit v1.2.3 From ba58ecde5eec898f647bba7cb07e6ec6ea1b875c Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 12 Mar 2021 17:30:30 -0500 Subject: tools/power turbostat: update version number --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5afe85efca5c..ace100dd5a83 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5834,7 +5834,7 @@ int get_and_dump_counters(void) } void print_version() { - fprintf(outf, "turbostat version 20.09.30" + fprintf(outf, "turbostat version 21.03.12" " - Len Brown \n"); } -- cgit v1.2.3 From 301b1d3a9104f4f3a8ab4171cf88d0f55d632b41 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Wed, 28 Apr 2021 17:09:03 +0800 Subject: tools/power/turbostat: Fix turbostat for AMD Zen CPUs It was reported that on Zen+ system turbostat started exiting, which was tracked down to the MSR_PKG_ENERGY_STAT read failing because offset_to_idx wasn't returning a non-negative index. This patch combined the modification from Bingsong Si and Bas Nieuwenhuizen and addd the MSR to the index system as alternative for MSR_PKG_ENERGY_STATUS. Fixes: 9972d5d84d76 ("tools/power turbostat: Enable accumulate RAPL display") Reported-by: youling257 Tested-by: youling257 Tested-by: Kurt Garloff Tested-by: Bingsong Si Tested-by: Artem S. Tashkinov Co-developed-by: Bingsong Si Co-developed-by: Terry Bowman Signed-off-by: Bas Nieuwenhuizen Reviewed-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ace100dd5a83..b5f4ec24fea9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -302,7 +302,10 @@ int idx_to_offset(int idx) switch (idx) { case IDX_PKG_ENERGY: - offset = MSR_PKG_ENERGY_STATUS; + if (do_rapl & RAPL_AMD_F17H) + offset = MSR_PKG_ENERGY_STAT; + else + offset = MSR_PKG_ENERGY_STATUS; break; case IDX_DRAM_ENERGY: offset = MSR_DRAM_ENERGY_STATUS; @@ -331,6 +334,7 @@ int offset_to_idx(int offset) switch (offset) { case MSR_PKG_ENERGY_STATUS: + case MSR_PKG_ENERGY_STAT: idx = IDX_PKG_ENERGY; break; case MSR_DRAM_ENERGY_STATUS: @@ -358,7 +362,7 @@ int idx_valid(int idx) { switch (idx) { case IDX_PKG_ENERGY: - return do_rapl & RAPL_PKG; + return do_rapl & (RAPL_PKG | RAPL_AMD_F17H); case IDX_DRAM_ENERGY: return do_rapl & RAPL_DRAM; case IDX_PP0_ENERGY: -- cgit v1.2.3 From 13a779de4175df602366d129e41782ad7168cef0 Mon Sep 17 00:00:00 2001 From: Calvin Walton Date: Wed, 28 Apr 2021 17:09:16 +0800 Subject: tools/power turbostat: Fix offset overflow issue in index converting The idx_to_offset() function returns type int (32-bit signed), but MSR_PKG_ENERGY_STAT is u32 and would be interpreted as a negative number. The end result is that it hits the if (offset < 0) check in update_msr_sum() which prevents the timer callback from updating the stat in the background when long durations are used. The similar issue exists in offset_to_idx() and update_msr_sum(). Fix this issue by converting the 'int' to 'off_t' accordingly. Fixes: 9972d5d84d76 ("tools/power turbostat: Enable accumulate RAPL display") Signed-off-by: Calvin Walton Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b5f4ec24fea9..f3cb06a115b5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -296,9 +296,9 @@ struct msr_sum_array { /* The percpu MSR sum array.*/ struct msr_sum_array *per_cpu_msr_sum; -int idx_to_offset(int idx) +off_t idx_to_offset(int idx) { - int offset; + off_t offset; switch (idx) { case IDX_PKG_ENERGY: @@ -328,7 +328,7 @@ int idx_to_offset(int idx) return offset; } -int offset_to_idx(int offset) +int offset_to_idx(off_t offset) { int idx; @@ -3338,7 +3338,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) { unsigned long long msr_cur, msr_last; - int offset; + off_t offset; if (!idx_valid(i)) continue; @@ -3347,7 +3347,8 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg continue; ret = get_msr(cpu, offset, &msr_cur); if (ret) { - fprintf(outf, "Can not update msr(0x%x)\n", offset); + fprintf(outf, "Can not update msr(0x%llx)\n", + (unsigned long long)offset); continue; } -- cgit v1.2.3 From 25368d7cefcd87a94ccabcc6f9f31796607bbe4e Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 4 May 2021 17:52:34 +0300 Subject: tools/power/turbostat: Remove Package C6 Retention on Ice Lake Server Currently the turbostat treats ICX the same way as SKX and shares the code among them. But one difference is that ICX does not support Package C6 Retention, unlike SKX and CLX. So this patch: 1. Splitting SKX and ICX in turbostat. 2. Removing Package C6 Rentention for ICX. And after this split, it would be easier to cutomize Ice Lake Server in turbostat in the future. Suggested-by: Artem Bityutskiy Signed-off-by: Chen Yu Reviewed-by: Artem Bityutskiy Tested-by: Artem Bityutskiy Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 36 +++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f3cb06a115b5..b43816e4c1ff 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2263,7 +2263,7 @@ int amt_pkg_cstate_limits[16] = {PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, int phi_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; int glm_pkg_cstate_limits[16] = {PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; int skx_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; - +int icx_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; static void calculate_tsc_tweak() @@ -2378,6 +2378,7 @@ int has_turbo_ratio_group_limits(int family, int model) switch (model) { case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_ATOM_GOLDMONT_D: case INTEL_FAM6_ATOM_TREMONT_D: return 1; @@ -3618,6 +3619,10 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) pkg_cstate_limits = skx_pkg_cstate_limits; has_misc_feature_control = 1; break; + case INTEL_FAM6_ICELAKE_X: /* ICX */ + pkg_cstate_limits = icx_pkg_cstate_limits; + has_misc_feature_control = 1; + break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ no_MSR_MISC_PWR_MGMT = 1; case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ @@ -3706,6 +3711,20 @@ int is_skx(unsigned int family, unsigned int model) } return 0; } + +int is_icx(unsigned int family, unsigned int model) +{ + + if (!genuine_intel) + return 0; + + switch (model) { + case INTEL_FAM6_ICELAKE_X: + return 1; + } + return 0; +} + int is_ehl(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3808,6 +3827,7 @@ int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model) switch (model) { case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ICELAKE_X: return 1; default: return 0; @@ -3833,6 +3853,7 @@ int has_config_tdp(unsigned int family, unsigned int model) case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ case INTEL_FAM6_SKYLAKE_X: /* SKX */ + case INTEL_FAM6_ICELAKE_X: /* ICX */ case INTEL_FAM6_XEON_PHI_KNL: /* Knights Landing */ return 1; @@ -4363,6 +4384,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_HASWELL_X: /* HSX */ case INTEL_FAM6_BROADWELL_X: /* BDX */ case INTEL_FAM6_SKYLAKE_X: /* SKX */ + case INTEL_FAM6_ICELAKE_X: /* ICX */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; BIC_PRESENT(BIC_PKG__); @@ -4519,7 +4541,8 @@ void perf_limit_reasons_probe(unsigned int family, unsigned int model) void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) { - if (is_skx(family, model) || is_bdx(family, model)) + if (is_skx(family, model) || is_bdx(family, model) || + is_icx(family, model)) has_automatic_cstate_conversion = 1; } @@ -4734,6 +4757,7 @@ int has_snb_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ case INTEL_FAM6_SKYLAKE_X: /* SKX */ + case INTEL_FAM6_ICELAKE_X: /* ICX */ case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ @@ -5072,10 +5096,9 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_ATOM_TREMONT_L: return INTEL_FAM6_ATOM_TREMONT; - case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_ICELAKE_D: case INTEL_FAM6_SAPPHIRERAPIDS_X: - return INTEL_FAM6_SKYLAKE_X; + return INTEL_FAM6_ICELAKE_X; } return model; } @@ -5185,8 +5208,9 @@ void process_cpuid() edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); } - if (genuine_intel) + if (genuine_intel) { model = intel_model_duplicates(model); + } if (!(edx_flags & (1 << 5))) errx(1, "CPUID: no MSR"); @@ -5364,7 +5388,7 @@ void process_cpuid() BIC_NOT_PRESENT(BIC_Pkgpc7); use_c1_residency_msr = 1; } - if (is_skx(family, model)) { + if (is_skx(family, model) || is_icx(family, model)) { BIC_NOT_PRESENT(BIC_CPU_c3); BIC_NOT_PRESENT(BIC_Pkgpc3); BIC_NOT_PRESENT(BIC_CPU_c7); -- cgit v1.2.3 From 1e3ec5cdfb63bc2a1ff06145faa2be08d6ec9594 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Mar 2021 13:13:33 -0700 Subject: tools/power turbostat: unmark non-kernel-doc comment Do not mark a comment as kernel-doc notation when it is not meant to be in kernel-doc notation. Signed-off-by: Randy Dunlap Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b43816e4c1ff..407e80e72546 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2516,7 +2516,7 @@ dump_knl_turbo_ratio_limits(void) fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); - /** + /* * Turbo encoding in KNL is as follows: * [0] -- Reserved * [7:1] -- Base value of number of active cores of bucket 1. -- cgit v1.2.3 From 8c69da293041352d15a2b6e8010c141822a416c5 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 28 Apr 2021 10:51:57 +0800 Subject: tools/power turbostat: Enable tsc_tweak for Elkhart Lake and Jasper Lake It was found that on Elkhart Lake the TSC frequency is driven by a separate crystal-clock domain, which is different from the BCLK domain which includes mperf. This has result in small different speed thus inconsistence between TSC and the mperf, which caused the Busy% to be higher than 100%. On this platform it seems that the mperf runs faster than tsc when the CPU is 100% utilized: delta tsc(18815473183) < delta mperf(18958403680) for 10 seconds. To align TSC with mperf, leverage the tsc_tweak mechanism introduced for cores newer than Skylake, so that TSC and mperf would be calculated in the same domain. Reported-by: Zhang Rui Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 407e80e72546..9ec13f06c0f3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5440,7 +5440,7 @@ void process_cpuid() if (!quiet) dump_sysfs_pstate_config(); - if (has_skl_msrs(family, model)) + if (has_skl_msrs(family, model) || is_ehl(family, model)) calculate_tsc_tweak(); if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) -- cgit v1.2.3 From aeb01e6d71ffaf3011ac755c3083cc200ed57cb4 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 28 Apr 2021 12:18:12 +0800 Subject: tools/power turbostat: Print the C-state Pre-wake settings C-state pre-wake setting[1] is an optimization for some Intel CPUs to be woken up from deep C-states in order to reduce latency. According to the spec, the BIT30 is the C-state Pre-wake Disable. Expose this setting accordingly. Sample output from turbostat: ... cpu51: MSR_IA32_POWER_CTL: 0x1a00a40059 (C1E auto-promotion: DISabled) C-state Pre-wake: ENabled cpu51: MSR_TURBO_RATIO_LIMIT: 0x2021212121212224 ... [1] https://intel.github.io/wult/#c-state-pre-wake Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9ec13f06c0f3..e1ed14c666db 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -91,6 +91,7 @@ double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; unsigned int do_core_perf_limit_reasons; unsigned int has_automatic_cstate_conversion; +unsigned int dis_cstate_prewake; unsigned int do_gfx_perf_limit_reasons; unsigned int do_ring_perf_limit_reasons; unsigned int crystal_hz; @@ -2271,6 +2272,8 @@ calculate_tsc_tweak() tsc_tweak = base_hz / tsc_hz; } +void prewake_cstate_probe(unsigned int family, unsigned int model); + static void dump_nhm_platform_info(void) { @@ -2293,6 +2296,11 @@ dump_nhm_platform_info(void) fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", base_cpu, msr, msr & 0x2 ? "EN" : "DIS"); + /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */ + if (dis_cstate_prewake) + fprintf(outf, "C-state Pre-wake: %sabled\n", + msr & 0x40000000 ? "DIS" : "EN"); + return; } @@ -4546,6 +4554,12 @@ void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) has_automatic_cstate_conversion = 1; } +void prewake_cstate_probe(unsigned int family, unsigned int model) +{ + if (is_icx(family, model)) + dis_cstate_prewake = 1; +} + int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; -- cgit v1.2.3 From 7ab5ff4937a338783d147ec2d8c8714f48a5de79 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 21 Apr 2021 22:22:47 +0800 Subject: tools/power turbostat: Fix Core C6 residency on Atom CPUs For Atom CPUs that have core cstate deeper than C6, MSR_CORE_C6_RESIDENCY actually returns the residency for both CC6 and deeper Core cstates. Thus, the real Core C6 residency should be the subtraction of MSR_CORE_C6_RESIDENCY return value and MSR_CORE_C6_RESIDENCY return value. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 39 ++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e1ed14c666db..ee18966f65a4 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -35,6 +35,7 @@ #include #include #include +#include char *proc_stat = "/proc/stat"; FILE *outf; @@ -185,6 +186,7 @@ struct thread_data { unsigned int apic_id; unsigned int x2apic_id; unsigned int flags; + bool is_atom; #define CPU_IS_FIRST_THREAD_IN_CORE 0x2 #define CPU_IS_FIRST_CORE_IN_PACKAGE 0x4 unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; @@ -2090,9 +2092,19 @@ retry: return -7; } - if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7)) + if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7)) { if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7)) return -8; + else if (t->is_atom) { + /* + * For Atom CPUs that has core cstate deeper than c6, + * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. + * Minus CC7 (and deeper cstates) residency to get + * accturate cc6 residency. + */ + c->c6 -= c->c7; + } + } if (DO_BIC(BIC_Mod_c6)) if (get_msr(cpu, MSR_MODULE_C6_RES_MS, &c->mc6_us)) @@ -4911,6 +4923,28 @@ double discover_bclk(unsigned int family, unsigned int model) return 133.33; } +int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned int eax, ebx, ecx, edx; + + if (!genuine_intel) + return 0; + + if (cpu_migrate(t->cpu_id)) { + fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id); + return -1; + } + + if (max_level < 0x1a) + return 0; + + __cpuid(0x1a, eax, ebx, ecx, edx); + eax = (eax >> 24) & 0xFF; + if (eax == 0x20 ) + t->is_atom = true; + return 0; +} + /* * MSR_IA32_TEMPERATURE_TARGET indicates the temperature where * the Thermal Control Circuit (TCC) activates. @@ -5796,6 +5830,9 @@ void turbostat_init() for_all_cpus(set_temperature_target, ODD_COUNTERS); + for_all_cpus(get_cpu_type, ODD_COUNTERS); + for_all_cpus(get_cpu_type, EVEN_COUNTERS); + if (!quiet) for_all_cpus(print_thermal, ODD_COUNTERS); -- cgit v1.2.3 From e9d3092f6d7c21031c8ac10ba2016ae0482a39fe Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 26 Apr 2021 10:05:27 +0800 Subject: tools/power turbostat: save original CPU model CPU model may get changed in intel_model_duplicates() for code reuse. But there are still some cases we need the original CPU model to handle minor differences between generations. Thus save the original CPU model. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ee18966f65a4..d1ae6b248377 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -43,6 +43,10 @@ int *fd_percpu; int *fd_instr_count_percpu; struct timeval interval_tv = {5, 0}; struct timespec interval_ts = {5, 0}; + +/* Save original CPU model */ +unsigned int model_orig; + unsigned int num_iterations; unsigned int debug; unsigned int quiet; @@ -5257,6 +5261,7 @@ void process_cpuid() edx_flags & (1 << 29) ? "TM" : "-"); } if (genuine_intel) { + model_orig = model; model = intel_model_duplicates(model); } -- cgit v1.2.3 From 0b9a0b9be991656f125b58a240065cdf72077244 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 21 Apr 2021 23:22:14 +0800 Subject: tools/power turbostat: add TCC Offset support The length of TCC Offset bits varies on different platforms. Decode TCC Offset bits only for the platforms that we have verified. For the others, only show default TCC activation temperature. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 58 +++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d1ae6b248377..6326bee97c0b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -91,6 +91,7 @@ unsigned int gfx_cur_mhz; unsigned int gfx_act_mhz; unsigned int tcc_activation_temp; unsigned int tcc_activation_temp_override; +int tcc_offset_bits; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; @@ -3886,6 +3887,40 @@ int has_config_tdp(unsigned int family, unsigned int model) } } +/* + * tcc_offset_bits: + * 0: Tcc Offset not supported (Default) + * 6: Bit 29:24 of MSR_PLATFORM_INFO + * 4: Bit 27:24 of MSR_PLATFORM_INFO + */ +void check_tcc_offset(int model) +{ + unsigned long long msr; + + if (!genuine_intel) + return; + + switch (model) { + case INTEL_FAM6_SKYLAKE_L: + case INTEL_FAM6_SKYLAKE: + case INTEL_FAM6_KABYLAKE_L: + case INTEL_FAM6_KABYLAKE: + case INTEL_FAM6_ICELAKE_L: + case INTEL_FAM6_ICELAKE: + case INTEL_FAM6_TIGERLAKE_L: + case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_COMETLAKE: + if (!get_msr(base_cpu, MSR_PLATFORM_INFO, &msr)) { + msr = (msr >> 30) & 1; + if (msr) + tcc_offset_bits = 6; + } + return; + default: + return; + } +} + static void remove_underbar(char *s) { @@ -4964,7 +4999,7 @@ int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; - unsigned int target_c_local; + unsigned int target_c_local, tcc_offset; int cpu; /* tcc_activation_temp is used only for dts or ptm */ @@ -4997,9 +5032,24 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk target_c_local = (msr >> 16) & 0xFF; - if (!quiet) - fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", + if (!quiet) { + switch (tcc_offset_bits) { + case 4: + tcc_offset = (msr >> 24) & 0xF; + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", + cpu, msr, target_c_local - tcc_offset, target_c_local, tcc_offset); + break; + case 6: + tcc_offset = (msr >> 24) & 0x3F; + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", + cpu, msr, target_c_local - tcc_offset, target_c_local, tcc_offset); + break; + default: + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, target_c_local); + break; + } + } if (!target_c_local) goto guess; @@ -5483,6 +5533,8 @@ void process_cpuid() perf_limit_reasons_probe(family, model); automatic_cstate_conversion_probe(family, model); + check_tcc_offset(model_orig); + if (!quiet) dump_cstate_pstate_config_info(family, model); -- cgit v1.2.3 From 55279aef754c5eab170077ae4ba4ebd304dea64f Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 26 Apr 2021 18:49:26 +0800 Subject: tools/power turbostat: rename tcc variables There are two TCC activation temeprature. One is the default TCC activation temperature, also known as TJ_MAX. Another one is the effective TCC activation temperature, which is the subtraction of default TCC activation temperature and TCC offset. The name of variable tcc_activation_temp might be misleading here. Thus rename tcc_activation_temp to tj_max, and use tcc_default and tcc_offset to calculate the effective TCC activation temperature. No functional change in this patch. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 44 +++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6326bee97c0b..8f0a1d8a0366 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -89,8 +89,8 @@ unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; unsigned int gfx_cur_mhz; unsigned int gfx_act_mhz; -unsigned int tcc_activation_temp; -unsigned int tcc_activation_temp_override; +unsigned int tj_max; +unsigned int tj_max_override; int tcc_offset_bits; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; @@ -2118,7 +2118,7 @@ retry: if (DO_BIC(BIC_CoreTmp)) { if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) return -9; - c->core_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F); + c->core_temp_c = tj_max - ((msr >> 16) & 0x7F); } if (do_rapl & RAPL_AMD_F17H) { @@ -2224,7 +2224,7 @@ retry: if (DO_BIC(BIC_PkgTmp)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return -17; - p->pkg_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F); + p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } if (DO_BIC(BIC_GFX_rc6)) @@ -4637,7 +4637,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p dts = (msr >> 16) & 0x7F; fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", - cpu, msr, tcc_activation_temp - dts); + cpu, msr, tj_max - dts); if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) return 0; @@ -4645,7 +4645,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p dts = (msr >> 16) & 0x7F; dts2 = (msr >> 8) & 0x7F; fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2); + cpu, msr, tj_max - dts, tj_max - dts2); } @@ -4658,7 +4658,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p dts = (msr >> 16) & 0x7F; resolution = (msr >> 27) & 0xF; fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", - cpu, msr, tcc_activation_temp - dts, resolution); + cpu, msr, tj_max - dts, resolution); if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr)) return 0; @@ -4666,7 +4666,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p dts = (msr >> 16) & 0x7F; dts2 = (msr >> 8) & 0x7F; fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2); + cpu, msr, tj_max - dts, tj_max - dts2); } return 0; @@ -4999,10 +4999,10 @@ int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; - unsigned int target_c_local, tcc_offset; + unsigned int tcc_default, tcc_offset; int cpu; - /* tcc_activation_temp is used only for dts or ptm */ + /* tj_max is used only for dts or ptm */ if (!(do_dts || do_ptm)) return 0; @@ -5016,10 +5016,10 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk return -1; } - if (tcc_activation_temp_override != 0) { - tcc_activation_temp = tcc_activation_temp_override; + if (tj_max_override != 0) { + tj_max = tj_max_override; fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", - cpu, tcc_activation_temp); + cpu, tj_max); return 0; } @@ -5030,38 +5030,38 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) goto guess; - target_c_local = (msr >> 16) & 0xFF; + tcc_default = (msr >> 16) & 0xFF; if (!quiet) { switch (tcc_offset_bits) { case 4: tcc_offset = (msr >> 24) & 0xF; fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - cpu, msr, target_c_local - tcc_offset, target_c_local, tcc_offset); + cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); break; case 6: tcc_offset = (msr >> 24) & 0x3F; fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - cpu, msr, target_c_local - tcc_offset, target_c_local, tcc_offset); + cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); break; default: fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", - cpu, msr, target_c_local); + cpu, msr, tcc_default); break; } } - if (!target_c_local) + if (!tcc_default) goto guess; - tcc_activation_temp = target_c_local; + tj_max = tcc_default; return 0; guess: - tcc_activation_temp = TJMAX_DEFAULT; + tj_max = TJMAX_DEFAULT; fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", - cpu, tcc_activation_temp); + cpu, tj_max); return 0; } @@ -6421,7 +6421,7 @@ void cmdline(int argc, char **argv) summary_only++; break; case 'T': - tcc_activation_temp_override = atoi(optarg); + tj_max_override = atoi(optarg); break; case 'v': print_version(); -- cgit v1.2.3 From 1b439f01b67c77a374adbbd97ad0c745b7abb09b Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 4 May 2021 19:21:34 -0400 Subject: tools/power turbostat: formatting Spring is here... run a long overdue Lendent on turbostat.c no functional change Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 943 ++++++++++++++++------------------ 1 file changed, 433 insertions(+), 510 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8f0a1d8a0366..13805e460a4d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -41,8 +41,8 @@ char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; -struct timeval interval_tv = {5, 0}; -struct timespec interval_ts = {5, 0}; +struct timeval interval_tv = { 5, 0 }; +struct timespec interval_ts = { 5, 0 }; /* Save original CPU model */ unsigned int model_orig; @@ -84,7 +84,7 @@ unsigned int do_rapl; unsigned int do_dts; unsigned int do_ptm; unsigned int do_ipc; -unsigned long long gfx_cur_rc6_ms; +unsigned long long gfx_cur_rc6_ms; unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; unsigned int gfx_cur_mhz; @@ -104,12 +104,12 @@ unsigned int crystal_hz; unsigned long long tsc_hz; int base_cpu; double discover_bclk(unsigned int family, unsigned int model); -unsigned int has_hwp; /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */ +unsigned int has_hwp; /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */ /* IA32_HWP_REQUEST, IA32_HWP_STATUS */ -unsigned int has_hwp_notify; /* IA32_HWP_INTERRUPT */ +unsigned int has_hwp_notify; /* IA32_HWP_INTERRUPT */ unsigned int has_hwp_activity_window; /* IA32_HWP_REQUEST[bits 41:32] */ -unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ -unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ +unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ +unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int has_misc_feature_control; unsigned int first_counter_read = 1; int ignore_stdin; @@ -185,7 +185,7 @@ struct thread_data { unsigned long long mperf; unsigned long long c1; unsigned long long instr_count; - unsigned long long irq_count; + unsigned long long irq_count; unsigned int smi_count; unsigned int cpu_id; unsigned int apic_id; @@ -253,12 +253,11 @@ struct pkg_data { ((node_no) * topo.cores_per_node) + \ (core_no)) - #define GET_PKG(pkg_base, pkg_no) (pkg_base + pkg_no) -enum counter_scope {SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE}; -enum counter_type {COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC}; -enum counter_format {FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT}; +enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; +enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; +enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; struct msr_counter { unsigned int msr_num; @@ -294,9 +293,9 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr); struct msr_sum_array { /* get_msr_sum() = sum + (get_msr() - last) */ struct { - /*The accumulated MSR value is updated by the timer*/ + /*The accumulated MSR value is updated by the timer */ unsigned long long sum; - /*The MSR footprint recorded in last timer*/ + /*The MSR footprint recorded in last timer */ unsigned long long last; } entries[IDX_COUNT]; }; @@ -385,6 +384,7 @@ int idx_valid(int idx) return 0; } } + struct sys_counters { unsigned int added_thread_counters; unsigned int added_core_counters; @@ -408,7 +408,7 @@ struct cpu_topology { int logical_node_id; /* 0-based count within the package */ int physical_core_id; int thread_id; - cpu_set_t *put_ids; /* Processing Unit/Thread IDs */ + cpu_set_t *put_ids; /* Processing Unit/Thread IDs */ } *cpus; struct topo_params { @@ -425,7 +425,7 @@ struct topo_params { struct timeval tv_even, tv_odd, tv_delta; -int *irq_column_2_cpu; /* /proc/interrupts column numbers */ +int *irq_column_2_cpu; /* /proc/interrupts column numbers */ int *irqs_per_cpu; /* indexed by cpu_num */ void setup_all_buffers(void); @@ -438,34 +438,31 @@ int cpu_is_not_present(int cpu) { return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set); } + /* * run func(thread, core, package) in topology order * skip non-present cpus */ -int for_all_cpus(int (func)(struct thread_data *, struct core_data *, struct pkg_data *), - struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base) +int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *), + struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base) { int retval, pkg_no, core_no, thread_no, node_no; for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) { for (node_no = 0; node_no < topo.nodes_per_pkg; node_no++) { for (core_no = 0; core_no < topo.cores_per_node; ++core_no) { - for (thread_no = 0; thread_no < - topo.threads_per_core; ++thread_no) { + for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) { struct thread_data *t; struct core_data *c; struct pkg_data *p; - t = GET_THREAD(thread_base, thread_no, - core_no, node_no, - pkg_no); + t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); if (cpu_is_not_present(t->cpu_id)) continue; - c = GET_CORE(core_base, core_no, - node_no, pkg_no); + c = GET_CORE(core_base, core_no, node_no, pkg_no); p = GET_PKG(pkg_base, pkg_no); retval = func(t, c, p); @@ -487,6 +484,7 @@ int cpu_migrate(int cpu) else return 0; } + int get_msr_fd(int cpu) { char pathname[32]; @@ -524,7 +522,7 @@ static int perf_instr_count_open(int cpu_num) /* counter for cpu_num, including user + kernel and all processes */ fd = perf_event_open(&pea, -1, cpu_num, -1, 0); - if (fd == -1) + if (fd == -1) err(-1, "cpu%d: perf instruction counter\n", cpu_num); return fd; @@ -568,7 +566,7 @@ struct msr_counter bic[] = { { 0x0, "Bzy_MHz" }, { 0x0, "TSC_MHz" }, { 0x0, "IRQ" }, - { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL}, + { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL }, { 0x0, "sysfs" }, { 0x0, "CPU%c1" }, { 0x0, "CPU%c3" }, @@ -681,7 +679,6 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) - #define MAX_DEFERRED 16 char *deferred_skip_names[MAX_DEFERRED]; int deferred_skip_index; @@ -695,42 +692,40 @@ enum show_hide_mode { SHOW_LIST, HIDE_LIST } global_show_hide_mode = HIDE_LIST; void help(void) { fprintf(outf, - "Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n" - "\n" - "Turbostat forks the specified COMMAND and prints statistics\n" - "when COMMAND completes.\n" - "If no COMMAND is specified, turbostat wakes every 5-seconds\n" - "to print statistics, until interrupted.\n" - " -a, --add add a counter\n" - " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" - " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" - " {core | package | j,k,l..m,n-p }\n" - " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" - " -D, --Dump displays the raw counter values\n" - " -e, --enable [all | column]\n" - " shows all or the specified disabled column\n" - " -H, --hide [column|column,column,...]\n" - " hide the specified column(s)\n" - " -i, --interval sec.subsec\n" - " Override default 5-second measurement interval\n" - " -J, --Joules displays energy in Joules instead of Watts\n" - " -l, --list list column headers only\n" - " -n, --num_iterations num\n" - " number of the measurement iterations\n" - " -o, --out file\n" - " create or truncate \"file\" for all output\n" - " -q, --quiet skip decoding system configuration header\n" - " -s, --show [column|column,column,...]\n" - " show only the specified column(s)\n" - " -S, --Summary\n" - " limits output to 1-line system summary per interval\n" - " -T, --TCC temperature\n" - " sets the Thermal Control Circuit temperature in\n" - " degrees Celsius\n" - " -h, --help print this help message\n" - " -v, --version print version information\n" - "\n" - "For more help, run \"man turbostat\"\n"); + "Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n" + "\n" + "Turbostat forks the specified COMMAND and prints statistics\n" + "when COMMAND completes.\n" + "If no COMMAND is specified, turbostat wakes every 5-seconds\n" + "to print statistics, until interrupted.\n" + " -a, --add add a counter\n" + " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" + " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" + " {core | package | j,k,l..m,n-p }\n" + " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" + " -D, --Dump displays the raw counter values\n" + " -e, --enable [all | column]\n" + " shows all or the specified disabled column\n" + " -H, --hide [column|column,column,...]\n" + " hide the specified column(s)\n" + " -i, --interval sec.subsec\n" + " Override default 5-second measurement interval\n" + " -J, --Joules displays energy in Joules instead of Watts\n" + " -l, --list list column headers only\n" + " -n, --num_iterations num\n" + " number of the measurement iterations\n" + " -o, --out file\n" + " create or truncate \"file\" for all output\n" + " -q, --quiet skip decoding system configuration header\n" + " -s, --show [column|column,column,...]\n" + " show only the specified column(s)\n" + " -S, --Summary\n" + " limits output to 1-line system summary per interval\n" + " -T, --TCC temperature\n" + " sets the Thermal Control Circuit temperature in\n" + " degrees Celsius\n" + " -h, --help print this help message\n" + " -v, --version print version information\n" "\n" "For more help, run \"man turbostat\"\n"); } /* @@ -784,7 +779,6 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) return retval; } - void print_header(char *delim) { struct msr_counter *mp; @@ -966,8 +960,7 @@ void print_header(char *delim) outp += sprintf(outp, "\n"); } -int dump_counters(struct thread_data *t, struct core_data *c, - struct pkg_data *p) +int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; struct msr_counter *mp; @@ -975,8 +968,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p); if (t) { - outp += sprintf(outp, "CPU: %d flags 0x%x\n", - t->cpu_id, t->flags); + outp += sprintf(outp, "CPU: %d flags 0x%x\n", t->cpu_id, t->flags); outp += sprintf(outp, "TSC: %016llX\n", t->tsc); outp += sprintf(outp, "aperf: %016llX\n", t->aperf); outp += sprintf(outp, "mperf: %016llX\n", t->mperf); @@ -991,8 +983,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "SMI: %d\n", t->smi_count); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", - i, mp->msr_num, t->counter[i]); + outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, t->counter[i]); } } @@ -1005,8 +996,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "Joules: %0X\n", c->core_energy); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", - i, mp->msr_num, c->counter[i]); + outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, c->counter[i]); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } @@ -1035,15 +1025,12 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores); outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx); outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram); - outp += sprintf(outp, "Throttle PKG: %0llX\n", - p->rapl_pkg_perf_status); - outp += sprintf(outp, "Throttle RAM: %0llX\n", - p->rapl_dram_perf_status); + outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status); + outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status); outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", - i, mp->msr_num, p->counter[i]); + outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, p->counter[i]); } } @@ -1055,8 +1042,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, /* * column formatting convention & formats */ -int format_counters(struct thread_data *t, struct core_data *c, - struct pkg_data *p) +int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { double interval_float, tsc; char *fmt8; @@ -1065,17 +1051,16 @@ int format_counters(struct thread_data *t, struct core_data *c, char *delim = "\t"; int printed = 0; - /* if showing only 1st thread in core and this isn't one, bail out */ + /* if showing only 1st thread in core and this isn't one, bail out */ if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) return 0; - /* if showing only 1st thread in pkg and this isn't one, bail out */ + /* if showing only 1st thread in pkg and this isn't one, bail out */ if (show_pkg_only && !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) return 0; /*if not summary line and --cpu is used */ - if ((t != &average.threads) && - (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset))) + if ((t != &average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset))) return 0; if (DO_BIC(BIC_USEC)) { @@ -1090,7 +1075,7 @@ int format_counters(struct thread_data *t, struct core_data *c, if (DO_BIC(BIC_TOD)) outp += sprintf(outp, "%10ld.%06ld\t", t->tv_end.tv_sec, t->tv_end.tv_usec); - interval_float = t->tv_delta.tv_sec + t->tv_delta.tv_usec/1000000.0; + interval_float = t->tv_delta.tv_sec + t->tv_delta.tv_usec / 1000000.0; tsc = t->tsc * tsc_tweak; @@ -1126,11 +1111,9 @@ int format_counters(struct thread_data *t, struct core_data *c, if (DO_BIC(BIC_Node)) { if (t) outp += sprintf(outp, "%s%d", - (printed++ ? delim : ""), - cpus[t->cpu_id].physical_node_id); + (printed++ ? delim : ""), cpus[t->cpu_id].physical_node_id); else - outp += sprintf(outp, "%s-", - (printed++ ? delim : "")); + outp += sprintf(outp, "%s-", (printed++ ? delim : "")); } if (DO_BIC(BIC_Core)) { if (c) @@ -1147,22 +1130,22 @@ int format_counters(struct thread_data *t, struct core_data *c, } if (DO_BIC(BIC_Avg_MHz)) - outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), - 1.0 / units * t->aperf / interval_float); + outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 / units * t->aperf / interval_float); if (DO_BIC(BIC_Busy)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf / tsc); if (DO_BIC(BIC_Bzy_MHz)) { if (has_base_hz) - outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf); + outp += + sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf); else outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), - tsc / units * t->aperf / t->mperf / interval_float); + tsc / units * t->aperf / t->mperf / interval_float); } if (DO_BIC(BIC_TSC_MHz)) - outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc/units/interval_float); + outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc / units / interval_float); if (DO_BIC(BIC_IPC)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 1.0 * t->instr_count / t->aperf); @@ -1183,7 +1166,8 @@ int format_counters(struct thread_data *t, struct core_data *c, for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) { if (mp->width == 32) - outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int) t->counter[i]); + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)t->counter[i]); else outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->counter[i]); } else if (mp->format == FORMAT_DELTA) { @@ -1193,27 +1177,28 @@ int format_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->counter[i]); } else if (mp->format == FORMAT_PERCENT) { if (mp->type == COUNTER_USEC) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), t->counter[i]/interval_float/10000); + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + t->counter[i] / interval_float / 10000); else - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i]/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i] / tsc); } } /* C1 */ if (DO_BIC(BIC_CPU_c1)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1/tsc); - + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc); /* print per-core data only for 1st thread in core */ if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) goto done; if (DO_BIC(BIC_CPU_c3)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3 / tsc); if (DO_BIC(BIC_CPU_c6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6 / tsc); if (DO_BIC(BIC_CPU_c7)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7 / tsc); /* Mod%c6 */ if (DO_BIC(BIC_Mod_c6)) @@ -1225,7 +1210,8 @@ int format_counters(struct thread_data *t, struct core_data *c, for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) { if (mp->width == 32) - outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int) c->counter[i]); + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)c->counter[i]); else outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->counter[i]); } else if (mp->format == FORMAT_DELTA) { @@ -1234,14 +1220,15 @@ int format_counters(struct thread_data *t, struct core_data *c, else outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->counter[i]); } else if (mp->format == FORMAT_PERCENT) { - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i]/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i] / tsc); } } fmt8 = "%s%.2f"; if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); @@ -1259,7 +1246,7 @@ int format_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "%s**.**", (printed++ ? delim : "")); } else { outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), - p->gfx_rc6_ms / 10.0 / interval_float); + p->gfx_rc6_ms / 10.0 / interval_float); } } @@ -1273,42 +1260,49 @@ int format_counters(struct thread_data *t, struct core_data *c, /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */ if (DO_BIC(BIC_Totl_c0)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc); if (DO_BIC(BIC_Any_c0)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0 / tsc); if (DO_BIC(BIC_GFX_c0)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0 / tsc); if (DO_BIC(BIC_CPUGFX)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0 / tsc); if (DO_BIC(BIC_Pkgpc2)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2 / tsc); if (DO_BIC(BIC_Pkgpc3)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3 / tsc); if (DO_BIC(BIC_Pkgpc6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc6/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc6 / tsc); if (DO_BIC(BIC_Pkgpc7)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7 / tsc); if (DO_BIC(BIC_Pkgpc8)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8 / tsc); if (DO_BIC(BIC_Pkgpc9)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9 / tsc); if (DO_BIC(BIC_Pkgpc10)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); if (DO_BIC(BIC_CPU_LPI)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float); + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float); if (DO_BIC(BIC_SYS_LPI)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float); + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float); if (DO_BIC(BIC_PkgWatt)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); if (DO_BIC(BIC_GFXWatt)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float); if (DO_BIC(BIC_RAMWatt)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), + p->energy_dram * rapl_dram_energy_units / interval_float); if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units); if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) @@ -1318,14 +1312,19 @@ int format_counters(struct thread_data *t, struct core_data *c, if (DO_BIC(BIC_RAM_J)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units); if (DO_BIC(BIC_PKG__)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), + 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); if (DO_BIC(BIC_RAM__)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), + 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) { if (mp->width == 32) - outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int) p->counter[i]); + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)p->counter[i]); else outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->counter[i]); } else if (mp->format == FORMAT_DELTA) { @@ -1334,7 +1333,7 @@ int format_counters(struct thread_data *t, struct core_data *c, else outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]); } else if (mp->format == FORMAT_PERCENT) { - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i]/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc); } } @@ -1359,12 +1358,14 @@ void flush_output_stdout(void) outp = output_buffer; } + void flush_output_stderr(void) { fputs(output_buffer, outf); fflush(outf); outp = output_buffer; } + void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { static int printed; @@ -1385,13 +1386,11 @@ void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_ #define DELTA_WRAP32(new, old) \ old = ((((unsigned long long)new << 32) - ((unsigned long long)old << 32)) >> 32); -int -delta_package(struct pkg_data *new, struct pkg_data *old) +int delta_package(struct pkg_data *new, struct pkg_data *old) { int i; struct msr_counter *mp; - if (DO_BIC(BIC_Totl_c0)) old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0; if (DO_BIC(BIC_Any_c0)) @@ -1416,7 +1415,7 @@ delta_package(struct pkg_data *new, struct pkg_data *old) old->pkg_temp_c = new->pkg_temp_c; /* flag an error when rc6 counter resets/wraps */ - if (old->gfx_rc6_ms > new->gfx_rc6_ms) + if (old->gfx_rc6_ms > new->gfx_rc6_ms) old->gfx_rc6_ms = -1; else old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms; @@ -1441,8 +1440,7 @@ delta_package(struct pkg_data *new, struct pkg_data *old) return 0; } -void -delta_core(struct core_data *new, struct core_data *old) +void delta_core(struct core_data *new, struct core_data *old) { int i; struct msr_counter *mp; @@ -1474,9 +1472,7 @@ int soft_c1_residency_display(int bic) /* * old = new - old */ -int -delta_thread(struct thread_data *new, struct thread_data *old, - struct core_data *core_delta) +int delta_thread(struct thread_data *new, struct thread_data *old, struct core_data *core_delta) { int i; struct msr_counter *mp; @@ -1507,8 +1503,7 @@ delta_thread(struct thread_data *new, struct thread_data *old, old->c1 = new->c1 - old->c1; - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || - soft_c1_residency_display(BIC_Avg_MHz)) { + if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) { if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) { old->aperf = new->aperf - old->aperf; old->mperf = new->mperf - old->mperf; @@ -1517,7 +1512,6 @@ delta_thread(struct thread_data *new, struct thread_data *old, } } - if (use_c1_residency_msr) { /* * Some models have a dedicated C1 residency MSR, @@ -1534,7 +1528,7 @@ delta_thread(struct thread_data *new, struct thread_data *old, else { /* normal case, derive c1 */ old->c1 = (old->tsc * tsc_tweak) - old->mperf - core_delta->c3 - - core_delta->c6 - core_delta->c7; + - core_delta->c6 - core_delta->c7; } } @@ -1563,8 +1557,7 @@ delta_thread(struct thread_data *new, struct thread_data *old, } int delta_cpu(struct thread_data *t, struct core_data *c, - struct pkg_data *p, struct thread_data *t2, - struct core_data *c2, struct pkg_data *p2) + struct pkg_data *p, struct thread_data *t2, struct core_data *c2, struct pkg_data *p2) { int retval = 0; @@ -1587,7 +1580,7 @@ int delta_cpu(struct thread_data *t, struct core_data *c, void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; - struct msr_counter *mp; + struct msr_counter *mp; t->tv_begin.tv_sec = 0; t->tv_begin.tv_usec = 0; @@ -1654,8 +1647,8 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) p->counter[i] = 0; } -int sum_counters(struct thread_data *t, struct core_data *c, - struct pkg_data *p) + +int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; struct msr_counter *mp; @@ -1756,12 +1749,12 @@ int sum_counters(struct thread_data *t, struct core_data *c, } return 0; } + /* * sum the counters for all cpus in the system * compute the weighted average */ -void compute_average(struct thread_data *t, struct core_data *c, - struct pkg_data *p) +void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; struct msr_counter *mp; @@ -1842,7 +1835,7 @@ static unsigned long long rdtsc(void) { unsigned int low, high; - asm volatile("rdtsc" : "=a" (low), "=d" (high)); + asm volatile ("rdtsc":"=a" (low), "=d"(high)); return low | ((unsigned long long)high) << 32; } @@ -1858,6 +1851,7 @@ FILE *fopen_or_die(const char *path, const char *mode) err(1, "%s: open failed", path); return filep; } + /* * snapshot_sysfs_counter() * @@ -1889,8 +1883,7 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) char path[128 + PATH_BYTES]; if (mp->flags & SYSFS_PERCPU) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", - cpu, mp->path); + sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->path); *counterp = snapshot_sysfs_counter(path); } else { @@ -1950,7 +1943,7 @@ void get_apic_id(struct thread_data *t) eax = ebx = ecx = edx = 0; __cpuid(0x80000001, eax, ebx, ecx, edx); - topology_extensions = ecx & (1 << 22); + topology_extensions = ecx & (1 << 22); if (topology_extensions == 0) return; @@ -1973,8 +1966,7 @@ void get_apic_id(struct thread_data *t) t->x2apic_id = edx; if (debug && (t->apic_id != (t->x2apic_id & 0xff))) - fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n", - t->cpu_id, t->apic_id, t->x2apic_id); + fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n", t->cpu_id, t->apic_id, t->x2apic_id); } /* @@ -2002,8 +1994,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) retry: t->tsc = rdtsc(); /* we are running on local CPU of interest */ - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || - soft_c1_residency_display(BIC_Avg_MHz)) { + if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) { unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; /* @@ -2050,8 +2041,7 @@ retry: if (aperf_mperf_retry_count < 5) goto retry; else - warnx("cpu%d jitter %lld %lld", - cpu, aperf_time, mperf_time); + warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); } aperf_mperf_retry_count = 0; @@ -2252,47 +2242,64 @@ done: * (>= PCL__7) and to index pkg_cstate_limit_strings[]. */ -#define PCLUKN 0 /* Unknown */ -#define PCLRSV 1 /* Reserved */ -#define PCL__0 2 /* PC0 */ -#define PCL__1 3 /* PC1 */ -#define PCL__2 4 /* PC2 */ -#define PCL__3 5 /* PC3 */ -#define PCL__4 6 /* PC4 */ -#define PCL__6 7 /* PC6 */ -#define PCL_6N 8 /* PC6 No Retention */ -#define PCL_6R 9 /* PC6 Retention */ -#define PCL__7 10 /* PC7 */ -#define PCL_7S 11 /* PC7 Shrink */ -#define PCL__8 12 /* PC8 */ -#define PCL__9 13 /* PC9 */ -#define PCL_10 14 /* PC10 */ -#define PCLUNL 15 /* Unlimited */ +#define PCLUKN 0 /* Unknown */ +#define PCLRSV 1 /* Reserved */ +#define PCL__0 2 /* PC0 */ +#define PCL__1 3 /* PC1 */ +#define PCL__2 4 /* PC2 */ +#define PCL__3 5 /* PC3 */ +#define PCL__4 6 /* PC4 */ +#define PCL__6 7 /* PC6 */ +#define PCL_6N 8 /* PC6 No Retention */ +#define PCL_6R 9 /* PC6 Retention */ +#define PCL__7 10 /* PC7 */ +#define PCL_7S 11 /* PC7 Shrink */ +#define PCL__8 12 /* PC8 */ +#define PCL__9 13 /* PC9 */ +#define PCL_10 14 /* PC10 */ +#define PCLUNL 15 /* Unlimited */ int pkg_cstate_limit = PCLUKN; char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2", - "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited"}; - -int nhm_pkg_cstate_limits[16] = {PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int snb_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int hsw_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int slv_pkg_cstate_limits[16] = {PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7}; -int amt_pkg_cstate_limits[16] = {PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int phi_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int glm_pkg_cstate_limits[16] = {PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int skx_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int icx_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; + "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited" +}; -static void -calculate_tsc_tweak() +int nhm_pkg_cstate_limits[16] = + { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int snb_pkg_cstate_limits[16] = + { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int hsw_pkg_cstate_limits[16] = + { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int slv_pkg_cstate_limits[16] = + { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCL__6, PCL__7 }; +int amt_pkg_cstate_limits[16] = + { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int phi_pkg_cstate_limits[16] = + { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int glm_pkg_cstate_limits[16] = + { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int skx_pkg_cstate_limits[16] = + { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int icx_pkg_cstate_limits[16] = + { PCL__0, PCL__2, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; + +static void calculate_tsc_tweak() { tsc_tweak = base_hz / tsc_hz; } void prewake_cstate_probe(unsigned int family, unsigned int model); -static void -dump_nhm_platform_info(void) +static void dump_nhm_platform_info(void) { unsigned long long msr; unsigned int ratio; @@ -2302,12 +2309,10 @@ dump_nhm_platform_info(void) fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr); ratio = (msr >> 40) & 0xFF; - fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n", ratio, bclk, ratio * bclk); ratio = (msr >> 8) & 0xFF; - fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk); get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", @@ -2315,14 +2320,12 @@ dump_nhm_platform_info(void) /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */ if (dis_cstate_prewake) - fprintf(outf, "C-state Pre-wake: %sabled\n", - msr & 0x40000000 ? "DIS" : "EN"); + fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? "DIS" : "EN"); return; } -static void -dump_hsw_turbo_ratio_limits(void) +static void dump_hsw_turbo_ratio_limits(void) { unsigned long long msr; unsigned int ratio; @@ -2333,18 +2336,15 @@ dump_hsw_turbo_ratio_limits(void) ratio = (msr >> 8) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 18 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 18 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 0) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 17 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 17 active cores\n", ratio, bclk, ratio * bclk); return; } -static void -dump_ivt_turbo_ratio_limits(void) +static void dump_ivt_turbo_ratio_limits(void) { unsigned long long msr; unsigned int ratio; @@ -2355,45 +2355,38 @@ dump_ivt_turbo_ratio_limits(void) ratio = (msr >> 56) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 16 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 16 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 48) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 15 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 15 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 40) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 14 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 14 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 32) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 13 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 13 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 24) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 12 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 12 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 16) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 11 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 11 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 8) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 10 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 10 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 0) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 9 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 9 active cores\n", ratio, bclk, ratio * bclk); return; } + int has_turbo_ratio_group_limits(int family, int model) { @@ -2411,8 +2404,7 @@ int has_turbo_ratio_group_limits(int family, int model) return 0; } -static void -dump_turbo_ratio_limits(int family, int model) +static void dump_turbo_ratio_limits(int family, int model) { unsigned long long msr, core_counts; unsigned int ratio, group_size; @@ -2477,8 +2469,7 @@ dump_turbo_ratio_limits(int family, int model) return; } -static void -dump_atom_turbo_ratio_limits(void) +static void dump_atom_turbo_ratio_limits(void) { unsigned long long msr; unsigned int ratio; @@ -2488,45 +2479,37 @@ dump_atom_turbo_ratio_limits(void) ratio = (msr >> 0) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz minimum operating frequency\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz minimum operating frequency\n", ratio, bclk, ratio * bclk); ratio = (msr >> 8) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz low frequency mode (LFM)\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz low frequency mode (LFM)\n", ratio, bclk, ratio * bclk); ratio = (msr >> 16) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk); get_msr(base_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr); fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF); ratio = (msr >> 24) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 4 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 4 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 16) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 3 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 3 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 8) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 2 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 2 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 0) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 1 active core\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 1 active core\n", ratio, bclk, ratio * bclk); } -static void -dump_knl_turbo_ratio_limits(void) +static void dump_knl_turbo_ratio_limits(void) { const unsigned int buckets_no = 7; @@ -2538,8 +2521,7 @@ dump_knl_turbo_ratio_limits(void) get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr); - fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", - base_cpu, msr); + fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); /* * Turbo encoding in KNL is as follows: @@ -2584,8 +2566,7 @@ dump_knl_turbo_ratio_limits(void) ratio[i], bclk, ratio[i] * bclk, cores[i]); } -static void -dump_nhm_cst_cfg(void) +static void dump_nhm_cst_cfg(void) { unsigned long long msr; @@ -2598,14 +2579,11 @@ dump_nhm_cst_cfg(void) (msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "", (msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "", (msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "", - (msr & (1 << 15)) ? "" : "UN", - (unsigned int)msr & 0xF, - pkg_cstate_limit_strings[pkg_cstate_limit]); + (msr & (1 << 15)) ? "" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]); #define AUTOMATIC_CSTATE_CONVERSION (1UL << 16) if (has_automatic_cstate_conversion) { - fprintf(outf, ", automatic c-state conversion=%s", - (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off"); + fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off"); } fprintf(outf, ")\n"); @@ -2613,8 +2591,7 @@ dump_nhm_cst_cfg(void) return; } -static void -dump_config_tdp(void) +static void dump_config_tdp(void) { unsigned long long msr; @@ -2656,7 +2633,7 @@ dump_config_tdp(void) fprintf(outf, ")\n"); } -unsigned int irtl_time_units[] = {1, 32, 1024, 32768, 1048576, 33554432, 0, 0 }; +unsigned int irtl_time_units[] = { 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 }; void print_irtl(void) { @@ -2696,6 +2673,7 @@ void print_irtl(void) (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } + void free_fd_percpu(void) { int i; @@ -2752,7 +2730,6 @@ void free_all_buffers(void) free(cpus); } - /* * Parse a file containing a single int. * Return 0 if file can not be opened @@ -2827,8 +2804,7 @@ void set_node_data(void) * the logical_node_id */ for (cpux = cpu; cpux <= topo.max_cpu_num; cpux++) { - if ((cpus[cpux].physical_package_id == pkg) && - (cpus[cpux].physical_node_id == node)) { + if ((cpus[cpux].physical_package_id == pkg) && (cpus[cpux].physical_node_id == node)) { cpus[cpux].logical_node_id = lnode; cpu_count++; } @@ -2850,8 +2826,7 @@ int get_physical_node_id(struct cpu_topology *thiscpu) int cpu = thiscpu->logical_cpu_id; for (i = 0; i <= topo.max_cpu_num; i++) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", - cpu, i); + sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", cpu, i); filep = fopen(path, "r"); if (!filep) continue; @@ -2881,8 +2856,7 @@ int get_thread_siblings(struct cpu_topology *thiscpu) size = CPU_ALLOC_SIZE((topo.max_cpu_num + 1)); CPU_ZERO_S(size, thiscpu->put_ids); - sprintf(path, - "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu); + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu); filep = fopen(path, "r"); if (!filep) { @@ -2899,10 +2873,8 @@ int get_thread_siblings(struct cpu_topology *thiscpu) sib_core = get_core_id(so); if (sib_core == thiscpu->physical_core_id) { CPU_SET_S(so, size, thiscpu->put_ids); - if ((so != cpu) && - (cpus[so].thread_id < 0)) - cpus[so].thread_id = - thread_id++; + if ((so != cpu) && (cpus[so].thread_id < 0)) + cpus[so].thread_id = thread_id++; } } } @@ -2917,41 +2889,31 @@ int get_thread_siblings(struct cpu_topology *thiscpu) * skip non-present cpus */ -int for_all_cpus_2(int (func)(struct thread_data *, struct core_data *, - struct pkg_data *, struct thread_data *, struct core_data *, - struct pkg_data *), struct thread_data *thread_base, - struct core_data *core_base, struct pkg_data *pkg_base, - struct thread_data *thread_base2, struct core_data *core_base2, - struct pkg_data *pkg_base2) +int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *, + struct pkg_data *, struct thread_data *, struct core_data *, + struct pkg_data *), struct thread_data *thread_base, + struct core_data *core_base, struct pkg_data *pkg_base, + struct thread_data *thread_base2, struct core_data *core_base2, struct pkg_data *pkg_base2) { int retval, pkg_no, node_no, core_no, thread_no; for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) { for (node_no = 0; node_no < topo.nodes_per_pkg; ++node_no) { - for (core_no = 0; core_no < topo.cores_per_node; - ++core_no) { - for (thread_no = 0; thread_no < - topo.threads_per_core; ++thread_no) { + for (core_no = 0; core_no < topo.cores_per_node; ++core_no) { + for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) { struct thread_data *t, *t2; struct core_data *c, *c2; struct pkg_data *p, *p2; - t = GET_THREAD(thread_base, thread_no, - core_no, node_no, - pkg_no); + t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); if (cpu_is_not_present(t->cpu_id)) continue; - t2 = GET_THREAD(thread_base2, thread_no, - core_no, node_no, - pkg_no); + t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no); - c = GET_CORE(core_base, core_no, - node_no, pkg_no); - c2 = GET_CORE(core_base2, core_no, - node_no, - pkg_no); + c = GET_CORE(core_base, core_no, node_no, pkg_no); + c2 = GET_CORE(core_base2, core_no, node_no, pkg_no); p = GET_PKG(pkg_base, pkg_no); p2 = GET_PKG(pkg_base2, pkg_no); @@ -2970,7 +2932,7 @@ int for_all_cpus_2(int (func)(struct thread_data *, struct core_data *, * run func(cpu) on every cpu in /proc/stat * return max_cpu number */ -int for_all_proc_cpus(int (func)(int)) +int for_all_proc_cpus(int (func) (int)) { FILE *fp; int cpu_num; @@ -2990,7 +2952,7 @@ int for_all_proc_cpus(int (func)(int)) retval = func(cpu_num); if (retval) { fclose(fp); - return(retval); + return (retval); } } fclose(fp); @@ -3014,16 +2976,14 @@ void set_max_cpu_num(void) base_cpu = sched_getcpu(); if (base_cpu < 0) err(1, "cannot find calling cpu ID"); - sprintf(pathname, - "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", - base_cpu); + sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", base_cpu); filep = fopen_or_die(pathname, "r"); topo.max_cpu_num = 0; while (fscanf(filep, "%lx,", &dummy) == 1) topo.max_cpu_num += BITMASK_SIZE; fclose(filep); - topo.max_cpu_num--; /* 0 based */ + topo.max_cpu_num--; /* 0 based */ } /* @@ -3035,6 +2995,7 @@ int count_cpus(int cpu) topo.num_cpus++; return 0; } + int mark_cpu_present(int cpu) { CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set); @@ -3104,12 +3065,12 @@ int snapshot_proc_interrupts(void) } - while (getc(fp) != '\n') - ; /* flush interrupt description */ + while (getc(fp) != '\n') ; /* flush interrupt description */ } return 0; } + /* * snapshot_gfx_rc6_ms() * @@ -3133,6 +3094,7 @@ int snapshot_gfx_rc6_ms(void) return 0; } + /* * snapshot_gfx_mhz() * @@ -3212,6 +3174,7 @@ int snapshot_cpu_lpi_us(void) return 0; } + /* * snapshot_sys_lpi() * @@ -3235,6 +3198,7 @@ int snapshot_sys_lpi_us(void) return 0; } + /* * snapshot /proc and /sys files * @@ -3266,7 +3230,7 @@ int snapshot_proc_sysfs_files(void) int exit_requested; -static void signal_handler (int signal) +static void signal_handler(int signal) { switch (signal) { case SIGINT: @@ -3373,8 +3337,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg continue; ret = get_msr(cpu, offset, &msr_cur); if (ret) { - fprintf(outf, "Can not update msr(0x%llx)\n", - (unsigned long long)offset); + fprintf(outf, "Can not update msr(0x%llx)\n", (unsigned long long)offset); continue; } @@ -3387,8 +3350,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg return 0; } -static void -msr_record_handler(union sigval v) +static void msr_record_handler(union sigval v) { for_all_cpus(update_msr_sum, EVEN_COUNTERS); } @@ -3433,9 +3395,9 @@ void msr_sum_record(void) } return; - release_timer: +release_timer: timer_delete(timerid); - release_msr: +release_msr: free(per_cpu_msr_sum); } @@ -3527,7 +3489,7 @@ void check_dev_msr() sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (stat(pathname, &sb)) - if (system("/sbin/modprobe msr > /dev/null 2>&1")) + if (system("/sbin/modprobe msr > /dev/null 2>&1")) err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" "); } @@ -3549,8 +3511,7 @@ int check_for_cap_sys_rawio(void) err(-6, "cap_get\n"); if (cap_flag_value != CAP_SET) { - warnx("capget(CAP_SYS_RAWIO) failed," - " try \"# setcap cap_sys_rawio=ep %s\"", progname); + warnx("capget(CAP_SYS_RAWIO) failed," " try \"# setcap cap_sys_rawio=ep %s\"", progname); return 1; } @@ -3559,6 +3520,7 @@ int check_for_cap_sys_rawio(void) return 0; } + void check_permissions(void) { int do_exit = 0; @@ -3664,7 +3626,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ + case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ pkg_cstate_limits = glm_pkg_cstate_limits; break; default: @@ -3680,6 +3642,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) has_base_hz = 1; return 1; } + /* * SLV client has support for unique MSRs: * @@ -3700,6 +3663,7 @@ int has_slv_msrs(unsigned int family, unsigned int model) } return 0; } + int is_dnv(unsigned int family, unsigned int model) { @@ -3712,6 +3676,7 @@ int is_dnv(unsigned int family, unsigned int model) } return 0; } + int is_bdx(unsigned int family, unsigned int model) { @@ -3724,6 +3689,7 @@ int is_bdx(unsigned int family, unsigned int model) } return 0; } + int is_skx(unsigned int family, unsigned int model) { @@ -3761,6 +3727,7 @@ int is_ehl(unsigned int family, unsigned int model) } return 0; } + int is_jvl(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3779,7 +3746,7 @@ int has_turbo_ratio_limit(unsigned int family, unsigned int model) return 0; switch (model) { - /* Nehalem compatible, but do not include turbo-ratio limit support */ + /* Nehalem compatible, but do not include turbo-ratio limit support */ case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ case INTEL_FAM6_XEON_PHI_KNL: /* PHI - Knights Landing (different MSR definition) */ return 0; @@ -3787,6 +3754,7 @@ int has_turbo_ratio_limit(unsigned int family, unsigned int model) return 1; } } + int has_atom_turbo_ratio_limit(unsigned int family, unsigned int model) { if (has_slv_msrs(family, model)) @@ -3794,6 +3762,7 @@ int has_atom_turbo_ratio_limit(unsigned int family, unsigned int model) return 0; } + int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3810,6 +3779,7 @@ int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model) return 0; } } + int has_hsw_turbo_ratio_limit(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3841,6 +3811,7 @@ int has_knl_turbo_ratio_limit(unsigned int family, unsigned int model) return 0; } } + int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3858,6 +3829,7 @@ int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model) return 0; } } + int has_config_tdp(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3921,8 +3893,7 @@ void check_tcc_offset(int model) } } -static void -remove_underbar(char *s) +static void remove_underbar(char *s) { char *to = s; @@ -3935,8 +3906,7 @@ remove_underbar(char *s) *to = 0; } -static void -dump_cstate_pstate_config_info(unsigned int family, unsigned int model) +static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model) { if (!do_nhm_platform_info) return; @@ -3981,8 +3951,8 @@ static void dump_sysfs_file(char *path) fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); } -static void -dump_sysfs_cstate_config(void) + +static void dump_sysfs_cstate_config(void) { char path[64]; char name_buf[16]; @@ -4002,15 +3972,14 @@ dump_sysfs_cstate_config(void) for (state = 0; state < 10; ++state) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", - base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; if (!fgets(name_buf, sizeof(name_buf), input)) err(1, "%s: failed to read file", path); - /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ + /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ sp = strchr(name_buf, '-'); if (!sp) sp = strchrnul(name_buf, '\n'); @@ -4019,8 +3988,7 @@ dump_sysfs_cstate_config(void) remove_underbar(name_buf); - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", - base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; @@ -4031,8 +3999,8 @@ dump_sysfs_cstate_config(void) fclose(input); } } -static void -dump_sysfs_pstate_config(void) + +static void dump_sysfs_pstate_config(void) { char path[64]; char driver_buf[64]; @@ -4040,8 +4008,7 @@ dump_sysfs_pstate_config(void) FILE *input; int turbo; - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", - base_cpu); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", base_cpu); input = fopen(path, "r"); if (input == NULL) { fprintf(outf, "NSFOD %s\n", path); @@ -4051,8 +4018,7 @@ dump_sysfs_pstate_config(void) err(1, "%s: failed to read file", path); fclose(input); - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", - base_cpu); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", base_cpu); input = fopen(path, "r"); if (input == NULL) { fprintf(outf, "NSFOD %s\n", path); @@ -4084,7 +4050,6 @@ dump_sysfs_pstate_config(void) } } - /* * print_epb() * Decode the ENERGY_PERF_BIAS MSR @@ -4130,6 +4095,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } + /* * print_hwp() * Decode the MSR_HWP_CAPABILITIES @@ -4156,8 +4122,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PM_ENABLE, &msr)) return 0; - fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", - cpu, msr, (msr & (1 << 0)) ? "" : "No-"); + fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", cpu, msr, (msr & (1 << 0)) ? "" : "No-"); /* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */ if ((msr & (1 << 0)) == 0) @@ -4167,25 +4132,23 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; fprintf(outf, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx " - "(high %d guar %d eff %d low %d)\n", - cpu, msr, - (unsigned int)HWP_HIGHEST_PERF(msr), - (unsigned int)HWP_GUARANTEED_PERF(msr), - (unsigned int)HWP_MOSTEFFICIENT_PERF(msr), - (unsigned int)HWP_LOWEST_PERF(msr)); + "(high %d guar %d eff %d low %d)\n", + cpu, msr, + (unsigned int)HWP_HIGHEST_PERF(msr), + (unsigned int)HWP_GUARANTEED_PERF(msr), + (unsigned int)HWP_MOSTEFFICIENT_PERF(msr), (unsigned int)HWP_LOWEST_PERF(msr)); if (get_msr(cpu, MSR_HWP_REQUEST, &msr)) return 0; fprintf(outf, "cpu%d: MSR_HWP_REQUEST: 0x%08llx " - "(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n", - cpu, msr, - (unsigned int)(((msr) >> 0) & 0xff), - (unsigned int)(((msr) >> 8) & 0xff), - (unsigned int)(((msr) >> 16) & 0xff), - (unsigned int)(((msr) >> 24) & 0xff), - (unsigned int)(((msr) >> 32) & 0xff3), - (unsigned int)(((msr) >> 42) & 0x1)); + "(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n", + cpu, msr, + (unsigned int)(((msr) >> 0) & 0xff), + (unsigned int)(((msr) >> 8) & 0xff), + (unsigned int)(((msr) >> 16) & 0xff), + (unsigned int)(((msr) >> 24) & 0xff), + (unsigned int)(((msr) >> 32) & 0xff3), (unsigned int)(((msr) >> 42) & 0x1)); if (has_hwp_pkg) { if (get_msr(cpu, MSR_HWP_REQUEST_PKG, &msr)) @@ -4197,8 +4160,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) (unsigned int)(((msr) >> 0) & 0xff), (unsigned int)(((msr) >> 8) & 0xff), (unsigned int)(((msr) >> 16) & 0xff), - (unsigned int)(((msr) >> 24) & 0xff), - (unsigned int)(((msr) >> 32) & 0xff3)); + (unsigned int)(((msr) >> 24) & 0xff), (unsigned int)(((msr) >> 32) & 0xff3)); } if (has_hwp_notify) { if (get_msr(cpu, MSR_HWP_INTERRUPT, &msr)) @@ -4206,18 +4168,14 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) fprintf(outf, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx " "(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n", - cpu, msr, - ((msr) & 0x1) ? "EN" : "Dis", - ((msr) & 0x2) ? "EN" : "Dis"); + cpu, msr, ((msr) & 0x1) ? "EN" : "Dis", ((msr) & 0x2) ? "EN" : "Dis"); } if (get_msr(cpu, MSR_HWP_STATUS, &msr)) return 0; fprintf(outf, "cpu%d: MSR_HWP_STATUS: 0x%08llx " - "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n", - cpu, msr, - ((msr) & 0x1) ? "" : "No-", - ((msr) & 0x2) ? "" : "No-"); + "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n", + cpu, msr, ((msr) & 0x1) ? "" : "No-", ((msr) & 0x2) ? "" : "No-"); return 0; } @@ -4257,8 +4215,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 5) ? "Auto-HWP, " : "", (msr & 1 << 4) ? "Graphics, " : "", (msr & 1 << 2) ? "bit2, " : "", - (msr & 1 << 1) ? "ThermStatus, " : "", - (msr & 1 << 0) ? "PROCHOT, " : ""); + (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 0) ? "PROCHOT, " : ""); fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n", (msr & 1 << 31) ? "bit31, " : "", (msr & 1 << 30) ? "bit30, " : "", @@ -4272,8 +4229,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 21) ? "Auto-HWP, " : "", (msr & 1 << 20) ? "Graphics, " : "", (msr & 1 << 18) ? "bit18, " : "", - (msr & 1 << 17) ? "ThermStatus, " : "", - (msr & 1 << 16) ? "PROCHOT, " : ""); + (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : ""); } if (do_gfx_perf_limit_reasons) { @@ -4286,8 +4242,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 6) ? "VR-Therm, " : "", (msr & 1 << 8) ? "Amps, " : "", (msr & 1 << 9) ? "GFXPwr, " : "", - (msr & 1 << 10) ? "PkgPwrL1, " : "", - (msr & 1 << 11) ? "PkgPwrL2, " : ""); + (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : ""); fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s)\n", (msr & 1 << 16) ? "PROCHOT, " : "", (msr & 1 << 17) ? "ThermStatus, " : "", @@ -4295,8 +4250,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 22) ? "VR-Therm, " : "", (msr & 1 << 24) ? "Amps, " : "", (msr & 1 << 25) ? "GFXPwr, " : "", - (msr & 1 << 26) ? "PkgPwrL1, " : "", - (msr & 1 << 27) ? "PkgPwrL2, " : ""); + (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : ""); } if (do_ring_perf_limit_reasons) { get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr); @@ -4306,21 +4260,19 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 6) ? "VR-Therm, " : "", (msr & 1 << 8) ? "Amps, " : "", - (msr & 1 << 10) ? "PkgPwrL1, " : "", - (msr & 1 << 11) ? "PkgPwrL2, " : ""); + (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : ""); fprintf(outf, " (Logged: %s%s%s%s%s%s)\n", (msr & 1 << 16) ? "PROCHOT, " : "", (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 22) ? "VR-Therm, " : "", (msr & 1 << 24) ? "Amps, " : "", - (msr & 1 << 26) ? "PkgPwrL1, " : "", - (msr & 1 << 27) ? "PkgPwrL2, " : ""); + (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : ""); } return 0; } #define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ -#define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ +#define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ double get_tdp_intel(unsigned int model) { @@ -4349,8 +4301,7 @@ double get_tdp_amd(unsigned int family) * rapl_dram_energy_units_probe() * Energy units are either hard-coded, or come from RAPL Energy Unit MSR. */ -static double -rapl_dram_energy_units_probe(int model, double rapl_energy_units) +static double rapl_dram_energy_units_probe(int model, double rapl_energy_units) { /* only called for genuine_intel, family 6 */ @@ -4402,7 +4353,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model) BIC_PRESENT(BIC_PkgWatt); break; case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = + RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS + | RAPL_GFX | RAPL_PKG_POWER_INFO; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4425,7 +4378,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = + RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS + | RAPL_GFX | RAPL_PKG_POWER_INFO; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4445,7 +4400,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_ICELAKE_X: /* ICX */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; + do_rapl = + RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | + RAPL_PKG_POWER_INFO; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4458,7 +4415,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_SANDYBRIDGE_X: case INTEL_FAM6_IVYBRIDGE_X: - do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS | RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO; + do_rapl = + RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS | + RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4483,7 +4442,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model) } break; case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS; + do_rapl = + RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | + RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4600,8 +4561,7 @@ void perf_limit_reasons_probe(unsigned int family, unsigned int model) void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) { - if (is_skx(family, model) || is_bdx(family, model) || - is_icx(family, model)) + if (is_skx(family, model) || is_bdx(family, model) || is_icx(family, model)) has_automatic_cstate_conversion = 1; } @@ -4636,8 +4596,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; dts = (msr >> 16) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", - cpu, msr, tj_max - dts); + fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts); if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) return 0; @@ -4648,7 +4607,6 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p cpu, msr, tj_max - dts, tj_max - dts2); } - if (do_dts && debug) { unsigned int resolution; @@ -4678,7 +4636,7 @@ void print_power_limit_msr(int cpu, unsigned long long msr, char *label) cpu, label, ((msr >> 15) & 1) ? "EN" : "DIS", ((msr >> 0) & 0x7FFF) * rapl_power_units, - (1.0 + (((msr >> 22) & 0x3)/4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units, + (1.0 + (((msr >> 22) & 0x3) / 4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units, (((msr >> 16) & 1) ? "EN" : "DIS")); return; @@ -4719,12 +4677,11 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (do_rapl & RAPL_PKG_POWER_INFO) { if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr)) - return -5; - + return -5; fprintf(outf, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", cpu, msr, - ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); @@ -4743,17 +4700,17 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu, ((msr >> 47) & 1) ? "EN" : "DIS", ((msr >> 32) & 0x7FFF) * rapl_power_units, - (1.0 + (((msr >> 54) & 0x3)/4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units, + (1.0 + (((msr >> 54) & 0x3) / 4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units, ((msr >> 48) & 1) ? "EN" : "DIS"); } if (do_rapl & RAPL_DRAM_POWER_INFO) { if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr)) - return -6; + return -6; fprintf(outf, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", cpu, msr, - ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); @@ -4762,7 +4719,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n", - cpu, msr, (msr >> 31) & 1 ? "" : "UN"); + cpu, msr, (msr >> 31) & 1 ? "" : "UN"); print_power_limit_msr(cpu, msr, "DRAM Limit"); } @@ -4776,7 +4733,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", - cpu, msr, (msr >> 31) & 1 ? "" : "UN"); + cpu, msr, (msr >> 31) & 1 ? "" : "UN"); print_power_limit_msr(cpu, msr, "Cores Limit"); } if (do_rapl & RAPL_GFX) { @@ -4788,7 +4745,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n", - cpu, msr, (msr >> 31) & 1 ? "" : "UN"); + cpu, msr, (msr >> 31) & 1 ? "" : "UN"); print_power_limit_msr(cpu, msr, "GFX Limit"); } return 0; @@ -4810,24 +4767,24 @@ int has_snb_msrs(unsigned int family, unsigned int model) switch (model) { case INTEL_FAM6_SANDYBRIDGE: case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE: /* IVB */ - case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_X: /* HSW */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ + case INTEL_FAM6_IVYBRIDGE: /* IVB */ + case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ + case INTEL_FAM6_HASWELL: /* HSW */ + case INTEL_FAM6_HASWELL_X: /* HSW */ + case INTEL_FAM6_HASWELL_L: /* HSW */ + case INTEL_FAM6_HASWELL_G: /* HSW */ + case INTEL_FAM6_BROADWELL: /* BDW */ + case INTEL_FAM6_BROADWELL_G: /* BDW */ + case INTEL_FAM6_BROADWELL_X: /* BDX */ + case INTEL_FAM6_SKYLAKE_L: /* SKL */ + case INTEL_FAM6_CANNONLAKE_L: /* CNL */ + case INTEL_FAM6_SKYLAKE_X: /* SKX */ + case INTEL_FAM6_ICELAKE_X: /* ICX */ + case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ + case INTEL_FAM6_ATOM_TREMONT: /* EHL */ + case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ return 1; } return 0; @@ -4913,7 +4870,7 @@ int is_cnl(unsigned int family, unsigned int model) return 0; switch (model) { - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ + case INTEL_FAM6_CANNONLAKE_L: /* CNL */ return 1; } @@ -4928,7 +4885,7 @@ unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model) } #define SLM_BCLK_FREQS 5 -double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0}; +double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 }; double slm_bclk(void) { @@ -4979,7 +4936,7 @@ int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) __cpuid(0x1a, eax, ebx, ecx, edx); eax = (eax >> 24) & 0xFF; - if (eax == 0x20 ) + if (eax == 0x20) t->is_atom = true; return 0; } @@ -5018,8 +4975,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk if (tj_max_override != 0) { tj_max = tj_max_override; - fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", - cpu, tj_max); + fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", cpu, tj_max); return 0; } @@ -5037,16 +4993,15 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk case 4: tcc_offset = (msr >> 24) & 0xF; fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); + cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); break; case 6: tcc_offset = (msr >> 24) & 0x3F; fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); + cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); break; default: - fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", - cpu, msr, tcc_default); + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default); break; } } @@ -5060,8 +5015,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk guess: tj_max = TJMAX_DEFAULT; - fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", - cpu, tj_max); + fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", cpu, tj_max); return 0; } @@ -5072,9 +5026,7 @@ void decode_feature_control_msr(void) if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr)) fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n", - base_cpu, msr, - msr & FEAT_CTL_LOCKED ? "" : "UN-", - msr & (1 << 18) ? "SGX" : ""); + base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : ""); } void decode_misc_enable_msr(void) @@ -5102,13 +5054,12 @@ void decode_misc_feature_control(void) return; if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr)) - fprintf(outf, "cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n", - base_cpu, msr, - msr & (0 << 0) ? "No-" : "", - msr & (1 << 0) ? "No-" : "", - msr & (2 << 0) ? "No-" : "", - msr & (3 << 0) ? "No-" : ""); + fprintf(outf, + "cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n", + base_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "", + msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : ""); } + /* * Decode MSR_MISC_PWR_MGMT * @@ -5129,10 +5080,9 @@ void decode_misc_pwr_mgmt_msr(void) if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr)) fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB %sable-OOB)\n", base_cpu, msr, - msr & (1 << 0) ? "DIS" : "EN", - msr & (1 << 1) ? "EN" : "DIS", - msr & (1 << 8) ? "EN" : "DIS"); + msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS"); } + /* * Decode MSR_CC6_DEMOTION_POLICY_CONFIG, MSR_MC6_DEMOTION_POLICY_CONFIG * @@ -5158,10 +5108,10 @@ void decode_c6_demotion_policy_msr(void) unsigned int intel_model_duplicates(unsigned int model) { - switch(model) { + switch (model) { case INTEL_FAM6_NEHALEM_EP: /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */ case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ - case 0x1F: /* Core i7 and i5 Processor - Nehalem */ + case 0x1F: /* Core i7 and i5 Processor - Nehalem */ case INTEL_FAM6_WESTMERE: /* Westmere Client - Clarkdale, Arrandale */ case INTEL_FAM6_WESTMERE_EP: /* Westmere EP - Gulftown */ return INTEL_FAM6_NEHALEM; @@ -5224,13 +5174,11 @@ void print_dev_latency(void) close(fd); return; } - fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", - value, value == 2000000000 ? "default" : "constrained"); + fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", value, value == 2000000000 ? "default" : "constrained"); close(fd); } - /* * Linux-perf manages the the HW instructions-retired counter * by enabling when requested, and hiding rollover @@ -5296,7 +5244,8 @@ void process_cpuid() if (!quiet) { fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n", - family, model, stepping, family, model, stepping, (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + family, model, stepping, family, model, stepping, + (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level); fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n", ecx_flags & (1 << 0) ? "SSE3" : "-", @@ -5307,8 +5256,7 @@ void process_cpuid() edx_flags & (1 << 4) ? "TSC" : "-", edx_flags & (1 << 5) ? "MSR" : "-", edx_flags & (1 << 22) ? "ACPI-TM" : "-", - edx_flags & (1 << 28) ? "HT" : "-", - edx_flags & (1 << 29) ? "TM" : "-"); + edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); } if (genuine_intel) { model_orig = model; @@ -5364,14 +5312,11 @@ void process_cpuid() has_hwp ? "" : "No-", has_hwp_notify ? "" : "No-", has_hwp_activity_window ? "" : "No-", - has_hwp_epp ? "" : "No-", - has_hwp_pkg ? "" : "No-", - has_epb ? "" : "No-"); + has_hwp_epp ? "" : "No-", has_hwp_pkg ? "" : "No-", has_epb ? "" : "No-"); if (!quiet) decode_misc_enable_msr(); - if (max_level >= 0x7 && !quiet) { int has_sgx; @@ -5403,7 +5348,7 @@ void process_cpuid() eax_crystal, ebx_tsc, crystal_hz); if (crystal_hz == 0) - switch(model) { + switch (model) { case INTEL_FAM6_SKYLAKE_L: /* SKL */ crystal_hz = 24000000; /* 24.0 MHz */ break; @@ -5416,13 +5361,13 @@ void process_cpuid() break; default: crystal_hz = 0; - } + } if (crystal_hz) { - tsc_hz = (unsigned long long) crystal_hz * ebx_tsc / eax_crystal; + tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal; if (!quiet) fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n", - tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal); + tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal); } } } @@ -5517,10 +5462,9 @@ void process_cpuid() BIC_PRESENT(BIC_CPUGFX); } do_slm_cstates = is_slm(family, model); - do_knl_cstates = is_knl(family, model); + do_knl_cstates = is_knl(family, model); - if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || - is_ehl(family, model)) + if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || is_ehl(family, model)) BIC_NOT_PRESENT(BIC_CPU_c3); if (!quiet) @@ -5614,7 +5558,7 @@ void topology_probe() if (debug > 1) fprintf(outf, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num); - cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology)); + cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology)); if (cpus == NULL) err(1, "calloc cpus"); @@ -5693,22 +5637,19 @@ void topology_probe() topo.cores_per_node = max_core_id + 1; if (debug > 1) - fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", - max_core_id, topo.cores_per_node); + fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", max_core_id, topo.cores_per_node); if (!summary_only && topo.cores_per_node > 1) BIC_PRESENT(BIC_Core); topo.num_die = max_die_id + 1; if (debug > 1) - fprintf(outf, "max_die_id %d, sizing for %d die\n", - max_die_id, topo.num_die); + fprintf(outf, "max_die_id %d, sizing for %d die\n", max_die_id, topo.num_die); if (!summary_only && topo.num_die > 1) BIC_PRESENT(BIC_Die); topo.num_packages = max_package_id + 1; if (debug > 1) - fprintf(outf, "max_package_id %d, sizing for %d packages\n", - max_package_id, topo.num_packages); + fprintf(outf, "max_package_id %d, sizing for %d packages\n", max_package_id, topo.num_packages); if (!summary_only && topo.num_packages > 1) BIC_PRESENT(BIC_Package); @@ -5731,21 +5672,15 @@ void topology_probe() fprintf(outf, "cpu %d pkg %d die %d node %d lnode %d core %d thread %d\n", i, cpus[i].physical_package_id, cpus[i].die_id, - cpus[i].physical_node_id, - cpus[i].logical_node_id, - cpus[i].physical_core_id, - cpus[i].thread_id); + cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id); } } -void -allocate_counters(struct thread_data **t, struct core_data **c, - struct pkg_data **p) +void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_data **p) { int i; - int num_cores = topo.cores_per_node * topo.nodes_per_pkg * - topo.num_packages; + int num_cores = topo.cores_per_node * topo.nodes_per_pkg * topo.num_packages; int num_threads = topo.threads_per_core * num_cores; *t = calloc(num_threads, sizeof(struct thread_data)); @@ -5773,13 +5708,13 @@ allocate_counters(struct thread_data **t, struct core_data **c, error: err(1, "calloc counters"); } + /* * init_counter() * * set FIRST_THREAD_IN_CORE and FIRST_CORE_IN_PACKAGE */ -void init_counter(struct thread_data *thread_base, struct core_data *core_base, - struct pkg_data *pkg_base, int cpu_id) +void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base, int cpu_id) { int pkg_id = cpus[cpu_id].physical_package_id; int node_id = cpus[cpu_id].logical_node_id; @@ -5789,7 +5724,6 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct core_data *c; struct pkg_data *p; - /* Workaround for systems where physical_node_id==-1 * and logical_node_id==(-1 - topo.num_cpus) */ @@ -5811,7 +5745,6 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, p->package_id = pkg_id; } - int initialize_counters(int cpu_id) { init_counter(EVEN_COUNTERS, cpu_id); @@ -5826,12 +5759,14 @@ void allocate_output_buffer() if (outp == NULL) err(-1, "calloc output buffer"); } + void allocate_fd_percpu(void) { fd_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); if (fd_percpu == NULL) err(-1, "calloc fd_percpu"); } + void allocate_irq_buffers(void) { irq_column_2_cpu = calloc(topo.num_cpus, sizeof(int)); @@ -5842,6 +5777,7 @@ void allocate_irq_buffers(void) if (irqs_per_cpu == NULL) err(-1, "calloc %d", topo.max_cpu_num + 1); } + void setup_all_buffers(void) { topology_probe(); @@ -5872,7 +5808,6 @@ void turbostat_init() process_cpuid(); linux_perf_init(); - if (!quiet) for_all_cpus(print_hwp, ODD_COUNTERS); @@ -5945,7 +5880,7 @@ int fork_it(char **argv) format_all_counters(EVEN_COUNTERS); } - fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec/1000000.0); + fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec / 1000000.0); flush_output_stderr(); @@ -5970,14 +5905,14 @@ int get_and_dump_counters(void) return status; } -void print_version() { - fprintf(outf, "turbostat version 21.03.12" - " - Len Brown \n"); +void print_version() +{ + fprintf(outf, "turbostat version 21.03.12" " - Len Brown \n"); } int add_counter(unsigned int msr_num, char *path, char *name, - unsigned int width, enum counter_scope scope, - enum counter_type type, enum counter_format format, int flags) + unsigned int width, enum counter_scope scope, + enum counter_type type, enum counter_format format, int flags) { struct msr_counter *msrp; @@ -6003,8 +5938,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, sys.tp = msrp; sys.added_thread_counters++; if (sys.added_thread_counters > MAX_ADDED_THREAD_COUNTERS) { - fprintf(stderr, "exceeded max %d added thread counters\n", - MAX_ADDED_COUNTERS); + fprintf(stderr, "exceeded max %d added thread counters\n", MAX_ADDED_COUNTERS); exit(-1); } break; @@ -6014,8 +5948,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, sys.cp = msrp; sys.added_core_counters++; if (sys.added_core_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added core counters\n", - MAX_ADDED_COUNTERS); + fprintf(stderr, "exceeded max %d added core counters\n", MAX_ADDED_COUNTERS); exit(-1); } break; @@ -6025,8 +5958,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, sys.pp = msrp; sys.added_package_counters++; if (sys.added_package_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added package counters\n", - MAX_ADDED_COUNTERS); + fprintf(stderr, "exceeded max %d added package counters\n", MAX_ADDED_COUNTERS); exit(-1); } break; @@ -6163,15 +6095,14 @@ void probe_sysfs(void) for (state = 10; state >= 0; --state) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", - base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; if (!fgets(name_buf, sizeof(name_buf), input)) err(1, "%s: failed to read file", path); - /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ + /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ sp = strchr(name_buf, '-'); if (!sp) sp = strchrnul(name_buf, '\n'); @@ -6187,20 +6118,18 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, - FORMAT_PERCENT, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU); } for (state = 10; state >= 0; --state) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", - base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; if (!fgets(name_buf, sizeof(name_buf), input)) err(1, "%s: failed to read file", path); - /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ + /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ sp = strchr(name_buf, '-'); if (!sp) sp = strchrnul(name_buf, '\n'); @@ -6214,13 +6143,11 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, - FORMAT_DELTA, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU); } } - /* * parse cpuset with following syntax * 1,2,4..6,8-10 and set bits in cpu_subset @@ -6307,37 +6234,35 @@ error: exit(-1); } - void cmdline(int argc, char **argv) { int opt; int option_index = 0; static struct option long_options[] = { - {"add", required_argument, 0, 'a'}, - {"cpu", required_argument, 0, 'c'}, - {"Dump", no_argument, 0, 'D'}, - {"debug", no_argument, 0, 'd'}, /* internal, not documented */ - {"enable", required_argument, 0, 'e'}, - {"interval", required_argument, 0, 'i'}, - {"IPC", no_argument, 0, 'I'}, - {"num_iterations", required_argument, 0, 'n'}, - {"help", no_argument, 0, 'h'}, - {"hide", required_argument, 0, 'H'}, // meh, -h taken by --help - {"Joules", no_argument, 0, 'J'}, - {"list", no_argument, 0, 'l'}, - {"out", required_argument, 0, 'o'}, - {"quiet", no_argument, 0, 'q'}, - {"show", required_argument, 0, 's'}, - {"Summary", no_argument, 0, 'S'}, - {"TCC", required_argument, 0, 'T'}, - {"version", no_argument, 0, 'v' }, - {0, 0, 0, 0 } + { "add", required_argument, 0, 'a' }, + { "cpu", required_argument, 0, 'c' }, + { "Dump", no_argument, 0, 'D' }, + { "debug", no_argument, 0, 'd' }, /* internal, not documented */ + { "enable", required_argument, 0, 'e' }, + { "interval", required_argument, 0, 'i' }, + { "IPC", no_argument, 0, 'I' }, + { "num_iterations", required_argument, 0, 'n' }, + { "help", no_argument, 0, 'h' }, + { "hide", required_argument, 0, 'H' }, // meh, -h taken by --help + { "Joules", no_argument, 0, 'J' }, + { "list", no_argument, 0, 'l' }, + { "out", required_argument, 0, 'o' }, + { "quiet", no_argument, 0, 'q' }, + { "show", required_argument, 0, 's' }, + { "Summary", no_argument, 0, 'S' }, + { "TCC", required_argument, 0, 'T' }, + { "version", no_argument, 0, 'v' }, + { 0, 0, 0, 0 } }; progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", - long_options, &option_index)) != -1) { + while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) { switch (opt) { case 'a': parse_add_command(optarg); @@ -6372,8 +6297,7 @@ void cmdline(int argc, char **argv) double interval = strtod(optarg, NULL); if (interval < 0.001) { - fprintf(outf, "interval %f seconds is too small\n", - interval); + fprintf(outf, "interval %f seconds is too small\n", interval); exit(2); } @@ -6400,8 +6324,7 @@ void cmdline(int argc, char **argv) num_iterations = strtod(optarg, NULL); if (num_iterations <= 0) { - fprintf(outf, "iterations %d should be positive number\n", - num_iterations); + fprintf(outf, "iterations %d should be positive number\n", num_iterations); exit(2); } break; -- cgit v1.2.3 From 38c6663a68903cf1187003129cd1873551979865 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 4 May 2021 19:27:34 -0400 Subject: tools/power turbostat: elevate priority of interval mode This makes interval mode less likely to see delayed results on a heavily loaded system. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 66 ++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 13805e460a4d..a0dc39d682cd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2266,31 +2266,48 @@ char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2", int nhm_pkg_cstate_limits[16] = { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int snb_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int hsw_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int slv_pkg_cstate_limits[16] = { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCL__6, PCL__7 }; + PCL__6, PCL__7 +}; + int amt_pkg_cstate_limits[16] = { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int phi_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int glm_pkg_cstate_limits[16] = { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int skx_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int icx_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; static void calculate_tsc_tweak() { @@ -3401,6 +3418,32 @@ release_msr: free(per_cpu_msr_sum); } +/* + * set_my_sched_priority(pri) + * return previous + */ +int set_my_sched_priority(int priority) +{ + int retval; + int original_priority; + + errno = 0; + original_priority = getpriority(PRIO_PROCESS, 0); + if (errno && (original_priority == -1)) + err(errno, "getpriority"); + + retval = setpriority(PRIO_PROCESS, 0, priority); + if (retval) + err(retval, "setpriority(%d)", priority); + + errno = 0; + retval = getpriority(PRIO_PROCESS, 0); + if (retval != priority) + err(-1, "getpriority(%d) != setpriority(%d)", retval, priority); + + return original_priority; +} + void turbostat_loop() { int retval; @@ -3409,6 +3452,11 @@ void turbostat_loop() setup_signal_handler(); + /* + * elevate own priority for interval mode + */ + set_my_sched_priority(-20); + restart: restarted++; -- cgit v1.2.3 From b60c573dc241ab3a8719e990d86a0011b79eebcb Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 4 May 2021 19:56:17 -0400 Subject: tools/power turbostat: Support "turbostat --hide idle" As idle, in particular, can have many columns on some machines... Make it easy to ignore them all at once. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 6 ++++-- tools/power/x86/turbostat/turbostat.c | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index f6b7e85b121c..9b17097bc3d7 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -54,12 +54,14 @@ name as necessary to disambiguate it from others is necessary. Note that option .PP \fB--cpu cpu-set\fP limit output to system summary plus the specified cpu-set. If cpu-set is the string "core", then the system summary plus the first CPU in each core are printed -- eg. subsequent HT siblings are not printed. Or if cpu-set is the string "package", then the system summary plus the first CPU in each package is printed. Otherwise, the system summary plus the specified set of CPUs are printed. The cpu-set is ordered from low to high, comma delimited with ".." and "-" permitted to denote a range. eg. 1,2,8,14..17,21-44 .PP -\fB--hide column\fP do not show the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. Use "--hide sysfs" to hide the sysfs statistics columns as a group. +\fB--hide column\fP do not show the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. .PP \fB--enable column\fP show the specified built-in columns, which are otherwise disabled, by default. Currently the only built-in counters disabled by default are "usec", "Time_Of_Day_Seconds", "APIC" and "X2APIC". The column name "all" can be used to enable all disabled-by-default built-in counters. .PP -\fB--show column\fP show only the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. Use "--show sysfs" to show the sysfs statistics columns as a group. +\fB--show column\fP show only the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. +.PP +\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "sysfs", "other". .PP \fB--Dump\fP displays the raw counter values. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a0dc39d682cd..4bb08de4cb71 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -667,6 +667,12 @@ struct msr_counter bic[] = { #define BIC_GFXACTMHz (1ULL << 51) #define BIC_IPC (1ULL << 52) +#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) +#define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) +#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz ) +#define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX) +#define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) + #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); @@ -748,6 +754,18 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) if (!strcmp(name_list, "all")) return ~0; + if (!strcmp(name_list, "topology")) + return BIC_TOPOLOGY; + if (!strcmp(name_list, "power")) + return BIC_THERMAL_PWR; + if (!strcmp(name_list, "idle")) + return BIC_IDLE; + if (!strcmp(name_list, "frequency")) + return BIC_FREQUENCY; + if (!strcmp(name_list, "other")) + return BIC_OTHER; + if (!strcmp(name_list, "all")) + return 0; for (i = 0; i < MAX_BIC; ++i) { if (!strcmp(name_list, bic[i].name)) { -- cgit v1.2.3 From 3c070b2abf85b92455c2721d0a9edc68893ab6c1 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 4 May 2021 19:58:08 -0400 Subject: tools/power turbostat: version 2021.05.04 Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4bb08de4cb71..47d3ba895d6d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3,7 +3,7 @@ * turbostat -- show CPU frequency and C-state residency * on modern Intel and AMD processors. * - * Copyright (c) 2013 Intel Corporation. + * Copyright (c) 2021 Intel Corporation. * Len Brown */ @@ -5973,7 +5973,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 21.03.12" " - Len Brown \n"); + fprintf(outf, "turbostat version 21.05.04" " - Len Brown \n"); } int add_counter(unsigned int msr_num, char *path, char *name, -- cgit v1.2.3 From fa6c02315f745f00b62c634b220c3fb5c3310258 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 4 May 2021 18:34:23 -0700 Subject: mm: huge_memory: a new debugfs interface for splitting THP tests We did not have a direct user interface of splitting the compound page backing a THP and there is no need unless we want to expose the THP implementation details to users. Make /split_huge_pages accept a new command to do that. By writing ",," to /split_huge_pages, THPs within the given virtual address range from the process with the given pid are split. It is used to test split_huge_page function. In addition, a selftest program is added to tools/testing/selftests/vm to utilize the interface by splitting PMD THPs and PTE-mapped THPs. This does not change the old behavior, i.e., writing 1 to the interface to split all THPs in the system. Link: https://lkml.kernel.org/r/20210331235309.332292-1-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: David Rientjes Cc: John Hubbard Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mika Penttila Cc: Sandipan Das Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 155 ++++++++++- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/split_huge_page_test.c | 318 ++++++++++++++++++++++ 4 files changed, 467 insertions(+), 8 deletions(-) create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c (limited to 'tools') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9b31ef84bf7e..58be9fc8253d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -2915,16 +2916,14 @@ static struct shrinker deferred_split_shrinker = { }; #ifdef CONFIG_DEBUG_FS -static int split_huge_pages_set(void *data, u64 val) +static void split_huge_pages_all(void) { struct zone *zone; struct page *page; unsigned long pfn, max_zone_pfn; unsigned long total = 0, split = 0; - if (val != 1) - return -EINVAL; - + pr_debug("Split all THPs\n"); for_each_populated_zone(zone) { max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { @@ -2948,15 +2947,155 @@ static int split_huge_pages_set(void *data, u64 val) unlock_page(page); next: put_page(page); + cond_resched(); } } - pr_info("%lu of %lu THP split\n", split, total); + pr_debug("%lu of %lu THP split\n", split, total); +} - return 0; +static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) +{ + return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || + is_vm_hugetlb_page(vma); +} + +static int split_huge_pages_pid(int pid, unsigned long vaddr_start, + unsigned long vaddr_end) +{ + int ret = 0; + struct task_struct *task; + struct mm_struct *mm; + unsigned long total = 0, split = 0; + unsigned long addr; + + vaddr_start &= PAGE_MASK; + vaddr_end &= PAGE_MASK; + + /* Find the task_struct from pid */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_task_struct(task); + rcu_read_unlock(); + + /* Find the mm_struct */ + mm = get_task_mm(task); + put_task_struct(task); + + if (!mm) { + ret = -EINVAL; + goto out; + } + + pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", + pid, vaddr_start, vaddr_end); + + mmap_read_lock(mm); + /* + * always increase addr by PAGE_SIZE, since we could have a PTE page + * table filled with PTE-mapped THPs, each of which is distinct. + */ + for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { + struct vm_area_struct *vma = find_vma(mm, addr); + unsigned int follflags; + struct page *page; + + if (!vma || addr < vma->vm_start) + break; + + /* skip special VMA and hugetlb VMA */ + if (vma_not_suitable_for_thp_split(vma)) { + addr = vma->vm_end; + continue; + } + + /* FOLL_DUMP to ignore special (like zero) pages */ + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, addr, follflags); + + if (IS_ERR(page)) + continue; + if (!page) + continue; + + if (!is_transparent_hugepage(page)) + goto next; + + total++; + if (!can_split_huge_page(compound_head(page), NULL)) + goto next; + + if (!trylock_page(page)) + goto next; + + if (!split_huge_page(page)) + split++; + + unlock_page(page); +next: + put_page(page); + cond_resched(); + } + mmap_read_unlock(mm); + mmput(mm); + + pr_debug("%lu of %lu THP split\n", split, total); + +out: + return ret; } -DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, - "%llu\n"); + +#define MAX_INPUT_BUF_SZ 255 + +static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppops) +{ + static DEFINE_MUTEX(split_debug_mutex); + ssize_t ret; + char input_buf[MAX_INPUT_BUF_SZ]; /* hold pid, start_vaddr, end_vaddr */ + int pid; + unsigned long vaddr_start, vaddr_end; + + ret = mutex_lock_interruptible(&split_debug_mutex); + if (ret) + return ret; + + ret = -EFAULT; + + memset(input_buf, 0, MAX_INPUT_BUF_SZ); + if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) + goto out; + + input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; + ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); + if (ret == 1 && pid == 1) { + split_huge_pages_all(); + ret = strlen(input_buf); + goto out; + } else if (ret != 3) { + ret = -EINVAL; + goto out; + } + + ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end); + if (!ret) + ret = strlen(input_buf); +out: + mutex_unlock(&split_debug_mutex); + return ret; + +} + +static const struct file_operations split_huge_pages_fops = { + .owner = THIS_MODULE, + .write = split_huge_pages_write, + .llseek = no_llseek, +}; static int __init split_huge_pages_debugfs(void) { diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 9a35c3f6a557..1f651e85ed60 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -22,3 +22,4 @@ map_fixed_noreplace write_to_hugetlbfs hmm-tests local_config.* +split_huge_page_test diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 8b0cd421ebd3..73e1cc96d7c2 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd +TEST_GEN_FILES += split_huge_page_test ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c new file mode 100644 index 000000000000..2c0c18e60c57 --- /dev/null +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual + * address range in a process via /split_huge_pages interface. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +uint64_t pagesize; +unsigned int pageshift; +uint64_t pmd_pagesize; + +#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" +#define SMAP_PATH "/proc/self/smaps" +#define INPUT_MAX 80 + +#define PFN_MASK ((1UL<<55)-1) +#define KPF_THP (1UL<<22) + +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) +{ + uint64_t paddr; + uint64_t page_flags; + + if (pagemap_file) { + pread(pagemap_file, &paddr, sizeof(paddr), + ((long)vaddr >> pageshift) * sizeof(paddr)); + + if (kpageflags_file) { + pread(kpageflags_file, &page_flags, sizeof(page_flags), + (paddr & PFN_MASK) * sizeof(page_flags)); + + return !!(page_flags & KPF_THP); + } + } + return 0; +} + + +static uint64_t read_pmd_pagesize(void) +{ + int fd; + char buf[20]; + ssize_t num_read; + + fd = open(PMD_SIZE_PATH, O_RDONLY); + if (fd == -1) { + perror("Open hpage_pmd_size failed"); + exit(EXIT_FAILURE); + } + num_read = read(fd, buf, 19); + if (num_read < 1) { + close(fd); + perror("Read hpage_pmd_size failed"); + exit(EXIT_FAILURE); + } + buf[num_read] = '\0'; + close(fd); + + return strtoul(buf, NULL, 10); +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) + return 0; + + return (unsigned int) numwritten; +} + +static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end) +{ + char input[INPUT_MAX]; + int ret; + + ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start, + vaddr_end); + if (ret >= INPUT_MAX) { + printf("%s: Debugfs input is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { + perror(SPLIT_DEBUGFS); + exit(EXIT_FAILURE); + } +} + +#define MAX_LINE_LENGTH 500 + +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf) +{ + while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + if (!strncmp(buf, pattern, strlen(pattern))) + return true; + } + return false; +} + +static uint64_t check_huge(void *addr) +{ + uint64_t thp = 0; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(SMAP_PATH, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, SMAP_PATH); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + /* + * Fetch the AnonHugePages: in the same block and check the number of + * hugepages. + */ + if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + goto err_out; + + if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) { + printf("Reading smap error\n"); + exit(EXIT_FAILURE); + } + +err_out: + fclose(fp); + return thp; +} + +void split_pmd_thp(void) +{ + char *one_page; + size_t len = 4 * pmd_pagesize; + uint64_t thp_size; + size_t i; + + one_page = memalign(pmd_pagesize, len); + + if (!one_page) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + thp_size = check_huge(one_page); + if (!thp_size) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* split all THPs */ + write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + + thp_size = check_huge(one_page); + if (thp_size) { + printf("Still %ld kB AnonHugePages not split\n", thp_size); + exit(EXIT_FAILURE); + } + + printf("Split huge pages successful\n"); + free(one_page); +} + +void split_pte_mapped_thp(void) +{ + char *one_page, *pte_mapped, *pte_mapped2; + size_t len = 4 * pmd_pagesize; + uint64_t thp_size; + size_t i; + const char *pagemap_template = "/proc/%d/pagemap"; + const char *kpageflags_proc = "/proc/kpageflags"; + char pagemap_proc[255]; + int pagemap_fd; + int kpageflags_fd; + + if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { + perror("get pagemap proc error"); + exit(EXIT_FAILURE); + } + pagemap_fd = open(pagemap_proc, O_RDONLY); + + if (pagemap_fd == -1) { + perror("read pagemap:"); + exit(EXIT_FAILURE); + } + + kpageflags_fd = open(kpageflags_proc, O_RDONLY); + + if (kpageflags_fd == -1) { + perror("read kpageflags:"); + exit(EXIT_FAILURE); + } + + one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + thp_size = check_huge(one_page); + if (!thp_size) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* remap the first pagesize of first THP */ + pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); + + /* remap the Nth pagesize of Nth THP */ + for (i = 1; i < 4; i++) { + pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i, + pagesize, pagesize, + MREMAP_MAYMOVE|MREMAP_FIXED, + pte_mapped + pagesize * i); + if (pte_mapped2 == (char *)-1) { + perror("mremap failed"); + exit(EXIT_FAILURE); + } + } + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + + if (thp_size != 4) { + printf("Some THPs are missing during mremap\n"); + exit(EXIT_FAILURE); + } + + /* split all remapped THPs */ + write_debugfs(getpid(), (uint64_t)pte_mapped, + (uint64_t)pte_mapped + pagesize * 4); + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) { + if (pte_mapped[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + } + + if (thp_size) { + printf("Still %ld THPs not split\n", thp_size); + exit(EXIT_FAILURE); + } + + printf("Split PTE-mapped huge pages successful\n"); + munmap(one_page, len); + close(pagemap_fd); + close(kpageflags_fd); +} + +int main(int argc, char **argv) +{ + if (geteuid() != 0) { + printf("Please run the benchmark as root\n"); + exit(EXIT_FAILURE); + } + + pagesize = getpagesize(); + pageshift = ffs(pagesize) - 1; + pmd_pagesize = read_pmd_pagesize(); + + split_pmd_thp(); + split_pte_mapped_thp(); + + return 0; +} -- cgit v1.2.3 From fbe37501b2526a71d82b898671260524279c6765 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 4 May 2021 18:34:26 -0700 Subject: mm: huge_memory: debugfs for file-backed THP split Further extend /split_huge_pages to accept ",," for file-backed THP split tests since tmpfs may have file backed by THP that mapped nowhere. Update selftest program to test file-backed THP split too. Link: https://lkml.kernel.org/r/20210331235309.332292-2-zi.yan@sent.com Signed-off-by: Zi Yan Suggested-by: Kirill A. Shutemov Reviewed-by: Yang Shi Cc: "Kirill A . Shutemov" Cc: Shuah Khan Cc: John Hubbard Cc: Sandipan Das Cc: David Hildenbrand Cc: Mika Penttila Cc: David Rientjes Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 90 ++++++++++++++++++++++- tools/testing/selftests/vm/split_huge_page_test.c | 82 +++++++++++++++++++-- 2 files changed, 166 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 58be9fc8253d..b3788683f7d3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3050,6 +3050,65 @@ out: return ret; } +static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, + pgoff_t off_end) +{ + struct filename *file; + struct file *candidate; + struct address_space *mapping; + int ret = -EINVAL; + pgoff_t index; + int nr_pages = 1; + unsigned long total = 0, split = 0; + + file = getname_kernel(file_path); + if (IS_ERR(file)) + return ret; + + candidate = file_open_name(file, O_RDONLY, 0); + if (IS_ERR(candidate)) + goto out; + + pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", + file_path, off_start, off_end); + + mapping = candidate->f_mapping; + + for (index = off_start; index < off_end; index += nr_pages) { + struct page *fpage = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD, 0); + + nr_pages = 1; + if (xa_is_value(fpage) || !fpage) + continue; + + if (!is_transparent_hugepage(fpage)) + goto next; + + total++; + nr_pages = thp_nr_pages(fpage); + + if (!trylock_page(fpage)) + goto next; + + if (!split_huge_page(fpage)) + split++; + + unlock_page(fpage); +next: + put_page(fpage); + cond_resched(); + } + + filp_close(candidate, NULL); + ret = 0; + + pr_debug("%lu of %lu file-backed THP split\n", split, total); +out: + putname(file); + return ret; +} + #define MAX_INPUT_BUF_SZ 255 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, @@ -3057,7 +3116,8 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, { static DEFINE_MUTEX(split_debug_mutex); ssize_t ret; - char input_buf[MAX_INPUT_BUF_SZ]; /* hold pid, start_vaddr, end_vaddr */ + /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */ + char input_buf[MAX_INPUT_BUF_SZ]; int pid; unsigned long vaddr_start, vaddr_end; @@ -3072,6 +3132,34 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, goto out; input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; + + if (input_buf[0] == '/') { + char *tok; + char *buf = input_buf; + char file_path[MAX_INPUT_BUF_SZ]; + pgoff_t off_start = 0, off_end = 0; + size_t input_len = strlen(input_buf); + + tok = strsep(&buf, ","); + if (tok) { + strncpy(file_path, tok, MAX_INPUT_BUF_SZ); + } else { + ret = -EINVAL; + goto out; + } + + ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end); + if (ret != 2) { + ret = -EINVAL; + goto out; + } + ret = split_huge_pages_in_file(file_path, off_start, off_end); + if (!ret) + ret = input_len; + + goto out; + } + ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); if (ret == 1 && pid == 1) { split_huge_pages_all(); diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 2c0c18e60c57..1af16d2c2a0a 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -7,11 +7,13 @@ #define _GNU_SOURCE #include #include +#include #include #include #include #include #include +#include #include #include @@ -24,6 +26,9 @@ uint64_t pmd_pagesize; #define SMAP_PATH "/proc/self/smaps" #define INPUT_MAX 80 +#define PID_FMT "%d,0x%lx,0x%lx" +#define PATH_FMT "%s,0x%lx,0x%lx" + #define PFN_MASK ((1UL<<55)-1) #define KPF_THP (1UL<<22) @@ -87,13 +92,16 @@ static int write_file(const char *path, const char *buf, size_t buflen) return (unsigned int) numwritten; } -static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end) +static void write_debugfs(const char *fmt, ...) { char input[INPUT_MAX]; int ret; + va_list argp; + + va_start(argp, fmt); + ret = vsnprintf(input, INPUT_MAX, fmt, argp); + va_end(argp); - ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start, - vaddr_end); if (ret >= INPUT_MAX) { printf("%s: Debugfs input is too long\n", __func__); exit(EXIT_FAILURE); @@ -183,7 +191,8 @@ void split_pmd_thp(void) } /* split all THPs */ - write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len); + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len); for (i = 0; i < len; i++) if (one_page[i] != (char)i) { @@ -274,7 +283,7 @@ void split_pte_mapped_thp(void) } /* split all remapped THPs */ - write_debugfs(getpid(), (uint64_t)pte_mapped, + write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, (uint64_t)pte_mapped + pagesize * 4); /* smap does not show THPs after mremap, use kpageflags instead */ @@ -300,6 +309,68 @@ void split_pte_mapped_thp(void) close(kpageflags_fd); } +void split_file_backed_thp(void) +{ + int status; + int fd; + ssize_t num_written; + char tmpfs_template[] = "/tmp/thp_split_XXXXXX"; + const char *tmpfs_loc = mkdtemp(tmpfs_template); + char testfile[INPUT_MAX]; + uint64_t pgoff_start = 0, pgoff_end = 1024; + + printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); + + status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); + + if (status) { + printf("Unable to create a tmpfs for testing\n"); + exit(EXIT_FAILURE); + } + + status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); + if (status >= INPUT_MAX) { + printf("Fail to create file-backed THP split testing file\n"); + goto cleanup; + } + + fd = open(testfile, O_CREAT|O_WRONLY); + if (fd == -1) { + perror("Cannot open testing file\n"); + goto cleanup; + } + + /* write something to the file, so a file-backed THP can be allocated */ + num_written = write(fd, tmpfs_loc, sizeof(tmpfs_loc)); + close(fd); + + if (num_written < 1) { + printf("Fail to write data to testing file\n"); + goto cleanup; + } + + /* split the file-backed THP */ + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); + + status = unlink(testfile); + if (status) + perror("Cannot remove testing file\n"); + +cleanup: + status = umount(tmpfs_loc); + if (status) { + printf("Unable to umount %s\n", tmpfs_loc); + exit(EXIT_FAILURE); + } + status = rmdir(tmpfs_loc); + if (status) { + perror("cannot remove tmp dir"); + exit(EXIT_FAILURE); + } + + printf("file-backed THP split test done, please check dmesg for more information\n"); +} + int main(int argc, char **argv) { if (geteuid() != 0) { @@ -313,6 +384,7 @@ int main(int argc, char **argv) split_pmd_thp(); split_pte_mapped_thp(); + split_file_backed_thp(); return 0; } -- cgit v1.2.3 From f0fa94330919be8ec5620382b50f1c72844c9224 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:57 -0700 Subject: userfaultfd/selftests: add test exercising minor fault handling Fix a dormant bug in userfaultfd_events_test(), where we did `return faulting_process(0)` instead of `exit(faulting_process(0))`. This caused the forked process to keep running, trying to execute any further test cases after the events test in parallel with the "real" process. Add a simple test case which exercises minor faults. In short, it does the following: 1. "Sets up" an area (area_dst) and a second shared mapping to the same underlying pages (area_dst_alias). 2. Register one of these areas with userfaultfd, in minor fault mode. 3. Start a second thread to handle any minor faults. 4. Populate the underlying pages with the non-UFFD-registered side of the mapping. Basically, memset() each page with some arbitrary contents. 5. Then, using the UFFD-registered mapping, read all of the page contents, asserting that the contents match expectations (we expect the minor fault handling thread can modify the page contents before resolving the fault). The minor fault handling thread, upon receiving an event, flips all the bits (~) in that page, just to prove that it can modify it in some arbitrary way. Then it issues a UFFDIO_CONTINUE ioctl, to setup the mapping and resolve the fault. The reading thread should wake up and see this modification. Currently the minor fault test is only enabled in hugetlb_shared mode, as this is the only configuration the kernel feature supports. Link: https://lkml.kernel.org/r/20210301222728.176417-7-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 164 +++++++++++++++++++++++++++++-- 1 file changed, 158 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 92b8ec423201..f5ab5e0312e7 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -81,6 +81,8 @@ static volatile bool test_uffdio_copy_eexist = true; static volatile bool test_uffdio_zeropage_eexist = true; /* Whether to test uffd write-protection */ static bool test_uffdio_wp = false; +/* Whether to test uffd minor faults */ +static bool test_uffdio_minor = false; static bool map_shared; static int huge_fd; @@ -96,6 +98,7 @@ struct uffd_stats { int cpu; unsigned long missing_faults; unsigned long wp_faults; + unsigned long minor_faults; }; /* pthread_mutex_t starts at page offset 0 */ @@ -153,17 +156,19 @@ static void uffd_stats_reset(struct uffd_stats *uffd_stats, uffd_stats[i].cpu = i; uffd_stats[i].missing_faults = 0; uffd_stats[i].wp_faults = 0; + uffd_stats[i].minor_faults = 0; } } static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) { int i; - unsigned long long miss_total = 0, wp_total = 0; + unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; for (i = 0; i < n_cpus; i++) { miss_total += stats[i].missing_faults; wp_total += stats[i].wp_faults; + minor_total += stats[i].minor_faults; } printf("userfaults: %llu missing (", miss_total); @@ -172,6 +177,9 @@ static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) printf("\b), %llu wp (", wp_total); for (i = 0; i < n_cpus; i++) printf("%lu+", stats[i].wp_faults); + printf("\b), %llu minor (", minor_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].minor_faults); printf("\b)\n"); } @@ -328,7 +336,7 @@ static struct uffd_test_ops shmem_uffd_test_ops = { }; static struct uffd_test_ops hugetlb_uffd_test_ops = { - .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, + .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE), .allocate_area = hugetlb_allocate_area, .release_pages = hugetlb_release_pages, .alias_mapping = hugetlb_alias_mapping, @@ -362,6 +370,22 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp) } } +static void continue_range(int ufd, __u64 start, __u64 len) +{ + struct uffdio_continue req; + + req.range.start = start; + req.range.len = len; + req.mode = 0; + + if (ioctl(ufd, UFFDIO_CONTINUE, &req)) { + fprintf(stderr, + "UFFDIO_CONTINUE failed for address 0x%" PRIx64 "\n", + (uint64_t)start); + exit(1); + } +} + static void *locking_thread(void *arg) { unsigned long cpu = (unsigned long) arg; @@ -569,8 +593,32 @@ static void uffd_handle_page_fault(struct uffd_msg *msg, } if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + /* Write protect page faults */ wp_range(uffd, msg->arg.pagefault.address, page_size, false); stats->wp_faults++; + } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { + uint8_t *area; + int b; + + /* + * Minor page faults + * + * To prove we can modify the original range for testing + * purposes, we're going to bit flip this range before + * continuing. + * + * Note that this requires all minor page fault tests operate on + * area_dst (non-UFFD-registered) and area_dst_alias + * (UFFD-registered). + */ + + area = (uint8_t *)(area_dst + + ((char *)msg->arg.pagefault.address - + area_dst_alias)); + for (b = 0; b < page_size; ++b) + area[b] = ~area[b]; + continue_range(uffd, msg->arg.pagefault.address, page_size); + stats->minor_faults++; } else { /* Missing page faults */ if (bounces & BOUNCE_VERIFY && @@ -779,7 +827,7 @@ static int stress(struct uffd_stats *uffd_stats) return 0; } -static int userfaultfd_open(int features) +static int userfaultfd_open_ext(uint64_t *features) { struct uffdio_api uffdio_api; @@ -792,7 +840,7 @@ static int userfaultfd_open(int features) uffd_flags = fcntl(uffd, F_GETFD, NULL); uffdio_api.api = UFFD_API; - uffdio_api.features = features; + uffdio_api.features = *features; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to " "run with either root or ptrace capability.\n"); @@ -804,9 +852,15 @@ static int userfaultfd_open(int features) return 1; } + *features = uffdio_api.features; return 0; } +static int userfaultfd_open(uint64_t features) +{ + return userfaultfd_open_ext(&features); +} + sigjmp_buf jbuf, *sigbuf; static void sighndl(int sig, siginfo_t *siginfo, void *ptr) @@ -1112,7 +1166,7 @@ static int userfaultfd_events_test(void) } if (!pid) - return faulting_process(0); + exit(faulting_process(0)); waitpid(pid, &err, 0); if (err) { @@ -1215,6 +1269,102 @@ static int userfaultfd_sig_test(void) return userfaults != 0; } +static int userfaultfd_minor_test(void) +{ + struct uffdio_register uffdio_register; + unsigned long expected_ioctls; + unsigned long p; + pthread_t uffd_mon; + uint8_t expected_byte; + void *expected_page; + char c; + struct uffd_stats stats = { 0 }; + uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS; + + if (!test_uffdio_minor) + return 0; + + printf("testing minor faults: "); + fflush(stdout); + + if (uffd_test_ops->release_pages(area_dst)) + return 1; + + if (userfaultfd_open_ext(&features)) + return 1; + /* If kernel reports the feature isn't supported, skip the test. */ + if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) { + printf("skipping test due to lack of feature support\n"); + fflush(stdout); + return 0; + } + + uffdio_register.range.start = (unsigned long)area_dst_alias; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { + fprintf(stderr, "register failure\n"); + exit(1); + } + + expected_ioctls = uffd_test_ops->expected_ioctls; + expected_ioctls |= 1 << _UFFDIO_CONTINUE; + if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { + fprintf(stderr, "unexpected missing ioctl(s)\n"); + exit(1); + } + + /* + * After registering with UFFD, populate the non-UFFD-registered side of + * the shared mapping. This should *not* trigger any UFFD minor faults. + */ + for (p = 0; p < nr_pages; ++p) { + memset(area_dst + (p * page_size), p % ((uint8_t)-1), + page_size); + } + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) { + perror("uffd_poll_thread create"); + exit(1); + } + + /* + * Read each of the pages back using the UFFD-registered mapping. We + * expect that the first time we touch a page, it will result in a minor + * fault. uffd_poll_thread will resolve the fault by bit-flipping the + * page's contents, and then issuing a CONTINUE ioctl. + */ + + if (posix_memalign(&expected_page, page_size, page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + + for (p = 0; p < nr_pages; ++p) { + expected_byte = ~((uint8_t)(p % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, area_dst_alias + (p * page_size), + page_size)) { + fprintf(stderr, + "unexpected page contents after minor fault\n"); + exit(1); + } + } + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) { + perror("pipe write"); + exit(1); + } + if (pthread_join(uffd_mon, NULL)) + return 1; + + close(uffd); + + uffd_stats_report(&stats, 1); + + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; +} + static int userfaultfd_stress(void) { void *area; @@ -1413,7 +1563,7 @@ static int userfaultfd_stress(void) close(uffd); return userfaultfd_zeropage_test() || userfaultfd_sig_test() - || userfaultfd_events_test(); + || userfaultfd_events_test() || userfaultfd_minor_test(); } /* @@ -1454,6 +1604,8 @@ static void set_test_type(const char *type) map_shared = true; test_type = TEST_HUGETLB; uffd_test_ops = &hugetlb_uffd_test_ops; + /* Minor faults require shared hugetlb; only enable here. */ + test_uffdio_minor = true; } else if (!strcmp(type, "shmem")) { map_shared = true; test_type = TEST_SHMEM; -- cgit v1.2.3 From 79dbf135e2481eaa77b172d88c343bf85e021545 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:23 -0700 Subject: selftests/vm: gup_test: fix test flag In gup_test both gup_flags and test_flags use the same flags field. This is broken. Farther, in the actual gup_test.c all the passed gup_flags are erased and unconditionally replaced with FOLL_WRITE. Which means that test_flags are ignored, and code like this always performs pin dump test: 155 if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) 156 nr = pin_user_pages(addr, nr, gup->flags, 157 pages + i, NULL); 158 else 159 nr = get_user_pages(addr, nr, gup->flags, 160 pages + i, NULL); 161 break; Add a new test_flags field, to allow raw gup_flags to work. Add a new subcommand for DUMP_USER_PAGES_TEST to specify that pin test should be performed. Remove unconditional overwriting of gup_flags via FOLL_WRITE. But, preserve the previous behaviour where FOLL_WRITE was the default flag, and add a new option "-W" to unset FOLL_WRITE. Rename flags with gup_flags. With the fix, dump works like this: root@virtme:/# gup_test -c ---- page #0, starting from user virt addr: 0x7f8acb9e4000 page:00000000d3d2ee27 refcount:2 mapcount:1 mapping:0000000000000000 index:0x0 pfn:0x100bcf anon flags: 0x300000000080016(referenced|uptodate|lru|swapbacked) raw: 0300000000080016 ffffd0e204021608 ffffd0e208df2e88 ffff8ea04243ec61 raw: 0000000000000000 0000000000000000 0000000200000000 0000000000000000 page dumped because: gup_test: dump_pages() test DUMP_USER_PAGES_TEST: done root@virtme:/# gup_test -c -p ---- page #0, starting from user virt addr: 0x7fd19701b000 page:00000000baed3c7d refcount:1025 mapcount:1 mapping:0000000000000000 index:0x0 pfn:0x108008 anon flags: 0x300000000080014(uptodate|lru|swapbacked) raw: 0300000000080014 ffffd0e204200188 ffffd0e205e09088 ffff8ea04243ee71 raw: 0000000000000000 0000000000000000 0000040100000000 0000000000000000 page dumped because: gup_test: dump_pages() test DUMP_USER_PAGES_TEST: done Refcount shows the difference between pin vs no-pin case. Also change type of nr from int to long, as it counts number of pages. Link: https://lkml.kernel.org/r/20210215161349.246722-14-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup_test.c | 23 ++++++++++------------- mm/gup_test.h | 3 ++- tools/testing/selftests/vm/gup_test.c | 15 +++++++++++---- 3 files changed, 23 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/mm/gup_test.c b/mm/gup_test.c index e3cf78e5873e..a6ed1c877679 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -94,7 +94,7 @@ static int __gup_test_ioctl(unsigned int cmd, { ktime_t start_time, end_time; unsigned long i, nr_pages, addr, next; - int nr; + long nr; struct page **pages; int ret = 0; bool needs_mmap_lock = @@ -126,37 +126,34 @@ static int __gup_test_ioctl(unsigned int cmd, nr = (next - addr) / PAGE_SIZE; } - /* Filter out most gup flags: only allow a tiny subset here: */ - gup->flags &= FOLL_WRITE; - switch (cmd) { case GUP_FAST_BENCHMARK: - nr = get_user_pages_fast(addr, nr, gup->flags, + nr = get_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case GUP_BASIC_TEST: - nr = get_user_pages(addr, nr, gup->flags, pages + i, + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; case PIN_FAST_BENCHMARK: - nr = pin_user_pages_fast(addr, nr, gup->flags, + nr = pin_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case PIN_BASIC_TEST: - nr = pin_user_pages(addr, nr, gup->flags, pages + i, + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; case PIN_LONGTERM_BENCHMARK: nr = pin_user_pages(addr, nr, - gup->flags | FOLL_LONGTERM, + gup->gup_flags | FOLL_LONGTERM, pages + i, NULL); break; case DUMP_USER_PAGES_TEST: - if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) - nr = pin_user_pages(addr, nr, gup->flags, + if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); else - nr = get_user_pages(addr, nr, gup->flags, + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; default: @@ -187,7 +184,7 @@ static int __gup_test_ioctl(unsigned int cmd, start_time = ktime_get(); - put_back_pages(cmd, pages, nr_pages, gup->flags); + put_back_pages(cmd, pages, nr_pages, gup->test_flags); end_time = ktime_get(); gup->put_delta_usec = ktime_us_delta(end_time, start_time); diff --git a/mm/gup_test.h b/mm/gup_test.h index 90a6713d50eb..887ac1d5f5bc 100644 --- a/mm/gup_test.h +++ b/mm/gup_test.h @@ -21,7 +21,8 @@ struct gup_test { __u64 addr; __u64 size; __u32 nr_pages_per_call; - __u32 flags; + __u32 gup_flags; + __u32 test_flags; /* * Each non-zero entry is the number of the page (1-based: first page is * page 1, so that zero entries mean "do nothing") from the .addr base. diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 6c6336dd3b7f..943cc2608dc2 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -37,13 +37,13 @@ int main(int argc, char **argv) { struct gup_test gup = { 0 }; unsigned long size = 128 * MB; - int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; + int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 1; unsigned long cmd = GUP_FAST_BENCHMARK; int flags = MAP_PRIVATE; char *file = "/dev/zero"; char *p; - while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwSH")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHp")) != -1) { switch (opt) { case 'a': cmd = PIN_FAST_BENCHMARK; @@ -65,9 +65,13 @@ int main(int argc, char **argv) */ gup.which_pages[0] = 1; break; + case 'p': + /* works only with DUMP_USER_PAGES_TEST */ + gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN; + break; case 'F': /* strtol, so you can pass flags in hex form */ - gup.flags = strtol(optarg, 0, 0); + gup.gup_flags = strtol(optarg, 0, 0); break; case 'm': size = atoi(optarg) * MB; @@ -93,6 +97,9 @@ int main(int argc, char **argv) case 'w': write = 1; break; + case 'W': + write = 0; + break; case 'f': file = optarg; break; @@ -140,7 +147,7 @@ int main(int argc, char **argv) gup.nr_pages_per_call = nr_pages; if (write) - gup.flags |= FOLL_WRITE; + gup.gup_flags |= FOLL_WRITE; fd = open("/sys/kernel/debug/gup_test", O_RDWR); if (fd == -1) { -- cgit v1.2.3 From e44605a8b1aa13d892addc59ec3d416cb186c77b Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:27 -0700 Subject: selftests/vm: gup_test: test faulting in kernel, and verify pinnable pages When pages are pinned they can be faulted in userland and migrated, and they can be faulted right in kernel without migration. In either case, the pinned pages must end-up being pinnable (not movable). Add a new test to gup_test, to help verify that the gup/pup (get_user_pages() / pin_user_pages()) behavior with respect to pinnable and movable pages is reasonable and correct. Specifically, provide a way to: 1) Verify that only "pinnable" pages are pinned. This is checked automatically for you. 2) Verify that gup/pup performance is reasonable. This requires comparing benchmarks between doing gup/pup on pages that have been pre-faulted in from user space, vs. doing gup/pup on pages that are not faulted in until gup/pup time (via FOLL_TOUCH). This decision is controlled with the new -z command line option. Link: https://lkml.kernel.org/r/20210215161349.246722-15-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup_test.c | 6 ++++++ tools/testing/selftests/vm/gup_test.c | 23 +++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/mm/gup_test.c b/mm/gup_test.c index a6ed1c877679..d974dec19e1c 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -52,6 +52,12 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, dump_page(page, "gup_test failure"); break; + } else if (cmd == PIN_LONGTERM_BENCHMARK && + WARN(!is_pinnable_page(page), + "pages[%lu] is NOT pinnable but pinned\n", + i)) { + dump_page(page, "gup_test failure"); + break; } } break; diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 943cc2608dc2..1e662d59c502 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -13,6 +13,7 @@ /* Just the flags we need, copied from mm.h: */ #define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ static char *cmd_to_str(unsigned long cmd) { @@ -39,11 +40,11 @@ int main(int argc, char **argv) unsigned long size = 128 * MB; int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 1; unsigned long cmd = GUP_FAST_BENCHMARK; - int flags = MAP_PRIVATE; + int flags = MAP_PRIVATE, touch = 0; char *file = "/dev/zero"; char *p; - while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHp")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHpz")) != -1) { switch (opt) { case 'a': cmd = PIN_FAST_BENCHMARK; @@ -110,6 +111,10 @@ int main(int argc, char **argv) case 'H': flags |= (MAP_HUGETLB | MAP_ANONYMOUS); break; + case 'z': + /* fault pages in gup, do not fault in userland */ + touch = 1; + break; default: return -1; } @@ -167,8 +172,18 @@ int main(int argc, char **argv) else if (thp == 0) madvise(p, size, MADV_NOHUGEPAGE); - for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) - p[0] = 0; + /* + * FOLL_TOUCH, in gup_test, is used as an either/or case: either + * fault pages in from the kernel via FOLL_TOUCH, or fault them + * in here, from user space. This allows comparison of performance + * between those two cases. + */ + if (touch) { + gup.gup_flags |= FOLL_TOUCH; + } else { + for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) + p[0] = 0; + } /* Only report timing information on the *_BENCHMARK commands: */ if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || -- cgit v1.2.3 From 9683e5775c75097c46bd24e65411b16ac6c6cbb3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 4 May 2021 16:49:10 -0700 Subject: libbpf: Add NULL check to add_dummy_ksym_var Avoids a segv if btf isn't present. Seen on the call path __bpf_object__open calling bpf_object__collect_externs. Fixes: 5bd022ec01f0 (libbpf: Support extern kernel function) Suggested-by: Stanislav Fomichev Suggested-by: Petar Penkov Signed-off-by: Ian Rogers Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210504234910.976501-1-irogers@google.com --- tools/lib/bpf/libbpf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e2a3cf437814..c41d9b2b59ac 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3216,6 +3216,9 @@ static int add_dummy_ksym_var(struct btf *btf) const struct btf_var_secinfo *vs; const struct btf_type *sec; + if (!btf) + return 0; + sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC, BTF_KIND_DATASEC); if (sec_btf_id < 0) -- cgit v1.2.3 From d4455faccd6cbe11ddfdbe28723a2122453b4f4e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 6 May 2021 18:02:16 -0700 Subject: proc: mandate ->proc_lseek in "struct proc_ops" Now that proc_ops are separate from file_operations and other operations it easy to check all instances to have ->proc_lseek hook and remove check in main code. Note: nonseekable_open() files naturally don't require ->proc_lseek. Garbage collect pde_lseek() function. [adobriyan@gmail.com: smoke test lseek()] Link: https://lkml.kernel.org/r/YG4OIhChOrVTPgdN@localhost.localdomain Link: https://lkml.kernel.org/r/YFYX0Bzwxlc7aBa/@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/capi/kcapi_proc.c | 1 + drivers/net/wireless/intersil/hostap/hostap_proc.c | 1 + drivers/scsi/esas2r/esas2r_main.c | 1 + fs/proc/inode.c | 14 ++------------ include/linux/proc_fs.h | 1 + tools/testing/selftests/proc/read.c | 4 +++- 6 files changed, 9 insertions(+), 13 deletions(-) (limited to 'tools') diff --git a/drivers/isdn/capi/kcapi_proc.c b/drivers/isdn/capi/kcapi_proc.c index b5ed4ea145cb..77e951206809 100644 --- a/drivers/isdn/capi/kcapi_proc.c +++ b/drivers/isdn/capi/kcapi_proc.c @@ -201,6 +201,7 @@ static ssize_t empty_read(struct file *file, char __user *buf, static const struct proc_ops empty_proc_ops = { .proc_read = empty_read, + .proc_lseek = default_llseek, }; // --------------------------------------------------------------------------- diff --git a/drivers/net/wireless/intersil/hostap/hostap_proc.c b/drivers/net/wireless/intersil/hostap/hostap_proc.c index 97c270845fd1..51c847d98755 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_proc.c +++ b/drivers/net/wireless/intersil/hostap/hostap_proc.c @@ -227,6 +227,7 @@ static ssize_t prism2_aux_dump_proc_no_read(struct file *file, char __user *buf, static const struct proc_ops prism2_aux_dump_proc_ops = { .proc_read = prism2_aux_dump_proc_no_read, + .proc_lseek = default_llseek, }; diff --git a/drivers/scsi/esas2r/esas2r_main.c b/drivers/scsi/esas2r/esas2r_main.c index 5d9eeac6717a..45ec9f16c085 100644 --- a/drivers/scsi/esas2r/esas2r_main.c +++ b/drivers/scsi/esas2r/esas2r_main.c @@ -616,6 +616,7 @@ static const struct file_operations esas2r_proc_fops = { }; static const struct proc_ops esas2r_proc_ops = { + .proc_lseek = default_llseek, .proc_ioctl = esas2r_proc_ioctl, #ifdef CONFIG_COMPAT .proc_compat_ioctl = compat_ptr_ioctl, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index bde6b6f69852..2ea8aaa7206e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -273,25 +273,15 @@ void proc_entry_rundown(struct proc_dir_entry *de) spin_unlock(&de->pde_unload_lock); } -static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence) -{ - typeof_member(struct proc_ops, proc_lseek) lseek; - - lseek = pde->proc_ops->proc_lseek; - if (!lseek) - lseek = default_llseek; - return lseek(file, offset, whence); -} - static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (pde_is_permanent(pde)) { - return pde_lseek(pde, file, offset, whence); + return pde->proc_ops->proc_lseek(file, offset, whence); } else if (use_pde(pde)) { - rv = pde_lseek(pde, file, offset, whence); + rv = pde->proc_ops->proc_lseek(file, offset, whence); unuse_pde(pde); } return rv; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 000cc0533c33..069c7fd95396 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -32,6 +32,7 @@ struct proc_ops { ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *); ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); + /* mandatory unless nonseekable_open() or equivalent is used */ loff_t (*proc_lseek)(struct file *, loff_t, int); int (*proc_release)(struct inode *, struct file *); __poll_t (*proc_poll)(struct file *, struct poll_table_struct *); diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c index b3ef9e14d6cc..35ee78dff144 100644 --- a/tools/testing/selftests/proc/read.c +++ b/tools/testing/selftests/proc/read.c @@ -14,7 +14,7 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ // Test -// 1) read of every file in /proc +// 1) read and lseek on every file in /proc // 2) readlink of every symlink in /proc // 3) recursively (1) + (2) for every directory in /proc // 4) write to /proc/*/clear_refs and /proc/*/task/*/clear_refs @@ -45,6 +45,8 @@ static void f_reg(DIR *d, const char *filename) fd = openat(dirfd(d), filename, O_RDONLY|O_NONBLOCK); if (fd == -1) return; + /* struct proc_ops::proc_lseek is mandatory if file is seekable. */ + (void)lseek(fd, 0, SEEK_SET); rv = read(fd, buf, sizeof(buf)); assert((0 <= rv && rv <= sizeof(buf)) || rv == -1); close(fd); -- cgit v1.2.3 From 268af17ada5855a9b703995125a9920ac117b56b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 6 May 2021 18:02:21 -0700 Subject: selftests: proc: test subset=pid Test that /proc instance mounted with mount -t proc -o subset=pid contains only ".", "..", "self", "thread-self" and pid directories. Note: Currently "subset=pid" doesn't return "." and ".." via readdir. This must be a bug. Link: https://lkml.kernel.org/r/YFYZZ7WGaZlsnChS@localhost.localdomain Signed-off-by: Alexey Dobriyan Acked-by: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/Makefile | 1 + tools/testing/selftests/proc/proc-subset-pid.c | 121 +++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 tools/testing/selftests/proc/proc-subset-pid.c (limited to 'tools') diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 8be8a03d2973..1054e40a499a 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -12,6 +12,7 @@ TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 TEST_GEN_PROGS += proc-self-syscall TEST_GEN_PROGS += proc-self-wchan +TEST_GEN_PROGS += proc-subset-pid TEST_GEN_PROGS += proc-uptime-001 TEST_GEN_PROGS += proc-uptime-002 TEST_GEN_PROGS += read diff --git a/tools/testing/selftests/proc/proc-subset-pid.c b/tools/testing/selftests/proc/proc-subset-pid.c new file mode 100644 index 000000000000..d1052bcab039 --- /dev/null +++ b/tools/testing/selftests/proc/proc-subset-pid.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2021 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Test that "mount -t proc -o subset=pid" hides everything but pids, + * /proc/self and /proc/thread-self. + */ +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline bool streq(const char *a, const char *b) +{ + return strcmp(a, b) == 0; +} + +static void make_private_proc(void) +{ + if (unshare(CLONE_NEWNS) == -1) { + if (errno == ENOSYS || errno == EPERM) { + exit(4); + } + exit(1); + } + if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { + exit(1); + } + if (mount(NULL, "/proc", "proc", 0, "subset=pid") == -1) { + exit(1); + } +} + +static bool string_is_pid(const char *s) +{ + while (1) { + switch (*s++) { + case '0':case '1':case '2':case '3':case '4': + case '5':case '6':case '7':case '8':case '9': + continue; + + case '\0': + return true; + + default: + return false; + } + } +} + +int main(void) +{ + make_private_proc(); + + DIR *d = opendir("/proc"); + assert(d); + + struct dirent *de; + + bool dot = false; + bool dot_dot = false; + bool self = false; + bool thread_self = false; + + while ((de = readdir(d))) { + if (streq(de->d_name, ".")) { + assert(!dot); + dot = true; + assert(de->d_type == DT_DIR); + } else if (streq(de->d_name, "..")) { + assert(!dot_dot); + dot_dot = true; + assert(de->d_type == DT_DIR); + } else if (streq(de->d_name, "self")) { + assert(!self); + self = true; + assert(de->d_type == DT_LNK); + } else if (streq(de->d_name, "thread-self")) { + assert(!thread_self); + thread_self = true; + assert(de->d_type == DT_LNK); + } else { + if (!string_is_pid(de->d_name)) { + fprintf(stderr, "d_name '%s'\n", de->d_name); + assert(0); + } + assert(de->d_type == DT_DIR); + } + } + + char c; + int rv = readlink("/proc/cpuinfo", &c, 1); + assert(rv == -1 && errno == ENOENT); + + int fd = open("/proc/cpuinfo", O_RDONLY); + assert(fd == -1 && errno == ENOENT); + + return 0; +} -- cgit v1.2.3 From d1d1a2cd4627724c37539892db8efa611d2cbd70 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:02:42 -0700 Subject: tools: disable -Wno-type-limits Patch series "lib/find_bit: fast path for small bitmaps", v6. Bitmap operations are much simpler and faster in case of small bitmaps which fit into a single word. In linux/bitmap.c we have a machinery that allows compiler to replace actual function call with a few instructions if bitmaps passed into the function are small and their size is known at compile time. find_*_bit() API lacks this functionality; but users will benefit from it a lot. One important example is cpumask subsystem when NR_CPUS <= BITS_PER_LONG. This patch (of 12): GENMASK(h, l) may be passed with unsigned types. In such case, type-limits warning is generated for example in case of GENMASK(h, 0). Link: https://lkml.kernel.org/r/20210401003153.97325-1-yury.norov@gmail.com Link: https://lkml.kernel.org/r/20210401003153.97325-2-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/scripts/Makefile.include | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index 25adfec2cb39..f9271f3ea912 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -38,6 +38,7 @@ EXTRA_WARNINGS += -Wswitch-enum EXTRA_WARNINGS += -Wundef EXTRA_WARNINGS += -Wwrite-strings EXTRA_WARNINGS += -Wformat +EXTRA_WARNINGS += -Wno-type-limits # Makefiles suck: This macro sets a default value of $(2) for the # variable named by $(1), unless the variable has been set by -- cgit v1.2.3 From e5b9252d9000fc82324af5864701c1daffeebd7e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:02:46 -0700 Subject: tools: bitmap: sync function declarations with the kernel Some functions in tools/include/linux/bitmap.h declare nbits as int. In the kernel nbits is declared as unsigned int. Link: https://lkml.kernel.org/r/20210401003153.97325-3-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/linux/bitmap.h | 8 ++++---- tools/lib/bitmap.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 477a1cae513f..7cbd23e56d48 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -30,7 +30,7 @@ void bitmap_clear(unsigned long *map, unsigned int start, int len); #define small_const_nbits(nbits) \ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) -static inline void bitmap_zero(unsigned long *dst, int nbits) +static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = 0UL; @@ -66,7 +66,7 @@ static inline int bitmap_full(const unsigned long *src, unsigned int nbits) return find_first_zero_bit(src, nbits) == nbits; } -static inline int bitmap_weight(const unsigned long *src, int nbits) +static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); @@ -74,7 +74,7 @@ static inline int bitmap_weight(const unsigned long *src, int nbits) } static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; @@ -141,7 +141,7 @@ static inline void bitmap_free(unsigned long *bitmap) * @buf: buffer to store output * @size: size of @buf */ -size_t bitmap_scnprintf(unsigned long *bitmap, int nbits, +size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits, char *buf, size_t size); /** diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c index 5043747ef6c5..f4e914712b6f 100644 --- a/tools/lib/bitmap.c +++ b/tools/lib/bitmap.c @@ -28,11 +28,11 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, dst[k] = bitmap1[k] | bitmap2[k]; } -size_t bitmap_scnprintf(unsigned long *bitmap, int nbits, +size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits, char *buf, size_t size) { /* current bit is 'cur', most recently seen range is [rbot, rtop] */ - int cur, rbot, rtop; + unsigned int cur, rbot, rtop; bool first = true; size_t ret = 0; -- cgit v1.2.3 From a719101f19d2b4f107c8a79ed8b2866832a1816f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:02:49 -0700 Subject: tools: sync BITMAP_LAST_WORD_MASK() macro with the kernel Kernel version generates better code. Link: https://lkml.kernel.org/r/20210401003153.97325-4-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/linux/bitmap.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 7cbd23e56d48..4aabc23ec747 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -20,12 +20,7 @@ int __bitmap_equal(const unsigned long *bitmap1, void bitmap_clear(unsigned long *map, unsigned int start, int len); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) - -#define BITMAP_LAST_WORD_MASK(nbits) \ -( \ - ((nbits) % BITS_PER_LONG) ? \ - (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \ -) +#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) #define small_const_nbits(nbits) \ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) -- cgit v1.2.3 From 78e48f0667ff11ee444e057c757896062b6ad06b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:00 -0700 Subject: tools: sync small_const_nbits() macro with the kernel Sync implementation with the kernel and move the macro from tools/include/linux/bitmap.h to tools/include/asm-generic/bitsperlong.h Link: https://lkml.kernel.org/r/20210401003153.97325-7-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/asm-generic/bitsperlong.h | 3 +++ tools/include/linux/bitmap.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/include/asm-generic/bitsperlong.h b/tools/include/asm-generic/bitsperlong.h index 8f2283052333..2093d56ddd11 100644 --- a/tools/include/asm-generic/bitsperlong.h +++ b/tools/include/asm-generic/bitsperlong.h @@ -18,4 +18,7 @@ #define BITS_PER_LONG_LONG 64 #endif +#define small_const_nbits(nbits) \ + (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0) + #endif /* __ASM_GENERIC_BITS_PER_LONG */ diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 4aabc23ec747..330dbf7509cc 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -22,9 +22,6 @@ void bitmap_clear(unsigned long *map, unsigned int start, int len); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) -#define small_const_nbits(nbits) \ - (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) - static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { if (small_const_nbits(nbits)) -- cgit v1.2.3 From ea81c1ef441733ee779d776292d6269a97c5d2e1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:07 -0700 Subject: tools: sync find_next_bit implementation Sync the implementation with recent kernel changes. Link: https://lkml.kernel.org/r/20210401003153.97325-9-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/asm-generic/bitops/find.h | 27 +++++++++++++---- tools/lib/find_bit.c | 52 +++++++++++++-------------------- 2 files changed, 42 insertions(+), 37 deletions(-) (limited to 'tools') diff --git a/tools/include/asm-generic/bitops/find.h b/tools/include/asm-generic/bitops/find.h index 16ed1982cb34..9fe62d10b084 100644 --- a/tools/include/asm-generic/bitops/find.h +++ b/tools/include/asm-generic/bitops/find.h @@ -2,6 +2,10 @@ #ifndef _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ #define _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ +extern unsigned long _find_next_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert, unsigned long le); + #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region @@ -12,8 +16,12 @@ * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -extern unsigned long find_next_bit(const unsigned long *addr, unsigned long - size, unsigned long offset); +static inline +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); +} #endif #ifndef find_next_and_bit @@ -27,13 +35,16 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -extern unsigned long find_next_and_bit(const unsigned long *addr1, +static inline +unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, - unsigned long offset); + unsigned long offset) +{ + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); +} #endif #ifndef find_next_zero_bit - /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on @@ -43,8 +54,12 @@ extern unsigned long find_next_and_bit(const unsigned long *addr1, * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ +static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - unsigned long offset); + unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); +} #endif #ifndef find_first_bit diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c index ac37022e9486..589fd2f26f94 100644 --- a/tools/lib/find_bit.c +++ b/tools/lib/find_bit.c @@ -28,11 +28,12 @@ * searching it for one bits. * - The optional "addr2", which is anded with "addr1" if present. */ -static inline unsigned long _find_next_bit(const unsigned long *addr1, +unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, - unsigned long start, unsigned long invert) + unsigned long start, unsigned long invert, unsigned long le) { - unsigned long tmp; + unsigned long tmp, mask; + (void) le; if (unlikely(start >= nbits)) return nbits; @@ -43,7 +44,19 @@ static inline unsigned long _find_next_bit(const unsigned long *addr1, tmp ^= invert; /* Handle 1st word. */ - tmp &= BITMAP_FIRST_WORD_MASK(start); + mask = BITMAP_FIRST_WORD_MASK(start); + + /* + * Due to the lack of swab() in tools, and the fact that it doesn't + * need little-endian support, just comment it out + */ +#if (0) + if (le) + mask = swab(mask); +#endif + + tmp &= mask; + start = round_down(start, BITS_PER_LONG); while (!tmp) { @@ -57,18 +70,12 @@ static inline unsigned long _find_next_bit(const unsigned long *addr1, tmp ^= invert; } - return min(start + __ffs(tmp), nbits); -} +#if (0) + if (le) + tmp = swab(tmp); #endif -#ifndef find_next_bit -/* - * Find the next set bit in a memory region. - */ -unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - return _find_next_bit(addr, NULL, size, offset, 0UL); + return min(start + __ffs(tmp), nbits); } #endif @@ -105,20 +112,3 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) return size; } #endif - -#ifndef find_next_zero_bit -unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - return _find_next_bit(addr, NULL, size, offset, ~0UL); -} -#endif - -#ifndef find_next_and_bit -unsigned long find_next_and_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long size, - unsigned long offset) -{ - return _find_next_bit(addr1, addr2, size, offset, 0UL); -} -#endif -- cgit v1.2.3 From eaae7841ba83bb42dcac3177dc65f8dd974e6c0b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:18 -0700 Subject: tools: sync lib/find_bit implementation Add fast paths to find_*_bit() functions as per kernel implementation. Link: https://lkml.kernel.org/r/20210401003153.97325-12-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/asm-generic/bitops/find.h | 58 +++++++++++++++++++++++++++++++-- tools/lib/find_bit.c | 4 +-- 2 files changed, 57 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/include/asm-generic/bitops/find.h b/tools/include/asm-generic/bitops/find.h index 9fe62d10b084..6481fd11012a 100644 --- a/tools/include/asm-generic/bitops/find.h +++ b/tools/include/asm-generic/bitops/find.h @@ -5,6 +5,9 @@ extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start, unsigned long invert, unsigned long le); +extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifndef find_next_bit /** @@ -20,6 +23,16 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); } #endif @@ -40,6 +53,16 @@ unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr1 & *addr2 & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); } #endif @@ -58,6 +81,16 @@ static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); } #endif @@ -72,8 +105,17 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ -extern unsigned long find_first_bit(const unsigned long *addr, - unsigned long size); +static inline +unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_bit(addr, size); +} #endif /* find_first_bit */ @@ -87,7 +129,17 @@ extern unsigned long find_first_bit(const unsigned long *addr, * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ -unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size); +static inline +unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr | ~GENMASK(size - 1, 0); + + return val == ~0UL ? size : ffz(val); + } + + return _find_first_zero_bit(addr, size); +} #endif #endif /*_TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ */ diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c index 589fd2f26f94..109aa7ffcf97 100644 --- a/tools/lib/find_bit.c +++ b/tools/lib/find_bit.c @@ -83,7 +83,7 @@ unsigned long _find_next_bit(const unsigned long *addr1, /* * Find the first set bit in a memory region. */ -unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) { unsigned long idx; @@ -100,7 +100,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) /* * Find the first cleared bit in a memory region. */ -unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size) { unsigned long idx; -- cgit v1.2.3 From 1e3b918d1dd18bcea3df9339c2d8910ffa95686a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 6 May 2021 18:04:04 -0700 Subject: kselftest: introduce new epoll test case Patch series "fs/epoll: restore user-visible behavior upon event ready". This series tries to address a change in user visible behavior, reported in https://bugzilla.kernel.org/show_bug.cgi?id=208943. Epoll does not report an event to all the threads running epoll_wait() on the same epoll descriptor. Unsurprisingly, this was bisected back to 339ddb53d373 (fs/epoll: remove unnecessary wakeups of nested epoll), which has had various problems in the past, beyond only nested epoll usage. This patch (of 2): This incorporates the testcase originally reported in: https://bugzilla.kernel.org/show_bug.cgi?id=208943 Which ensures an event is reported to all threads blocked on the same epoll descriptor, otherwise only a single thread will receive the wakeup once the event become ready. Link: https://lkml.kernel.org/r/20210405231025.33829-1-dave@stgolabs.net Link: https://lkml.kernel.org/r/20210405231025.33829-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Jason Baron Cc: Roman Penyaev Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../filesystems/epoll/epoll_wakeup_test.c | 44 ++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c index ad7fabd575f9..65ede506305c 100644 --- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c +++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c @@ -3449,4 +3449,48 @@ TEST(epoll63) close(sfd[1]); } +/* + * t0 t1 + * (ew) \ / (ew) + * e0 + * | (lt) + * s0 + */ +TEST(epoll64) +{ + pthread_t waiter[2]; + struct epoll_event e; + struct epoll_mtcontext ctx = { 0 }; + + signal(SIGUSR1, signal_handler); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0); + + ctx.efd[0] = epoll_create(1); + ASSERT_GE(ctx.efd[0], 0); + + e.events = EPOLLIN; + ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0); + + /* + * main will act as the emitter once both waiter threads are + * blocked and expects to both be awoken upon the ready event. + */ + ctx.main = pthread_self(); + ASSERT_EQ(pthread_create(&waiter[0], NULL, waiter_entry1a, &ctx), 0); + ASSERT_EQ(pthread_create(&waiter[1], NULL, waiter_entry1a, &ctx), 0); + + usleep(100000); + ASSERT_EQ(write(ctx.sfd[1], "w", 1), 1); + + ASSERT_EQ(pthread_join(waiter[0], NULL), 0); + ASSERT_EQ(pthread_join(waiter[1], NULL), 0); + + EXPECT_EQ(ctx.count, 2); + + close(ctx.efd[0]); + close(ctx.sfd[0]); + close(ctx.sfd[1]); +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From 9c39c6ffe0c2945c7cf814814c096bc23b63f53d Mon Sep 17 00:00:00 2001 From: Zhang Yunkai Date: Thu, 6 May 2021 18:05:33 -0700 Subject: selftests: remove duplicate include 'assert.h' included in 'sparsebit.c' is duplicated. It is also included in the 161th line. 'string.h' included in 'mincore_selftest.c' is duplicated. It is also included in the 15th line. 'sched.h' included in 'tlbie_test.c' is duplicated. It is also included in the 33th line. Link: https://lkml.kernel.org/r/20210316073336.426255-1-zhang.yunkai@zte.com.cn Signed-off-by: Zhang Yunkai Cc: Paolo Bonzini Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/kvm/lib/sparsebit.c | 1 - tools/testing/selftests/mincore/mincore_selftest.c | 1 - tools/testing/selftests/powerpc/mm/tlbie_test.c | 1 - 3 files changed, 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c index 031ba3c932ed..a0d0c83d83de 100644 --- a/tools/testing/selftests/kvm/lib/sparsebit.c +++ b/tools/testing/selftests/kvm/lib/sparsebit.c @@ -1890,7 +1890,6 @@ void sparsebit_validate_internal(struct sparsebit *s) */ #include -#include struct range { sparsebit_idx_t first, last; diff --git a/tools/testing/selftests/mincore/mincore_selftest.c b/tools/testing/selftests/mincore/mincore_selftest.c index 5a1e85ff5d32..e54106643337 100644 --- a/tools/testing/selftests/mincore/mincore_selftest.c +++ b/tools/testing/selftests/mincore/mincore_selftest.c @@ -14,7 +14,6 @@ #include #include #include -#include #include "../kselftest.h" #include "../kselftest_harness.h" diff --git a/tools/testing/selftests/powerpc/mm/tlbie_test.c b/tools/testing/selftests/powerpc/mm/tlbie_test.c index f85a0938ab25..48344a74b212 100644 --- a/tools/testing/selftests/powerpc/mm/tlbie_test.c +++ b/tools/testing/selftests/powerpc/mm/tlbie_test.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From fa60ce2cb4506701c43bd4cf3ca23d970daf1b9c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 6 May 2021 18:06:44 -0700 Subject: treewide: remove editor modelines and cruft The section "19) Editor modelines and other cruft" in Documentation/process/coding-style.rst clearly says, "Do not include any of these in source files." I recently receive a patch to explicitly add a new one. Let's do treewide cleanups, otherwise some people follow the existing code and attempt to upstream their favoriate editor setups. It is even nicer if scripts/checkpatch.pl can check it. If we like to impose coding style in an editor-independent manner, I think editorconfig (patch [1]) is a saner solution. [1] https://lore.kernel.org/lkml/20200703073143.423557-1-danny@kdrag0n.dev/ Link: https://lkml.kernel.org/r/20210324054457.1477489-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Acked-by: Geert Uytterhoeven Reviewed-by: Miguel Ojeda [auxdisplay] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m68k/atari/time.c | 7 ------- arch/parisc/include/asm/pdc_chassis.h | 1 - arch/um/drivers/cow.h | 7 ------- drivers/auxdisplay/panel.c | 7 ------- drivers/gpu/drm/qxl/qxl_drv.c | 1 - drivers/media/usb/pwc/pwc-uncompress.c | 3 --- drivers/net/ethernet/adaptec/starfire.c | 8 -------- drivers/net/ethernet/amd/atarilance.c | 8 -------- drivers/net/ethernet/amd/pcnet32.c | 7 ------- .../net/wireless/intersil/orinoco/orinoco_nortel.c | 8 -------- drivers/net/wireless/intersil/orinoco/orinoco_pci.c | 8 -------- drivers/net/wireless/intersil/orinoco/orinoco_plx.c | 8 -------- drivers/net/wireless/intersil/orinoco/orinoco_tmd.c | 8 -------- drivers/parport/parport_ip32.c | 12 ------------ drivers/platform/x86/dell/dell_rbu.c | 3 --- drivers/scsi/53c700.c | 1 - drivers/scsi/53c700.h | 1 - drivers/scsi/ch.c | 6 ------ drivers/scsi/ips.c | 20 -------------------- drivers/scsi/ips.h | 20 -------------------- drivers/scsi/lasi700.c | 1 - drivers/scsi/megaraid/mbox_defs.h | 2 -- drivers/scsi/megaraid/mega_common.h | 2 -- drivers/scsi/megaraid/megaraid_mbox.c | 2 -- drivers/scsi/megaraid/megaraid_mbox.h | 2 -- drivers/scsi/qla1280.c | 12 ------------ drivers/scsi/sni_53c710.c | 1 - drivers/video/fbdev/matrox/matroxfb_base.c | 9 --------- drivers/video/fbdev/vga16fb.c | 10 ---------- fs/configfs/configfs_internal.h | 4 +--- fs/configfs/dir.c | 4 +--- fs/configfs/file.c | 4 +--- fs/configfs/inode.c | 4 +--- fs/configfs/item.c | 4 +--- fs/configfs/mount.c | 4 +--- fs/configfs/symlink.c | 4 +--- fs/nfs/dir.c | 7 ------- fs/nfs/nfs4proc.c | 6 ------ fs/nfs/nfs4renewd.c | 6 ------ fs/nfs/nfs4state.c | 6 ------ fs/nfs/nfs4xdr.c | 6 ------ fs/nfsd/nfs4proc.c | 6 ------ fs/nfsd/nfs4xdr.c | 6 ------ fs/nfsd/xdr4.h | 6 ------ fs/ocfs2/acl.c | 4 +--- fs/ocfs2/acl.h | 4 +--- fs/ocfs2/alloc.c | 4 +--- fs/ocfs2/alloc.h | 4 +--- fs/ocfs2/aops.c | 4 +--- fs/ocfs2/aops.h | 4 +--- fs/ocfs2/blockcheck.c | 4 +--- fs/ocfs2/blockcheck.h | 4 +--- fs/ocfs2/buffer_head_io.c | 4 +--- fs/ocfs2/buffer_head_io.h | 4 +--- fs/ocfs2/cluster/heartbeat.c | 4 +--- fs/ocfs2/cluster/heartbeat.h | 4 +--- fs/ocfs2/cluster/masklog.c | 4 +--- fs/ocfs2/cluster/masklog.h | 4 +--- fs/ocfs2/cluster/netdebug.c | 4 +--- fs/ocfs2/cluster/nodemanager.c | 4 +--- fs/ocfs2/cluster/nodemanager.h | 4 +--- fs/ocfs2/cluster/ocfs2_heartbeat.h | 4 +--- fs/ocfs2/cluster/ocfs2_nodemanager.h | 4 +--- fs/ocfs2/cluster/quorum.c | 4 +--- fs/ocfs2/cluster/quorum.h | 4 +--- fs/ocfs2/cluster/sys.c | 4 +--- fs/ocfs2/cluster/sys.h | 4 +--- fs/ocfs2/cluster/tcp.c | 4 +--- fs/ocfs2/cluster/tcp.h | 4 +--- fs/ocfs2/cluster/tcp_internal.h | 4 +--- fs/ocfs2/dcache.c | 4 +--- fs/ocfs2/dcache.h | 4 +--- fs/ocfs2/dir.c | 4 +--- fs/ocfs2/dir.h | 4 +--- fs/ocfs2/dlm/dlmapi.h | 4 +--- fs/ocfs2/dlm/dlmast.c | 4 +--- fs/ocfs2/dlm/dlmcommon.h | 4 +--- fs/ocfs2/dlm/dlmconvert.c | 4 +--- fs/ocfs2/dlm/dlmconvert.h | 4 +--- fs/ocfs2/dlm/dlmdebug.c | 4 +--- fs/ocfs2/dlm/dlmdebug.h | 4 +--- fs/ocfs2/dlm/dlmdomain.c | 4 +--- fs/ocfs2/dlm/dlmdomain.h | 4 +--- fs/ocfs2/dlm/dlmlock.c | 4 +--- fs/ocfs2/dlm/dlmmaster.c | 4 +--- fs/ocfs2/dlm/dlmrecovery.c | 4 +--- fs/ocfs2/dlm/dlmthread.c | 4 +--- fs/ocfs2/dlm/dlmunlock.c | 4 +--- fs/ocfs2/dlmfs/dlmfs.c | 4 +--- fs/ocfs2/dlmfs/userdlm.c | 4 +--- fs/ocfs2/dlmfs/userdlm.h | 4 +--- fs/ocfs2/dlmglue.c | 4 +--- fs/ocfs2/dlmglue.h | 4 +--- fs/ocfs2/export.c | 4 +--- fs/ocfs2/export.h | 4 +--- fs/ocfs2/extent_map.c | 4 +--- fs/ocfs2/extent_map.h | 4 +--- fs/ocfs2/file.c | 4 +--- fs/ocfs2/file.h | 4 +--- fs/ocfs2/filecheck.c | 4 +--- fs/ocfs2/filecheck.h | 4 +--- fs/ocfs2/heartbeat.c | 4 +--- fs/ocfs2/heartbeat.h | 4 +--- fs/ocfs2/inode.c | 4 +--- fs/ocfs2/inode.h | 4 +--- fs/ocfs2/journal.c | 4 +--- fs/ocfs2/journal.h | 4 +--- fs/ocfs2/localalloc.c | 4 +--- fs/ocfs2/localalloc.h | 4 +--- fs/ocfs2/locks.c | 4 +--- fs/ocfs2/locks.h | 4 +--- fs/ocfs2/mmap.c | 4 +--- fs/ocfs2/move_extents.c | 4 +--- fs/ocfs2/move_extents.h | 4 +--- fs/ocfs2/namei.c | 4 +--- fs/ocfs2/namei.h | 4 +--- fs/ocfs2/ocfs1_fs_compat.h | 4 +--- fs/ocfs2/ocfs2.h | 4 +--- fs/ocfs2/ocfs2_fs.h | 4 +--- fs/ocfs2/ocfs2_ioctl.h | 4 +--- fs/ocfs2/ocfs2_lockid.h | 4 +--- fs/ocfs2/ocfs2_lockingver.h | 4 +--- fs/ocfs2/refcounttree.c | 4 +--- fs/ocfs2/refcounttree.h | 4 +--- fs/ocfs2/reservations.c | 4 +--- fs/ocfs2/reservations.h | 4 +--- fs/ocfs2/resize.c | 4 +--- fs/ocfs2/resize.h | 4 +--- fs/ocfs2/slot_map.c | 4 +--- fs/ocfs2/slot_map.h | 4 +--- fs/ocfs2/stack_o2cb.c | 4 +--- fs/ocfs2/stack_user.c | 4 +--- fs/ocfs2/stackglue.c | 4 +--- fs/ocfs2/stackglue.h | 4 +--- fs/ocfs2/suballoc.c | 4 +--- fs/ocfs2/suballoc.h | 4 +--- fs/ocfs2/super.c | 4 +--- fs/ocfs2/super.h | 4 +--- fs/ocfs2/symlink.c | 4 +--- fs/ocfs2/symlink.h | 4 +--- fs/ocfs2/sysfile.c | 4 +--- fs/ocfs2/sysfile.h | 4 +--- fs/ocfs2/uptodate.c | 4 +--- fs/ocfs2/uptodate.h | 4 +--- fs/ocfs2/xattr.c | 4 +--- fs/ocfs2/xattr.h | 4 +--- fs/reiserfs/procfs.c | 10 ---------- include/linux/configfs.h | 4 +--- include/linux/genl_magic_func.h | 1 - include/linux/genl_magic_struct.h | 1 - include/uapi/linux/if_bonding.h | 11 ----------- include/uapi/linux/nfs4.h | 6 ------ include/xen/interface/elfnote.h | 10 ---------- include/xen/interface/hvm/hvm_vcpu.h | 10 ---------- include/xen/interface/io/xenbus.h | 10 ---------- samples/configfs/configfs_sample.c | 2 -- tools/usb/hcd-tests.sh | 2 -- 157 files changed, 110 insertions(+), 627 deletions(-) (limited to 'tools') diff --git a/arch/m68k/atari/time.c b/arch/m68k/atari/time.c index 1068670cb741..7e44d0e9d0f8 100644 --- a/arch/m68k/atari/time.c +++ b/arch/m68k/atari/time.c @@ -317,10 +317,3 @@ int atari_tt_hwclk( int op, struct rtc_time *t ) return( 0 ); } - -/* - * Local variables: - * c-indent-level: 4 - * tab-width: 8 - * End: - */ diff --git a/arch/parisc/include/asm/pdc_chassis.h b/arch/parisc/include/asm/pdc_chassis.h index ae3e108d22ad..d6d82f53d3d0 100644 --- a/arch/parisc/include/asm/pdc_chassis.h +++ b/arch/parisc/include/asm/pdc_chassis.h @@ -365,4 +365,3 @@ void parisc_pdc_chassis_init(void); PDC_CHASSIS_EOM_SET ) #endif /* _PARISC_PDC_CHASSIS_H */ -/* vim: set ts=8 */ diff --git a/arch/um/drivers/cow.h b/arch/um/drivers/cow.h index 103adac691ed..9a67c017000f 100644 --- a/arch/um/drivers/cow.h +++ b/arch/um/drivers/cow.h @@ -24,10 +24,3 @@ extern void cow_sizes(int version, __u64 size, int sectorsize, int align, int *data_offset_out); #endif - -/* - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c index ff5755ee5694..eba04c0de7eb 100644 --- a/drivers/auxdisplay/panel.c +++ b/drivers/auxdisplay/panel.c @@ -1737,10 +1737,3 @@ module_init(panel_init_module); module_exit(panel_cleanup_module); MODULE_AUTHOR("Willy Tarreau"); MODULE_LICENSE("GPL"); - -/* - * Local variables: - * c-indent-level: 4 - * tab-width: 8 - * End: - */ diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c index 1864467f1063..6754f578fed2 100644 --- a/drivers/gpu/drm/qxl/qxl_drv.c +++ b/drivers/gpu/drm/qxl/qxl_drv.c @@ -1,4 +1,3 @@ -/* vim: set ts=8 sw=8 tw=78 ai noexpandtab */ /* qxl_drv.c -- QXL driver -*- linux-c -*- * * Copyright 2011 Red Hat, Inc. diff --git a/drivers/media/usb/pwc/pwc-uncompress.c b/drivers/media/usb/pwc/pwc-uncompress.c index abfc88391036..68bc3829c6b3 100644 --- a/drivers/media/usb/pwc/pwc-uncompress.c +++ b/drivers/media/usb/pwc/pwc-uncompress.c @@ -9,9 +9,6 @@ Please send bug reports and support requests to . The decompression routines have been implemented by reverse-engineering the Nemosoft binary pwcx module. Caveat emptor. - - - vim: set ts=8: */ #include diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c index 555299737b51..7965e5e3c985 100644 --- a/drivers/net/ethernet/adaptec/starfire.c +++ b/drivers/net/ethernet/adaptec/starfire.c @@ -2070,11 +2070,3 @@ static void __exit starfire_cleanup (void) module_init(starfire_init); module_exit(starfire_cleanup); - - -/* - * Local variables: - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/net/ethernet/amd/atarilance.c b/drivers/net/ethernet/amd/atarilance.c index 961796abab35..c1eab916438f 100644 --- a/drivers/net/ethernet/amd/atarilance.c +++ b/drivers/net/ethernet/amd/atarilance.c @@ -1156,11 +1156,3 @@ static void __exit atarilance_module_exit(void) module_init(atarilance_module_init); module_exit(atarilance_module_exit); #endif /* MODULE */ - - -/* - * Local variables: - * c-indent-level: 4 - * tab-width: 4 - * End: - */ diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c index aa412506832d..4100ab07e6b7 100644 --- a/drivers/net/ethernet/amd/pcnet32.c +++ b/drivers/net/ethernet/amd/pcnet32.c @@ -3029,10 +3029,3 @@ static void __exit pcnet32_cleanup_module(void) module_init(pcnet32_init_module); module_exit(pcnet32_cleanup_module); - -/* - * Local variables: - * c-indent-level: 4 - * tab-width: 8 - * End: - */ diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c b/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c index 96a03d10a080..18bd0d9876c2 100644 --- a/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c +++ b/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c @@ -312,11 +312,3 @@ static void __exit orinoco_nortel_exit(void) module_init(orinoco_nortel_init); module_exit(orinoco_nortel_exit); - -/* - * Local variables: - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_pci.c b/drivers/net/wireless/intersil/orinoco/orinoco_pci.c index f3c86b07b1b9..7e3a6dd60c15 100644 --- a/drivers/net/wireless/intersil/orinoco/orinoco_pci.c +++ b/drivers/net/wireless/intersil/orinoco/orinoco_pci.c @@ -255,11 +255,3 @@ static void __exit orinoco_pci_exit(void) module_init(orinoco_pci_init); module_exit(orinoco_pci_exit); - -/* - * Local variables: - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_plx.c b/drivers/net/wireless/intersil/orinoco/orinoco_plx.c index 16dada94c774..73e6ae124013 100644 --- a/drivers/net/wireless/intersil/orinoco/orinoco_plx.c +++ b/drivers/net/wireless/intersil/orinoco/orinoco_plx.c @@ -360,11 +360,3 @@ static void __exit orinoco_plx_exit(void) module_init(orinoco_plx_init); module_exit(orinoco_plx_exit); - -/* - * Local variables: - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c b/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c index 9a9d335611ac..939d5a1dce97 100644 --- a/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c +++ b/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c @@ -235,11 +235,3 @@ static void __exit orinoco_tmd_exit(void) module_init(orinoco_tmd_init); module_exit(orinoco_tmd_exit); - -/* - * Local variables: - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/parport/parport_ip32.c b/drivers/parport/parport_ip32.c index 48b084e86dc6..0919ed99ba94 100644 --- a/drivers/parport/parport_ip32.c +++ b/drivers/parport/parport_ip32.c @@ -2224,15 +2224,3 @@ MODULE_PARM_DESC(features, ", bit 2: hardware SPP mode" ", bit 3: hardware EPP mode" ", bit 4: hardware ECP mode"); - -/*--- Inform (X)Emacs about preferred coding style ---------------------*/ -/* - * Local Variables: - * mode: c - * c-file-style: "linux" - * indent-tabs-mode: t - * tab-width: 8 - * fill-column: 78 - * ispell-local-dictionary: "american" - * End: - */ diff --git a/drivers/platform/x86/dell/dell_rbu.c b/drivers/platform/x86/dell/dell_rbu.c index 03c3ff34bcf5..085ad0a0d22e 100644 --- a/drivers/platform/x86/dell/dell_rbu.c +++ b/drivers/platform/x86/dell/dell_rbu.c @@ -675,6 +675,3 @@ static __exit void dcdrbu_exit(void) module_exit(dcdrbu_exit); module_init(dcdrbu_init); - -/* vim:noet:ts=8:sw=8 -*/ diff --git a/drivers/scsi/53c700.c b/drivers/scsi/53c700.c index ab42feab233f..77ccb96e5ed4 100644 --- a/drivers/scsi/53c700.c +++ b/drivers/scsi/53c700.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8 -*- */ /* NCR (or Symbios) 53c700 and 53c700-66 Driver * diff --git a/drivers/scsi/53c700.h b/drivers/scsi/53c700.h index c9f8c497babb..2df347ca91af 100644 --- a/drivers/scsi/53c700.h +++ b/drivers/scsi/53c700.h @@ -1,5 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* -*- mode: c; c-basic-offset: 8 -*- */ /* Driver for 53c700 and 53c700-66 chips from NCR and Symbios * diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c index cb74ab1ae5a4..9b89c26ccfdb 100644 --- a/drivers/scsi/ch.c +++ b/drivers/scsi/ch.c @@ -1058,9 +1058,3 @@ static void __exit exit_ch_module(void) module_init(init_ch_module); module_exit(exit_ch_module); - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 1a3c534826ba..bc33d54a4011 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -7099,23 +7099,3 @@ ips_init_phase2(int index) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IBM ServeRAID Adapter Driver " IPS_VER_STRING); MODULE_VERSION(IPS_VER_STRING); - - -/* - * Overrides for Emacs so that we almost follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 2 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -2 - * c-argdecl-indent: 2 - * c-label-offset: -2 - * c-continued-statement-offset: 2 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index 6c0678fb9a67..65edf000e447 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -1211,23 +1211,3 @@ typedef struct { IPS_COMPAT_TAMPA, \ IPS_COMPAT_KEYWEST \ } - - -/* - * Overrides for Emacs so that we almost follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 2 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -2 - * c-argdecl-indent: 2 - * c-label-offset: -2 - * c-continued-statement-offset: 2 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ diff --git a/drivers/scsi/lasi700.c b/drivers/scsi/lasi700.c index de71d240a56f..6d14a7a94d0b 100644 --- a/drivers/scsi/lasi700.c +++ b/drivers/scsi/lasi700.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8 -*- */ /* PARISC LASI driver for the 53c700 chip * diff --git a/drivers/scsi/megaraid/mbox_defs.h b/drivers/scsi/megaraid/mbox_defs.h index 01a1bfb8ea2a..f0ef8f7f82c1 100644 --- a/drivers/scsi/megaraid/mbox_defs.h +++ b/drivers/scsi/megaraid/mbox_defs.h @@ -781,5 +781,3 @@ typedef struct { } __attribute__ ((packed)) mbox_sgl32; #endif // _MRAID_MBOX_DEFS_H_ - -/* vim: set ts=8 sw=8 tw=78: */ diff --git a/drivers/scsi/megaraid/mega_common.h b/drivers/scsi/megaraid/mega_common.h index 3a7596e47a88..2ad0aa2f837d 100644 --- a/drivers/scsi/megaraid/mega_common.h +++ b/drivers/scsi/megaraid/mega_common.h @@ -282,5 +282,3 @@ struct mraid_pci_blk { }; #endif // _MEGA_COMMON_H_ - -// vim: set ts=8 sw=8 tw=78: diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c index b1a2d3536add..145fde302d7d 100644 --- a/drivers/scsi/megaraid/megaraid_mbox.c +++ b/drivers/scsi/megaraid/megaraid_mbox.c @@ -4068,5 +4068,3 @@ megaraid_sysfs_show_ldnum(struct device *dev, struct device_attribute *attr, cha */ module_init(megaraid_init); module_exit(megaraid_exit); - -/* vim: set ts=8 sw=8 tw=78 ai si: */ diff --git a/drivers/scsi/megaraid/megaraid_mbox.h b/drivers/scsi/megaraid/megaraid_mbox.h index 3e4347c6dab1..d2fe7f69cd5d 100644 --- a/drivers/scsi/megaraid/megaraid_mbox.h +++ b/drivers/scsi/megaraid/megaraid_mbox.h @@ -230,5 +230,3 @@ typedef struct { #define WROUTDOOR(rdev, value) writel(value, (rdev)->baseaddr + 0x2C) #endif // _MEGARAID_H_ - -// vim: set ts=8 sw=8 tw=78: diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 8f35174a1f9a..928da90b79be 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -4403,15 +4403,3 @@ MODULE_FIRMWARE("qlogic/1040.bin"); MODULE_FIRMWARE("qlogic/1280.bin"); MODULE_FIRMWARE("qlogic/12160.bin"); MODULE_VERSION(QLA1280_VERSION); - -/* - * Overrides for Emacs so that we almost follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/scsi/sni_53c710.c b/drivers/scsi/sni_53c710.c index 97c6f81b1d2a..678651b9b4dd 100644 --- a/drivers/scsi/sni_53c710.c +++ b/drivers/scsi/sni_53c710.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8 -*- */ /* SNI RM driver * diff --git a/drivers/video/fbdev/matrox/matroxfb_base.c b/drivers/video/fbdev/matrox/matroxfb_base.c index a3853421b263..4325bf7f388c 100644 --- a/drivers/video/fbdev/matrox/matroxfb_base.c +++ b/drivers/video/fbdev/matrox/matroxfb_base.c @@ -2608,12 +2608,3 @@ EXPORT_SYMBOL(matroxfb_register_driver); EXPORT_SYMBOL(matroxfb_unregister_driver); EXPORT_SYMBOL(matroxfb_wait_for_sync); EXPORT_SYMBOL(matroxfb_enable_irq); - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * --------------------------------------------------------------------------- - * Local variables: - * c-basic-offset: 8 - * End: - */ - diff --git a/drivers/video/fbdev/vga16fb.c b/drivers/video/fbdev/vga16fb.c index 1e8a38a7967d..e2757ff1c23d 100644 --- a/drivers/video/fbdev/vga16fb.c +++ b/drivers/video/fbdev/vga16fb.c @@ -1451,13 +1451,3 @@ MODULE_DESCRIPTION("Legacy VGA framebuffer device driver"); MODULE_LICENSE("GPL"); module_init(vga16fb_init); module_exit(vga16fb_exit); - - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * --------------------------------------------------------------------------- - * Local variables: - * c-basic-offset: 8 - * End: - */ - diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 9a3aed249692..c0395363eab9 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset:8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * configfs_internal.h - Internal stuff for configfs * * Based on sysfs: diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index b6098e02e20b..ac5e0c0e9181 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dir.c - Operations for configfs directories. * * Based on sysfs: diff --git a/fs/configfs/file.c b/fs/configfs/file.c index da8351d1e455..e26060dae70a 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * file.c - operations for regular (text) files. * * Based on sysfs: diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 42c348bb2903..eb5ec3e46283 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * inode.c - basic inode and dentry operations. * * Based on sysfs: diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 704a4356f137..254170a82aa3 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * item.c - library routines for handling generic config items * * Based on kobject: diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 0c6e8cf61953..c2d820063ec4 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * mount.c - operations for initializing and mounting configfs. * * Based on sysfs: diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 77c854364e60..0623c3edcfb9 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * symlink.c - operations for configfs symlinks. * * Based on sysfs: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fc4f490f2d78..3d8e3698d3df 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -3004,10 +3004,3 @@ out_notsup: goto out; } EXPORT_SYMBOL_GPL(nfs_permission); - -/* - * Local variables: - * version-control: t - * kept-new-versions: 5 - * End: - */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c65c4b41e2c1..545010d6cbf3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -10427,9 +10427,3 @@ const struct xattr_handler *nfs4_xattr_handlers[] = { #endif NULL }; - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index ff876dda7f06..db3811af0796 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -149,9 +149,3 @@ void nfs4_set_lease_period(struct nfs_client *clp, /* Cap maximum reconnect timeout at 1/2 lease period */ rpc_set_connect_timeout(clp->cl_rpcclient, lease, lease >> 1); } - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 3a51351bdc6a..2eec5bbb55c8 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2695,9 +2695,3 @@ static int nfs4_run_state_manager(void *ptr) module_put_and_exit(0); return 0; } - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index ac6b79ee9355..d4fd3be0e8ca 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7629,9 +7629,3 @@ const struct rpc_version nfs_version4 = { .procs = nfs4_procedures, .counts = nfs_version4_counts, }; - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index daf43b980d4b..f4ce93d7f26e 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -3317,9 +3317,3 @@ const struct svc_version nfsd_version4 = { .vs_rpcb_optnl = true, .vs_need_cong_ctrl = true, }; - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e0f06d3cbd44..7abeccb975b2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5448,9 +5448,3 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p) nfsd4_sequence_done(resp); return 1; } - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index fe540a3415c6..a7c425254fee 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -866,9 +866,3 @@ struct nfsd4_operation { #endif - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 5259badabb56..5c72a7e6d6c5 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * acl.c * * Copyright (C) 2004, 2008 Oracle. All rights reserved. diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 4e86450917b2..f59d8d0a61fa 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * acl.h * * Copyright (C) 2004, 2008 Oracle. All rights reserved. diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 78710788c237..e032f2e2c2c5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * alloc.c * * Extent allocs and frees diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 7f973dd76dbc..4af7abaa6e40 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * alloc.h * * Function prototypes diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ad20403b383f..1294925ac94a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 70ed4382750d..3a520117fa59 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index dabfef9c2bc0..863a5316030b 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * blockcheck.c * * Checksum and ECC codes for the OCFS2 userspace library. diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h index 8f17d2c85f40..d0578e98ee8d 100644 --- a/fs/ocfs2/blockcheck.h +++ b/fs/ocfs2/blockcheck.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * blockcheck.h * * Checksum and ECC codes for the OCFS2 userspace library. diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index f0b104e483d8..e7758778abef 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * io.c * * Buffer cache handling diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index 1c5e533fba04..2d51649fc090 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_buffer_head.h * * Buffer cache handling functions defined diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 12a7590601dd..e829c2595543 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index beed31ea86cf..1d4100abf6f8 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * heartbeat.h * * Function prototypes diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 1d696c96b8b2..810d32815593 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 446e452ac7a6..b73fc42e46ff 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 667a5c5e1f66..7524994e3199 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * netdebug.c * * debug functionality for o2net diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 7a7640c59f3c..bb82e6b1ff4e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index 3e0006631cc4..3490e77a952d 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * nodemanager.h * * Function prototypes diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h index 760d850be11e..6088c9f974dd 100644 --- a/fs/ocfs2/cluster/ocfs2_heartbeat.h +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_heartbeat.h * * On-disk structures for ocfs2_heartbeat diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h index 21ad307419a8..c9a0b77443e7 100644 --- a/fs/ocfs2/cluster/ocfs2_nodemanager.h +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_nodemanager.h * * Header describing the interface between userspace and the kernel diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index cea739be77c4..189c111bc371 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * - * vim: noexpandtab sw=8 ts=8 sts=0: +/* * * Copyright (C) 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h index 6d45ce8b18a1..d64bf4482a4a 100644 --- a/fs/ocfs2/cluster/quorum.h +++ b/fs/ocfs2/cluster/quorum.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index d6067c3d84c1..022f716c74ff 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * sys.c * * OCFS2 cluster sysfs interface diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h index ce380517cf17..70aaba65317e 100644 --- a/fs/ocfs2/cluster/sys.h +++ b/fs/ocfs2/cluster/sys.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * sys.h * * Function prototypes for o2cb sysfs interface diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 3bd8119bed5e..f660c0dbdb63 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * - * vim: noexpandtab sw=8 ts=8 sts=0: +/* * * Copyright (C) 2004 Oracle. All rights reserved. * diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 736338f45c59..a75b551d31c7 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * tcp.h * * Function prototypes diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index e6a2b9dfcd16..601c99bd2611 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 42a61eecdacd..04fc8344063a 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dcache.c * * dentry cache handling code diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index 3686a52ba143..7f246c5692d8 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dcache.h * * Function prototypes diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index bdfba9db558a..bd8d534f11cb 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dir.c * * Creates, reads, walks and deletes directory-nodes diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index e3e7d5dd29e8..4b9f5a12c7d2 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dir.h * * Function prototypes diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h index 6456c0fbcbb2..bae60ca2672a 100644 --- a/fs/ocfs2/dlm/dlmapi.h +++ b/fs/ocfs2/dlm/dlmapi.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmapi.h * * externally exported dlm interfaces diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 70a10764f249..c681ba957932 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmast.c * * AST and BAST functionality for local and remote nodes diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 58d57e25d384..fd2022712167 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmcommon.h * * Copyright (C) 2004 Oracle. All rights reserved. diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 6051edc33aef..450d46eefab3 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmconvert.c * * underlying calls for lock conversion diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h index 12d9c28bc52f..1f371716513b 100644 --- a/fs/ocfs2/dlm/dlmconvert.h +++ b/fs/ocfs2/dlm/dlmconvert.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmconvert.h * * Copyright (C) 2004 Oracle. All rights reserved. diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 4b8b41d23e91..d442cf5dda8a 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmdebug.c * * debug functionality for the dlm diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index f8fd8680a4b6..e08f7357e7ec 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmdebug.h * * Copyright (C) 2008 Oracle. All rights reserved. diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 357cfc702ce3..9f90fc9551e1 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmdomain.c * * defines domain join / leave apis diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h index 7c21664d23d0..815abe30ad09 100644 --- a/fs/ocfs2/dlm/dlmdomain.h +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmdomain.h * * Copyright (C) 2004 Oracle. All rights reserved. diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 83f0760e4fba..041fd1791ae7 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmlock.c * * underlying calls for lock creation diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index f105746063ed..4960a6de768d 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmmod.c * * standalone DLM module diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index afc51736686c..0e7aad1b11cc 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmrecovery.c * * recovery stuff diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 5ccc4ff0b82a..c350bd4df770 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmthread.c * * standalone DLM module diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index dcb17ca8ae74..61103b2d69fb 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmunlock.c * * underlying calls for unlocking locks diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b2870f1a31df..fa0a14f199eb 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmfs.c * * Code which implements the kernel side of a minimal userspace diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 339f098d9592..29f183a15798 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * userdlm.c * * Code which implements the kernel side of a minimal userspace diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h index 0558ae768200..47ba18eac423 100644 --- a/fs/ocfs2/dlmfs/userdlm.h +++ b/fs/ocfs2/dlmfs/userdlm.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * userdlm.h * * Userspace dlm defines diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 0fbe8bf7190f..48fd369c29a4 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmglue.c * * Code which implements an OCFS2 specific interface to our DLM. diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index b8fbed25df89..e5da5809ed95 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmglue.h * * description here diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 69ed278dd84d..eaa8c80ace3c 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * export.c * * Functions to facilitate NFS exporting diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h index d485da0c3439..636357400505 100644 --- a/fs/ocfs2/export.h +++ b/fs/ocfs2/export.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * export.h * * Function prototypes diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 7b93e9c766f6..70a768b623cf 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * extent_map.c * * Block/Cluster mapping functions diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index e5464f6cee8a..bc4ed59fb925 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * extent_map.h * * In-memory file extent mappings for OCFS2. diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index db8a6265b749..f17c3d33fb18 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * file.c * * File open, close, extend, truncate diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 8536cec5f122..71db8f3aa027 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * file.h * * Function prototypes diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 50f11bfdc8c2..90b8d300c1ee 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * filecheck.c * * Code which implements online file check. diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h index 4d006777ac54..d3bcb8bcfeb0 100644 --- a/fs/ocfs2/filecheck.h +++ b/fs/ocfs2/filecheck.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * filecheck.h * * Online file check. diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 60c5f995d30c..9099d8fc7599 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * heartbeat.c * * Register ourselves with the heartbaet service, keep our node maps diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h index 5fedb2d35dc0..f1f8b1802fe4 100644 --- a/fs/ocfs2/heartbeat.h +++ b/fs/ocfs2/heartbeat.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * heartbeat.h * * Function prototypes diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7c9dfd50c1c1..bc8f32fab964 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * inode.c * * vfs' aops, fops, dops and iops diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 51a4f7197987..82b28fdacc7e 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * inode.h * * Function prototypes diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index db52e843002a..4e589ce2fce6 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * journal.c * * Defines functions of journalling api diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index bfe611ed1b1d..d158acb8b38a 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * journal.h * * Defines journalling api and structures. diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index fc8252a28cb1..5f6bacbeef6b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * localalloc.c * * Node local data allocation diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index e8a5cea48639..08f925b7ec6d 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * localalloc.h * * Function prototypes diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 7edc4e5c7c2c..fab7c6a4a7d0 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * locks.c * * Userspace file locking support diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h index 389fe1fce3a5..b52de3947d5f 100644 --- a/fs/ocfs2/locks.h +++ b/fs/ocfs2/locks.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * locks.h * * Function prototypes for Userspace file locking support diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 25cabbfe87fc..1834f26522ed 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * mmap.c * * Code to deal with the mess that is clustered mmap. diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 758d9661ef1e..192cad0662d8 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * move_extents.c * * Copyright (C) 2011 Oracle. All rights reserved. diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h index 28cac43892c5..987f9e559f30 100644 --- a/fs/ocfs2/move_extents.h +++ b/fs/ocfs2/move_extents.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * move_extents.h * * Copyright (C) 2011 Oracle. All rights reserved. diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 05ced86580d1..2c46ff6ba4ea 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * namei.c * * Create and rename file, directory, symlinks diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index cc091ed02b4a..9cc891eb874e 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * namei.h * * Function prototypes diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h index 01ae48c4834d..6dbcf3d467fb 100644 --- a/fs/ocfs2/ocfs1_fs_compat.h +++ b/fs/ocfs2/ocfs1_fs_compat.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs1_fs_compat.h * * OCFS1 volume header definitions. OCFS2 creates valid but unmountable diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7993d527edae..bb62cc2e0211 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2.h * * Defines macros and structures used in OCFS2 diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 19137c6d087b..638d875eccc7 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_fs.h * * On-disk structures for OCFS2. diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 273616bd4f19..9680797bc531 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_ioctl.h * * Defines OCFS2 ioctls. diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index b4be84956bc1..8ac357ce6a30 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_lockid.h * * Defines OCFS2 lockid bits. diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h index 5c9c105b33ee..31a5e1619e7f 100644 --- a/fs/ocfs2/ocfs2_lockingver.h +++ b/fs/ocfs2/ocfs2_lockingver.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_lockingver.h * * Defines OCFS2 Locking version values. diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index c19a463fac55..7f6355cbb587 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * refcounttree.c * * Copyright (C) 2009 Oracle. All rights reserved. diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 0b9014495726..8197a94feec0 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * refcounttree.h * * Copyright (C) 2009 Oracle. All rights reserved. diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index bf3842e34fb9..769e466887b0 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * reservations.c * * Allocation reservations implementation diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 6ac88122896d..677c50663595 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * reservations.h * * Allocation reservations function prototypes and structures. diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 24eb52f9059c..d65d43c61857 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * resize.c * * volume resize. diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h index 0af0c023042c..4990637219ef 100644 --- a/fs/ocfs2/resize.h +++ b/fs/ocfs2/resize.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * resize.h * * Function prototypes diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 4da0e4b1e79b..0b0ae3ebb0cf 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * slot_map.c * * Copyright (C) 2002, 2004 Oracle. All rights reserved. diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h index 93b53e73f0f7..a43644570b53 100644 --- a/fs/ocfs2/slot_map.h +++ b/fs/ocfs2/slot_map.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * slotmap.h * * description here diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index f70012038383..88f75f7f02d7 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * stack_o2cb.c * * Code which interfaces ocfs2 with the o2cb stack. diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 7397064c3f35..85a47621e0c0 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * stack_user.c * * Code which interfaces ocfs2 with fs/dlm and a userspace stack. diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 8d33ebc6b6fc..d50e8b8dfea4 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * stackglue.c * * Code which implements an OCFS2 specific interface to underlying diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index e9d26cbeb3b8..3636847fae19 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * stackglue.h * * Glue to the underlying cluster stack. diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8c8cf7f4eb34..8521942f5af2 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * suballoc.c * * metadata alloc and free diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 50b36250beb6..5805a03d100b 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * suballoc.h * * Defines sub allocator api diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 079f8826993e..c86bd4e60e20 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * super.c * * load/unload driver, mount/dismount volumes diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 76facaf63336..8312651135b9 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * super.h * * Function prototypes diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 94cfacc9bad7..f755a4985821 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * linux/cluster/ssi/cfs/symlink.c * * This program is free software; you can redistribute it and/or diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h index 167094d1e5aa..ffcf0210545c 100644 --- a/fs/ocfs2/symlink.h +++ b/fs/ocfs2/symlink.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * symlink.h * * Function prototypes diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index bb701c4e449f..53a945da873b 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * sysfile.c * * Initialize, read, write, etc. system files. diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h index a83dd962fccb..2b38c75990fd 100644 --- a/fs/ocfs2/sysfile.h +++ b/fs/ocfs2/sysfile.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * sysfile.h * * Function prototypes diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 580852ba05c4..09854925fa5c 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * uptodate.c * * Tracking the up-to-date-ness of a local buffer_head with respect to diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index 77a30cae4879..85d94134001b 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * uptodate.h * * Cluster uptodate tracking diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 36ae47a4aef6..dd784eb0cd7c 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * xattr.c * * Copyright (C) 2004, 2008 Oracle. All rights reserved. diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 9c80382da1f5..00308b57f64f 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * xattr.h * * Copyright (C) 2004, 2008 Oracle. All rights reserved. diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 155b82870333..4a7cb16e9345 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -488,13 +488,3 @@ int reiserfs_proc_info_global_done(void) * (available at http://www.namesys.com/legalese.html) * */ - -/* - * Make Linus happy. - * Local variables: - * c-indentation-style: "K&R" - * mode-name: "LC" - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 2e8c69b43c64..97cfd13bae51 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * configfs.h - definitions for the device driver filesystem * * Based on sysfs: diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 6cb82301d8e9..939b1a8f571b 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -404,4 +404,3 @@ s_fields \ /* }}}1 */ #endif /* GENL_MAGIC_FUNC_H */ -/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */ diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h index 35d21fddaf2d..f81d48987528 100644 --- a/include/linux/genl_magic_struct.h +++ b/include/linux/genl_magic_struct.h @@ -283,4 +283,3 @@ enum { \ /* }}}1 */ #endif /* GENL_MAGIC_STRUCT_H */ -/* vim: set foldmethod=marker nofoldenable : */ diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index e8eb4ad03cf1..d174914a837d 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -153,14 +153,3 @@ enum { #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1) #endif /* _LINUX_IF_BONDING_H */ - -/* - * Local variables: - * version-control: t - * kept-new-versions: 5 - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ - diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index ed5415e0f1c1..800bb0ffa6e6 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -178,9 +178,3 @@ #define NFS4_MAX_BACK_CHANNEL_OPS 2 #endif /* _UAPI_LINUX_NFS4_H */ - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h index 9e9f9bf7c66d..449bd383cb76 100644 --- a/include/xen/interface/elfnote.h +++ b/include/xen/interface/elfnote.h @@ -208,13 +208,3 @@ #define XEN_ELFNOTE_MAX XEN_ELFNOTE_PHYS32_ENTRY #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/include/xen/interface/hvm/hvm_vcpu.h b/include/xen/interface/hvm/hvm_vcpu.h index 32ca83edd44d..bfc2138e0bf5 100644 --- a/include/xen/interface/hvm/hvm_vcpu.h +++ b/include/xen/interface/hvm/hvm_vcpu.h @@ -131,13 +131,3 @@ struct vcpu_hvm_context { typedef struct vcpu_hvm_context vcpu_hvm_context_t; #endif /* __XEN_PUBLIC_HVM_HVM_VCPU_H__ */ - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h index aaf2951b1cce..fb8716112251 100644 --- a/include/xen/interface/io/xenbus.h +++ b/include/xen/interface/io/xenbus.h @@ -39,13 +39,3 @@ enum xenbus_state }; #endif /* _XEN_PUBLIC_IO_XENBUS_H */ - -/* - * Local variables: - * c-file-style: "linux" - * indent-tabs-mode: t - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c index f9008be7a8a1..37a657b25d58 100644 --- a/samples/configfs/configfs_sample.c +++ b/samples/configfs/configfs_sample.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * vim: noexpandtab ts=8 sts=0 sw=8: - * * configfs_example_macros.c - This file is a demonstration module * containing a number of configfs subsystems. It uses the helper * macros defined by configfs.h diff --git a/tools/usb/hcd-tests.sh b/tools/usb/hcd-tests.sh index e8cad6a4f9c9..73f914d13f5c 100644 --- a/tools/usb/hcd-tests.sh +++ b/tools/usb/hcd-tests.sh @@ -272,5 +272,3 @@ do echo '' done done - -# vim: sw=4 -- cgit v1.2.3 From 32d1b3ab588c1231dbfa9eb08819c50529ce77d7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 5 May 2021 17:18:21 +0200 Subject: KVM: selftests: evmcs_test: Check that VMLAUNCH with bogus EVMPTR is causing #UD 'run->exit_reason == KVM_EXIT_SHUTDOWN' check is not ideal as we may be getting some unexpected exception. Directly check for #UD instead. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210505151823.1341678-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/evmcs_test.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index ca22ee6d19cb..b01f64ac6ce3 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -19,6 +19,14 @@ #define VCPU_ID 5 +static int ud_count; + +static void guest_ud_handler(struct ex_regs *regs) +{ + ud_count++; + regs->rip += 3; /* VMLAUNCH */ +} + void l2_guest_code(void) { GUEST_SYNC(7); @@ -71,11 +79,11 @@ void guest_code(struct vmx_pages *vmx_pages) if (vmx_pages) l1_guest_code(vmx_pages); - GUEST_DONE(); - /* Try enlightened vmptrld with an incorrect GPA */ evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs); GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(ud_count == 1); + GUEST_DONE(); } int main(int argc, char *argv[]) @@ -109,6 +117,10 @@ int main(int argc, char *argv[]) vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_handle_exception(vm, UD_VECTOR, guest_ud_handler); + for (stage = 1;; stage++) { _vcpu_run(vm, VCPU_ID); TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, @@ -124,7 +136,7 @@ int main(int argc, char *argv[]) case UCALL_SYNC: break; case UCALL_DONE: - goto part1_done; + goto done; default: TEST_FAIL("Unknown ucall %lu", uc.cmd); } @@ -156,10 +168,6 @@ int main(int argc, char *argv[]) (ulong) regs2.rdi, (ulong) regs2.rsi); } -part1_done: - _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, - "Unexpected successful VMEnter with invalid eVMCS pointer!"); - +done: kvm_vm_free(vm); } -- cgit v1.2.3 From c9ecafaf0113a305f5085ceb9c7a4b64ca70eae9 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 5 May 2021 17:18:22 +0200 Subject: KVM: selftests: evmcs_test: Check that VMCS12 is alway properly synced to eVMCS after restore Add a test for the regression, introduced by commit f2c7ef3ba955 ("KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES on nested vmexit"). When L2->L1 exit is forced immediately after restoring nested state, KVM_REQ_GET_NESTED_STATE_PAGES request is cleared and VMCS12 changes (e.g. fresh RIP) are not reflected to eVMCS. The consequent nested vCPU run gets broken. Utilize NMI injection to do the job. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210505151823.1341678-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/evmcs_test.c | 66 ++++++++++++++++++++----- 1 file changed, 55 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index b01f64ac6ce3..63096cea26c6 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -18,30 +18,52 @@ #include "vmx.h" #define VCPU_ID 5 +#define NMI_VECTOR 2 static int ud_count; +void enable_x2apic(void) +{ + uint32_t spiv_reg = APIC_BASE_MSR + (APIC_SPIV >> 4); + + wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) | + MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD); + wrmsr(spiv_reg, rdmsr(spiv_reg) | APIC_SPIV_APIC_ENABLED); +} + static void guest_ud_handler(struct ex_regs *regs) { ud_count++; regs->rip += 3; /* VMLAUNCH */ } +static void guest_nmi_handler(struct ex_regs *regs) +{ +} + void l2_guest_code(void) { GUEST_SYNC(7); GUEST_SYNC(8); + /* Forced exit to L1 upon restore */ + GUEST_SYNC(9); + /* Done, exit to L1 and never come back. */ vmcall(); } -void l1_guest_code(struct vmx_pages *vmx_pages) +void guest_code(struct vmx_pages *vmx_pages) { #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + enable_x2apic(); + + GUEST_SYNC(1); + GUEST_SYNC(2); + enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist); GUEST_ASSERT(vmx_pages->vmcs_gpa); @@ -63,21 +85,22 @@ void l1_guest_code(struct vmx_pages *vmx_pages) current_evmcs->revision_id = EVMCS_VERSION; GUEST_SYNC(6); + current_evmcs->pin_based_vm_exec_control |= + PIN_BASED_NMI_EXITING; GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); - GUEST_SYNC(9); + + /* + * NMI forces L2->L1 exit, resuming L2 and hope that EVMCS is + * up-to-date (RIP points where it should and not at the beginning + * of l2_guest_code(). GUEST_SYNC(9) checkes that. + */ GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - GUEST_SYNC(10); -} -void guest_code(struct vmx_pages *vmx_pages) -{ - GUEST_SYNC(1); - GUEST_SYNC(2); + GUEST_SYNC(10); - if (vmx_pages) - l1_guest_code(vmx_pages); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_SYNC(11); /* Try enlightened vmptrld with an incorrect GPA */ evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs); @@ -86,6 +109,18 @@ void guest_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } +void inject_nmi(struct kvm_vm *vm) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vm, VCPU_ID, &events); + + events.nmi.pending = 1; + events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING; + + vcpu_events_set(vm, VCPU_ID, &events); +} + int main(int argc, char *argv[]) { vm_vaddr_t vmx_pages_gva = 0; @@ -120,6 +155,9 @@ int main(int argc, char *argv[]) vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vm, VCPU_ID); vm_handle_exception(vm, UD_VECTOR, guest_ud_handler); + vm_handle_exception(vm, NMI_VECTOR, guest_nmi_handler); + + pr_info("Running L1 which uses EVMCS to run L2\n"); for (stage = 1;; stage++) { _vcpu_run(vm, VCPU_ID); @@ -166,6 +204,12 @@ int main(int argc, char *argv[]) TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); + + /* Force immediate L2->L1 exit before resuming */ + if (stage == 8) { + pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n"); + inject_nmi(vm); + } } done: -- cgit v1.2.3 From 5f443e424efab56baa8021da04878f88eb0815d4 Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Thu, 10 Dec 2020 17:23:17 -0800 Subject: selftests: kvm: remove reassignment of non-absolute variables Clang's integrated assembler does not allow symbols with non-absolute values to be reassigned. Modify the interrupt entry loop macro to be compatible with IAS by using a label and an offset. Cc: Jian Cai Signed-off-by: Bill Wendling References: https://lore.kernel.org/lkml/20200714233024.1789985-1-caij2003@gmail.com/ Message-Id: <20201211012317.3722214-1-morbo@google.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/x86_64/handlers.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/x86_64/handlers.S b/tools/testing/selftests/kvm/lib/x86_64/handlers.S index aaf7bc7d2ce1..7629819734af 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/handlers.S +++ b/tools/testing/selftests/kvm/lib/x86_64/handlers.S @@ -54,9 +54,9 @@ idt_handlers: .align 8 /* Fetch current address and append it to idt_handlers. */ - current_handler = . +666 : .pushsection .rodata -.quad current_handler + .quad 666b .popsection .if ! \has_error -- cgit v1.2.3 From b26990987ffce0525abbd84b36595869cfdbbfe6 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 6 May 2021 16:03:52 +0200 Subject: tools/kvm_stat: Fix documentation typo Makes the dash in front of option '-z' disappear in the generated man-page. Signed-off-by: Stefan Raspl Message-Id: <20210506140352.4178789-1-raspl@linux.ibm.com> Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt index feaf46451e83..3a9f2037bd23 100644 --- a/tools/kvm/kvm_stat/kvm_stat.txt +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -111,7 +111,7 @@ OPTIONS --tracepoints:: retrieve statistics from tracepoints -*z*:: +-z:: --skip-zero-records:: omit records with all zeros in logging mode -- cgit v1.2.3 From 059e5c321a65657877924256ea8ad9c0df257b45 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Apr 2021 06:16:36 -0500 Subject: x86/msr: Rename MSR_K8_SYSCFG to MSR_AMD64_SYSCFG The SYSCFG MSR continued being updated beyond the K8 family; drop the K8 name from it. Suggested-by: Borislav Petkov Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/20210427111636.1207-4-brijesh.singh@amd.com --- Documentation/virt/kvm/amd-memory-encryption.rst | 2 +- Documentation/x86/amd-memory-encryption.rst | 6 +++--- arch/x86/include/asm/msr-index.h | 6 +++--- arch/x86/kernel/cpu/amd.c | 4 ++-- arch/x86/kernel/cpu/mtrr/cleanup.c | 2 +- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/mmconf-fam10h_64.c | 2 +- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- arch/x86/mm/mem_encrypt_identity.c | 6 +++--- arch/x86/pci/amd_bus.c | 2 +- arch/x86/realmode/rm/trampoline_64.S | 4 ++-- drivers/edac/amd64_edac.c | 2 +- tools/arch/x86/include/asm/msr-index.h | 6 +++--- 14 files changed, 26 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst index 5ec8a1902e15..5c081c8c7164 100644 --- a/Documentation/virt/kvm/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/amd-memory-encryption.rst @@ -22,7 +22,7 @@ to SEV:: [ecx]: Bits[31:0] Number of encrypted guests supported simultaneously -If support for SEV is present, MSR 0xc001_0010 (MSR_K8_SYSCFG) and MSR 0xc001_0015 +If support for SEV is present, MSR 0xc001_0010 (MSR_AMD64_SYSCFG) and MSR 0xc001_0015 (MSR_K7_HWCR) can be used to determine if it can be enabled:: 0xc001_0010: diff --git a/Documentation/x86/amd-memory-encryption.rst b/Documentation/x86/amd-memory-encryption.rst index c48d452d0718..a1940ebe7be5 100644 --- a/Documentation/x86/amd-memory-encryption.rst +++ b/Documentation/x86/amd-memory-encryption.rst @@ -53,7 +53,7 @@ CPUID function 0x8000001f reports information related to SME:: system physical addresses, not guest physical addresses) -If support for SME is present, MSR 0xc00100010 (MSR_K8_SYSCFG) can be used to +If support for SME is present, MSR 0xc00100010 (MSR_AMD64_SYSCFG) can be used to determine if SME is enabled and/or to enable memory encryption:: 0xc0010010: @@ -79,7 +79,7 @@ The state of SME in the Linux kernel can be documented as follows: The CPU supports SME (determined through CPUID instruction). - Enabled: - Supported and bit 23 of MSR_K8_SYSCFG is set. + Supported and bit 23 of MSR_AMD64_SYSCFG is set. - Active: Supported, Enabled and the Linux kernel is actively applying @@ -89,7 +89,7 @@ The state of SME in the Linux kernel can be documented as follows: SME can also be enabled and activated in the BIOS. If SME is enabled and activated in the BIOS, then all memory accesses will be encrypted and it will not be necessary to activate the Linux memory encryption support. If the BIOS -merely enables SME (sets bit 23 of the MSR_K8_SYSCFG), then Linux can activate +merely enables SME (sets bit 23 of the MSR_AMD64_SYSCFG), then Linux can activate memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or by supplying mem_encrypt=on on the kernel command line. However, if BIOS does not enable SME, then Linux will not be able to activate memory encryption, even diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 742d89a00721..211ba3375ee9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -537,9 +537,9 @@ /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d -#define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 -#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG 0xc0010010 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2d11384dc9ab..0adb0341cd7c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -593,8 +593,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) */ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { /* Check if memory encryption is enabled */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) goto clear_all; /* diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 0c3b372318b7..b5f43049fa5f 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -836,7 +836,7 @@ int __init amd_special_default_mtrr(void) if (boot_cpu_data.x86 < 0xf) return 0; /* In case some hypervisor doesn't pass SYSCFG through: */ - if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + if (rdmsr_safe(MSR_AMD64_SYSCFG, &l, &h) < 0) return 0; /* * Memory between 4GB and top of mem is forced WB by this magic bit. diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index b90f3f437765..558108296f3c 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -53,13 +53,13 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; - rdmsr(MSR_K8_SYSCFG, lo, hi); + rdmsr(MSR_AMD64_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" " not cleared by BIOS, clearing this bit\n", smp_processor_id()); lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; - mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi); + mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi); } } diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index b5cb49e57df8..c94dec6a1834 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -95,7 +95,7 @@ static void get_fam10h_pci_mmconf_base(void) return; /* SYS_CFG */ - address = MSR_K8_SYSCFG; + address = MSR_AMD64_SYSCFG; rdmsrl(address, val); /* TOP_MEM2 is not enabled? */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b649f92287a2..433e8e4fb3a6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -858,8 +858,8 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; enc_bit = cpuid_ebx(0x8000001f) & 0x3f; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6eda2834fc05..853c40e89335 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3402,7 +3402,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: - case MSR_K8_SYSCFG: + case MSR_AMD64_SYSCFG: case MSR_K8_TSEG_ADDR: case MSR_K8_TSEG_MASK: case MSR_VM_HSAVE_PA: diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 04aba7e80a36..a9639f663d25 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -529,7 +529,7 @@ void __init sme_enable(struct boot_params *bp) /* * No SME if Hypervisor bit is set. This check is here to * prevent a guest from trying to enable SME. For running as a - * KVM guest the MSR_K8_SYSCFG will be sufficient, but there + * KVM guest the MSR_AMD64_SYSCFG will be sufficient, but there * might be other hypervisors which emulate that MSR as non-zero * or even pass it through to the guest. * A malicious hypervisor can still trick a guest into this @@ -542,8 +542,8 @@ void __init sme_enable(struct boot_params *bp) return; /* For SME, check the SYSCFG MSR */ - msr = __rdmsr(MSR_K8_SYSCFG); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + msr = __rdmsr(MSR_AMD64_SYSCFG); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; } else { /* SEV state cannot be controlled by a command line option */ diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index ae744b6a0785..dd40d3fea74e 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -284,7 +284,7 @@ static int __init early_root_info_init(void) /* need to take out [4G, TOM2) for RAM*/ /* SYS_CFG */ - address = MSR_K8_SYSCFG; + address = MSR_AMD64_SYSCFG; rdmsrl(address, val); /* TOP_MEM2 is enabled? */ if (val & (1<<21)) { diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 84c5d1b33d10..cc8391f86cdb 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -123,9 +123,9 @@ SYM_CODE_START(startup_32) */ btl $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags jnc .Ldone - movl $MSR_K8_SYSCFG, %ecx + movl $MSR_AMD64_SYSCFG, %ecx rdmsr - bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax + bts $MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT, %eax jc .Ldone /* diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9fa4dfc6ebee..f0d8f60acee1 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3083,7 +3083,7 @@ static void read_mc_regs(struct amd64_pvt *pvt) edac_dbg(0, " TOP_MEM: 0x%016llx\n", pvt->top_mem); /* Check first whether TOP_MEM2 is enabled: */ - rdmsrl(MSR_K8_SYSCFG, msr_val); + rdmsrl(MSR_AMD64_SYSCFG, msr_val); if (msr_val & BIT(21)) { rdmsrl(MSR_K8_TOP_MEM2, pvt->top_mem2); edac_dbg(0, " TOP_MEM2: 0x%016llx\n", pvt->top_mem2); diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 45029354e0a8..c60b09e7602f 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -533,9 +533,9 @@ /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d -#define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 -#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG 0xc0010010 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 -- cgit v1.2.3 From a1bed090fc56e6e24517d96bc076595544fb5317 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 7 May 2021 17:25:42 +0100 Subject: kselftest/arm64: Add missing stddef.h include to BTI tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explicitly include stddef.h when building the BTI tests so that we have a definition of NULL, with at least some toolchains this is not done implicitly by anything else: test.c: In function ‘start’: test.c:214:25: error: ‘NULL’ undeclared (first use in this function) 214 | sigaction(SIGILL, &sa, NULL); | ^~~~ test.c:20:1: note: ‘NULL’ is defined in header ‘’; did you forget to ‘#include ’? Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20210507162542.23149-1-broonie@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/bti/test.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/arm64/bti/test.c b/tools/testing/selftests/arm64/bti/test.c index 656b04976ccc..67b77ab83c20 100644 --- a/tools/testing/selftests/arm64/bti/test.c +++ b/tools/testing/selftests/arm64/bti/test.c @@ -6,6 +6,7 @@ #include "system.h" +#include #include #include #include -- cgit v1.2.3 From a3bc4ffeedf4693262fe7c6d133dcfcacd3d18c2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 3 May 2021 11:48:26 -0300 Subject: tools headers UAPI: Update tools's copy of drm.h headers Picking the changes from: b603e810f740e76b ("drm/uapi: document kernel capabilities") Doesn't result in any tooling changes: $ tools/perf/trace/beauty/drm_ioctl.sh > before $ cp include/uapi/drm/drm.h tools/include/uapi/drm/drm.h $ tools/perf/trace/beauty/drm_ioctl.sh > after $ diff -u before after Silencing these perf build warnings: Warning: Kernel ABI header at 'tools/include/uapi/drm/drm.h' differs from latest version at 'include/uapi/drm/drm.h' diff -u tools/include/uapi/drm/drm.h include/uapi/drm/drm.h Cc: Simon Ser Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/drm.h | 125 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 121 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h index 0827037c5484..67b94bc3c885 100644 --- a/tools/include/uapi/drm/drm.h +++ b/tools/include/uapi/drm/drm.h @@ -625,30 +625,147 @@ struct drm_gem_open { __u64 size; }; +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ #define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a CRTC index in the high bits of + * &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ #define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ #define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ #define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. + * + * PRIME buffers are exposed as dma-buf file descriptors. See + * Documentation/gpu/drm-mm.rst, section "PRIME Buffer Sharing". + */ #define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + */ #define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + */ #define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. Starting kernel version 4.15, this capability is always set to 1. + */ #define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC. + */ #define DRM_CAP_ASYNC_PAGE_FLIP 0x7 -/* - * The CURSOR_WIDTH and CURSOR_HEIGHT capabilities return a valid widthxheight - * combination for the hardware cursor. The intention is that a hardware - * agnostic userspace can query a cursor plane size to use. +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. * * Note that the cross-driver contract is to merely return a valid size; * drivers are free to attach another meaning on top, eg. i915 returns the * maximum plane size. */ #define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ #define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ #define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ #define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ #define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See + * Documentation/gpu/drm-mm.rst, section "DRM Sync Objects". + */ #define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * Documentation/gpu/drm-mm.rst, section "DRM Sync Objects". + */ #define DRM_CAP_SYNCOBJ_TIMELINE 0x14 /* DRM_IOCTL_GET_CAP ioctl argument type */ -- cgit v1.2.3 From 0fdee797d60d71e5a6fd59aa573d84a858e715dd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 3 May 2021 11:51:17 -0300 Subject: tools headers UAPI: Sync drm/i915_drm.h with the kernel sources To pick the changes in: b5b6f6a610127b17 ("drm/i915/gem: Drop legacy execbuffer support (v2)") That don't result in any change in tooling as this is just adding a comment. Only silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/drm/i915_drm.h' differs from latest version at 'include/uapi/drm/i915_drm.h' diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Cc: Daniel Vetter Cc: Jason Ekstrand Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/i915_drm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index 1987e2ea79a3..ddc47bbf48b6 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -943,6 +943,7 @@ struct drm_i915_gem_exec_object { __u64 offset; }; +/* DRM_IOCTL_I915_GEM_EXECBUFFER was removed in Linux 5.13 */ struct drm_i915_gem_execbuffer { /** * List of buffers to be validated with their relocations to be -- cgit v1.2.3 From b3172585b13d7171c32cfabdf938eca7fdfe9b31 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 3 May 2021 11:53:37 -0300 Subject: tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes from these csets: d0946a882e622022 ("perf/x86/intel: Hybrid PMU support for perf capabilities") That cause no changes to tooling as it isn't adding any new MSR, just some capabilities for a pre-existing one: $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after $ diff -u before after $ Just silences this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/msr-index.h' differs from latest version at 'arch/x86/include/asm/msr-index.h' diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Cc: Kan Liang Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/msr-index.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 45029354e0a8..742d89a00721 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -185,6 +185,9 @@ #define MSR_PEBS_DATA_CFG 0x000003f2 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 +#define PERF_CAP_METRICS_IDX 15 +#define PERF_CAP_PT_IDX 16 + #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 #define MSR_IA32_RTIT_CTL 0x00000570 @@ -265,6 +268,7 @@ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ #define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ +#define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2) #define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_BTS (1UL << 7) #define DEBUGCTLMSR_BTINT (1UL << 8) -- cgit v1.2.3 From e8c1167606c63fd8f9934d0b6ce80281463a4945 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 2 Apr 2021 18:40:20 +0900 Subject: perf record: Disallow -c and -F option at the same time It's confusing which one is effective when the both options are given. The current code happens to use -c in this case but users might not be aware of it. We can change it to complain about that instead of relying on the implicit priority. Before: $ perf record -c 111111 -F 99 true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.031 MB perf.data (8 samples) ] $ perf evlist -F cycles: sample_period=111111 $ After: $ perf record -c 111111 -F 99 true cannot set frequency and period at the same time $ So this change can break existing usages, but I think it's rare to have both options and it'd be better changing them. Suggested-by: Alexey Alexandrov Signed-off-by: Namhyung Kim Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210402094020.28164-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/record.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index f99852d54b14..43e5b563dee8 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -157,9 +157,15 @@ static int get_max_rate(unsigned int *rate) static int record_opts__config_freq(struct record_opts *opts) { bool user_freq = opts->user_freq != UINT_MAX; + bool user_interval = opts->user_interval != ULLONG_MAX; unsigned int max_rate; - if (opts->user_interval != ULLONG_MAX) + if (user_interval && user_freq) { + pr_err("cannot set frequency and period at the same time\n"); + return -1; + } + + if (user_interval) opts->default_interval = opts->user_interval; if (user_freq) opts->freq = opts->user_freq; -- cgit v1.2.3 From 7aa3c9eabdf76017679e975e2ffd50cde3c010b8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 6 May 2021 15:56:40 -0700 Subject: perf jevents: Silence warning for ArchStd files JSON files in the level 1 directory are used for ArchStd events (see preprocess_arch_std_files), as such they shouldn't be warned about. Signed-off-by: Ian Rogers Reviewed-by: John Garry Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Joakim Zhang Cc: Kajol Jain Cc: Kim Phillips Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210506225640.1461000-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index ed4f0bd72e5a..7422b0ea8790 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -1123,8 +1123,10 @@ static int process_one_file(const char *fpath, const struct stat *sb, mapfile = strdup(fpath); return 0; } - - pr_info("%s: Ignoring file %s\n", prog, fpath); + if (is_json_file(bname)) + pr_debug("%s: ArchStd json is preprocessed %s\n", prog, fpath); + else + pr_info("%s: Ignoring file %s\n", prog, fpath); return 0; } -- cgit v1.2.3 From a11c9a6e472457cf9eeafb585fc5c912f51d1b23 Mon Sep 17 00:00:00 2001 From: Dmitry Koshelev Date: Thu, 6 May 2021 13:11:49 +0000 Subject: perf session: Fix swapping of cpu_map and stat_config records 'data' field in perf_record_cpu_map_data struct is 16-bit wide and so should be swapped using bswap_16(). 'nr' field in perf_record_stat_config struct should be swapped before being used for size calculation. Signed-off-by: Dmitry Koshelev Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Kan Liang Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210506131244.13328-1-karaghiozis@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index a12cf4f0e97a..106b3d60881a 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -904,7 +904,7 @@ static void perf_event__cpu_map_swap(union perf_event *event, struct perf_record_record_cpu_map *mask; unsigned i; - data->type = bswap_64(data->type); + data->type = bswap_16(data->type); switch (data->type) { case PERF_CPU_MAP__CPUS: @@ -937,7 +937,7 @@ static void perf_event__stat_config_swap(union perf_event *event, { u64 size; - size = event->stat_config.nr * sizeof(event->stat_config.data[0]); + size = bswap_64(event->stat_config.nr) * sizeof(event->stat_config.data[0]); size += 1; /* nr item itself */ mem_bswap_64(&event->stat_config.nr, size); } -- cgit v1.2.3 From ad1237c30d975535a669746496cbed136aa5a045 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 8 May 2021 22:50:20 +0200 Subject: perf tools: Fix dynamic libbpf link Justin reported broken build with LIBBPF_DYNAMIC=1. When linking libbpf dynamically we need to use perf's hashmap object, because it's not exported in libbpf.so (only in libbpf.a). Following build is now passing: $ make LIBBPF_DYNAMIC=1 BUILD: Doing 'make -j8' parallel build ... $ ldd perf | grep libbpf libbpf.so.0 => /lib64/libbpf.so.0 (0x00007fa7630db000) Fixes: eee19501926d ("perf tools: Grab a copy of libbpf's hashmap") Reported-by: Justin M. Forbes Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Mark Rutland Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210508205020.617984-1-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 1 + tools/perf/util/Build | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'tools') diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 0d6619064a83..406a9519145e 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -540,6 +540,7 @@ ifndef NO_LIBELF ifdef LIBBPF_DYNAMIC ifeq ($(feature-libbpf), 1) EXTLIBS += -lbpf + $(call detected,CONFIG_LIBBPF_DYNAMIC) else dummy := $(error Error: No libbpf devel library found, please install libbpf-devel); endif diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 8c0d9f368ebc..b64bdc1a7026 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -145,7 +145,14 @@ perf-$(CONFIG_LIBELF) += symbol-elf.o perf-$(CONFIG_LIBELF) += probe-file.o perf-$(CONFIG_LIBELF) += probe-event.o +ifdef CONFIG_LIBBPF_DYNAMIC + hashmap := 1 +endif ifndef CONFIG_LIBBPF + hashmap := 1 +endif + +ifdef hashmap perf-y += hashmap.o endif -- cgit v1.2.3 From 0d943d5fde6070c2661a99618ea95b99655589ad Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:39:02 -0300 Subject: tools headers UAPI: Sync linux/kvm.h with the kernel sources To pick the changes in: 15fb7de1a7f5af0d ("KVM: SVM: Add KVM_SEV_RECEIVE_UPDATE_DATA command") 3bf725699bf62494 ("KVM: arm64: Add support for the KVM PTP service") 4cfdd47d6d95aca4 ("KVM: SVM: Add KVM_SEV SEND_START command") 54526d1fd59338fd ("KVM: x86: Support KVM VMs sharing SEV context") 5569e2e7a650dfff ("KVM: SVM: Add support for KVM_SEV_SEND_CANCEL command") 8b13c36493d8cb56 ("KVM: introduce KVM_CAP_SET_GUEST_DEBUG2") af43cbbf954b50ca ("KVM: SVM: Add support for KVM_SEV_RECEIVE_START command") d3d1af85e2c75bb5 ("KVM: SVM: Add KVM_SEND_UPDATE_DATA command") fe7e948837f312d8 ("KVM: x86: Add capability to grant VM access to privileged SGX attribute") That don't cause any change in tooling as it doesn't introduce any new ioctl. $ grep kvm tools/perf/trace/beauty/*.sh tools/perf/trace/beauty/kvm_ioctl.sh:printf "static const char *kvm_ioctl_cmds[] = {\n" tools/perf/trace/beauty/kvm_ioctl.sh:egrep $regex ${header_dir}/kvm.h | \ $ $ tools/perf/trace/beauty/kvm_ioctl.sh > before $ cp include/uapi/linux/kvm.h tools/include/uapi/linux/kvm.h $ tools/perf/trace/beauty/kvm_ioctl.sh > after $ diff -u before after $ This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Brijesh Singh Cc: Jianyong Wu Cc: Marc Zyngier Cc: Nathan Tempelman Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Steve Rutherford Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 45 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index f6afee209620..3fd9a7e9d90c 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1078,6 +1078,10 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING 192 #define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_PPC_DAWR1 194 +#define KVM_CAP_SET_GUEST_DEBUG2 195 +#define KVM_CAP_SGX_ATTRIBUTE 196 +#define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197 +#define KVM_CAP_PTP_KVM 198 #ifdef KVM_CAP_IRQ_ROUTING @@ -1671,6 +1675,8 @@ enum sev_cmd_id { KVM_SEV_CERT_EXPORT, /* Attestation report */ KVM_SEV_GET_ATTESTATION_REPORT, + /* Guest Migration Extension */ + KVM_SEV_SEND_CANCEL, KVM_SEV_NR_MAX, }; @@ -1729,6 +1735,45 @@ struct kvm_sev_attestation_report { __u32 len; }; +struct kvm_sev_send_start { + __u32 policy; + __u64 pdh_cert_uaddr; + __u32 pdh_cert_len; + __u64 plat_certs_uaddr; + __u32 plat_certs_len; + __u64 amd_certs_uaddr; + __u32 amd_certs_len; + __u64 session_uaddr; + __u32 session_len; +}; + +struct kvm_sev_send_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_uaddr; + __u32 guest_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_sev_receive_start { + __u32 handle; + __u32 policy; + __u64 pdh_uaddr; + __u32 pdh_len; + __u64 session_uaddr; + __u32 session_len; +}; + +struct kvm_sev_receive_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_uaddr; + __u32 guest_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) -- cgit v1.2.3 From b35629bc2fd59691504debda99c320cf966c8e3a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:45:25 -0300 Subject: tools headers kvm: Sync kvm headers with the kernel sources To pick the changes in: 3c0c2ad1ae75963c ("KVM: VMX: Add basic handling of VM-Exit from SGX enclave") None of them trigger any changes in tooling, this time this is just to silence these perf build warnings: Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/vmx.h' differs from latest version at 'arch/x86/include/uapi/asm/vmx.h' diff -u tools/arch/x86/include/uapi/asm/vmx.h arch/x86/include/uapi/asm/vmx.h Cc: Paolo Bonzini Cc: Sean Christopherson Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/vmx.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index b8e650a985e3..946d761adbd3 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -27,6 +27,7 @@ #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 +#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000 #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 -- cgit v1.2.3 From a00b7e39d6b56e6f49cdd51a9ebf92627a19d877 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 7 May 2021 17:54:35 +0900 Subject: perf tools: Fix a build error on arm64 with clang Since clang's -Wmissing-field-initializers warns if a data structure is initialized with a signle NULL as below, ---- tools/perf $ make CC=clang LLVM=1 ... arch/arm64/util/kvm-stat.c:74:9: error: missing field 'ops' initializer [-Werror,-Wmissing-field-initializers] { NULL }, ^ 1 error generated. ---- add another field initializer expressly as same as other arch's kvm-stat.c code. Signed-off-by: Masami Hiramatsu Cc: Anders Roxell Cc: Leo Yan Cc: Sergey Senozhatsky Link: http://lore.kernel.org/lkml/162037767540.94840.15758657049033010518.stgit@devnote2 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/kvm-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c index 2303256b7d05..73d18e0ed6f6 100644 --- a/tools/perf/arch/arm64/util/kvm-stat.c +++ b/tools/perf/arch/arm64/util/kvm-stat.c @@ -71,7 +71,7 @@ struct kvm_reg_events_ops kvm_reg_events_ops[] = { .name = "vmexit", .ops = &exit_events, }, - { NULL }, + { NULL, NULL }, }; const char * const kvm_skip_events[] = { -- cgit v1.2.3 From f8bcb061ea013a9b39a071b9dd9f6ea0aa2caf72 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:55:30 -0300 Subject: tools headers UAPI: Sync files changed by landlock, quotactl_path and mount_settattr new syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To pick the changes in these csets: a49f4f81cb48925e ("arch: Wire up Landlock syscalls") 2a1867219c7b27f9 ("fs: add mount_setattr()") fa8b90070a80bb1a ("quota: wire up quotactl_path") That silences these perf build warnings and add support for those new syscalls in tools such as 'perf trace'. For instance, this is now possible: # ~acme/bin/perf trace -v -e landlock* event qualifier tracepoint filter: (common_pid != 129365 && common_pid != 3502) && (id == 444 || id == 445 || id == 446) ^C# That is tha filter expression attached to the raw_syscalls:sys_{enter,exit} tracepoints. $ grep landlock tools/perf/arch/x86/entry/syscalls/syscall_64.tbl 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self $ This addresses these perf build warnings: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Warning: Kernel ABI header at 'tools/perf/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl' diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/s390/entry/syscalls/syscall.tbl' differs from latest version at 'arch/s390/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl' differs from latest version at 'arch/mips/kernel/syscalls/syscall_n64.tbl' diff -u tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl Cc: Christian Brauner Cc: James Morris Cc: Jan Kara Cc: Mickaël Salaün Cc: Sascha Hauer Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/unistd.h | 11 ++++++++++- tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 5 +++++ tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 4 ++++ tools/perf/arch/s390/entry/syscalls/syscall.tbl | 4 ++++ tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 4 ++++ 5 files changed, 27 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index ce58cff99b66..6de5a7fc066b 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -863,9 +863,18 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise) __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) #define __NR_mount_setattr 442 __SYSCALL(__NR_mount_setattr, sys_mount_setattr) +#define __NR_quotactl_path 443 +__SYSCALL(__NR_quotactl_path, sys_quotactl_path) + +#define __NR_landlock_create_ruleset 444 +__SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) +#define __NR_landlock_add_rule 445 +__SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) +#define __NR_landlock_restrict_self 446 +__SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) #undef __NR_syscalls -#define __NR_syscalls 443 +#define __NR_syscalls 447 /* * 32 bit systems traditionally used different diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index 91649690b52f..9974f5f8e49b 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -356,3 +356,8 @@ 439 n64 faccessat2 sys_faccessat2 440 n64 process_madvise sys_process_madvise 441 n64 epoll_pwait2 sys_epoll_pwait2 +442 n64 mount_setattr sys_mount_setattr +443 n64 quotactl_path sys_quotactl_path +444 n64 landlock_create_ruleset sys_landlock_create_ruleset +445 n64 landlock_add_rule sys_landlock_add_rule +446 n64 landlock_restrict_self sys_landlock_restrict_self diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 0b2480cf3e47..2e68fbb57cc6 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -522,3 +522,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr +443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 3abef2144dac..7e4a2aba366d 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -445,3 +445,7 @@ 440 common process_madvise sys_process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr sys_mount_setattr +443 common quotactl_path sys_quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 7bf01cbe582f..ecd551b08d05 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -364,6 +364,10 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr +443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self # # Due to a historical design error, certain syscalls are numbered differently -- cgit v1.2.3 From 5a80ee4219a52194f0e815bbceec40eb32c523ec Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 10:06:40 -0300 Subject: tools headers UAPI: Sync linux/prctl.h with the kernel sources To pick a new prctl introduced in: 201698626fbca1cf ("arm64: Introduce prctl(PR_PAC_{SET,GET}_ENABLED_KEYS)") That results in $ grep prctl tools/perf/trace/beauty/*.sh tools/perf/trace/beauty/prctl_option.sh:printf "static const char *prctl_options[] = {\n" tools/perf/trace/beauty/prctl_option.sh:egrep $regex ${header_dir}/prctl.h | grep -v PR_SET_PTRACER | \ tools/perf/trace/beauty/prctl_option.sh:printf "static const char *prctl_set_mm_options[] = {\n" tools/perf/trace/beauty/prctl_option.sh:egrep $regex ${header_dir}/prctl.h | \ tools/perf/trace/beauty/x86_arch_prctl.sh:prctl_arch_header=${x86_header_dir}/prctl.h tools/perf/trace/beauty/x86_arch_prctl.sh: printf "#define x86_arch_prctl_codes_%d_offset %s\n" $idx $first_entry tools/perf/trace/beauty/x86_arch_prctl.sh: printf "static const char *x86_arch_prctl_codes_%d[] = {\n" $idx tools/perf/trace/beauty/x86_arch_prctl.sh: egrep -q $regex ${prctl_arch_header} && \ tools/perf/trace/beauty/x86_arch_prctl.sh: (egrep $regex ${prctl_arch_header} | \ $ tools/perf/trace/beauty/prctl_option.sh > before $ cp include/uapi/linux/prctl.h tools/include/uapi/linux/prctl.h $ tools/perf/trace/beauty/prctl_option.sh > after $ diff -u before after --- before 2021-05-09 10:06:10.064559675 -0300 +++ after 2021-05-09 10:06:21.319791396 -0300 @@ -54,6 +54,8 @@ [57] = "SET_IO_FLUSHER", [58] = "GET_IO_FLUSHER", [59] = "SET_SYSCALL_USER_DISPATCH", + [60] = "PAC_SET_ENABLED_KEYS", + [61] = "PAC_GET_ENABLED_KEYS", }; static const char *prctl_set_mm_options[] = { [1] = "START_CODE", $ Now users can do: # perf trace -e syscalls:sys_enter_prctl --filter "option==PAC_GET_ENABLED_KEYS" ^C# # trace -v -e syscalls:sys_enter_prctl --filter "option==PAC_GET_ENABLED_KEYS" New filter for syscalls:sys_enter_prctl: (option==0x3d) && (common_pid != 5519 && common_pid != 3404) ^C# And also when prctl appears in a session, its options will be translated to the string. Cc: Catalin Marinas Cc: Peter Collingbourne Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/prctl.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 667f1aed091c..18a9f59dc067 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -255,4 +255,8 @@ struct prctl_mm_map { # define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 +/* Set/get enabled arm64 pointer authentication keys */ +#define PR_PAC_SET_ENABLED_KEYS 60 +#define PR_PAC_GET_ENABLED_KEYS 61 + #endif /* _LINUX_PRCTL_H */ -- cgit v1.2.3 From fb24e308b6310541e70d11a3f19dc40742974b95 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 10:19:37 -0300 Subject: tools arch: Update arch/x86/lib/mem{cpy,set}_64.S copies used in 'perf bench mem memcpy' To bring in the change made in this cset: 5e21a3ecad1500e3 ("x86/alternative: Merge include files") This just silences these perf tools build warnings, no change in the tools: Warning: Kernel ABI header at 'tools/arch/x86/lib/memcpy_64.S' differs from latest version at 'arch/x86/lib/memcpy_64.S' diff -u tools/arch/x86/lib/memcpy_64.S arch/x86/lib/memcpy_64.S Warning: Kernel ABI header at 'tools/arch/x86/lib/memset_64.S' differs from latest version at 'arch/x86/lib/memset_64.S' diff -u tools/arch/x86/lib/memset_64.S arch/x86/lib/memset_64.S Cc: Borislav Petkov Cc: Juergen Gross Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/lib/memcpy_64.S | 2 +- tools/arch/x86/lib/memset_64.S | 2 +- tools/include/asm/alternative-asm.h | 10 ---------- tools/include/asm/alternative.h | 10 ++++++++++ 4 files changed, 12 insertions(+), 12 deletions(-) delete mode 100644 tools/include/asm/alternative-asm.h create mode 100644 tools/include/asm/alternative.h (limited to 'tools') diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S index 1e299ac73c86..1cc9da6e29c7 100644 --- a/tools/arch/x86/lib/memcpy_64.S +++ b/tools/arch/x86/lib/memcpy_64.S @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include .pushsection .noinstr.text, "ax" diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S index 0bfd26e4ca9e..9827ae267f96 100644 --- a/tools/arch/x86/lib/memset_64.S +++ b/tools/arch/x86/lib/memset_64.S @@ -3,7 +3,7 @@ #include #include -#include +#include #include /* diff --git a/tools/include/asm/alternative-asm.h b/tools/include/asm/alternative-asm.h deleted file mode 100644 index b54bd860dff6..000000000000 --- a/tools/include/asm/alternative-asm.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _TOOLS_ASM_ALTERNATIVE_ASM_H -#define _TOOLS_ASM_ALTERNATIVE_ASM_H - -/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ - -#define altinstruction_entry # -#define ALTERNATIVE_2 # - -#endif diff --git a/tools/include/asm/alternative.h b/tools/include/asm/alternative.h new file mode 100644 index 000000000000..b54bd860dff6 --- /dev/null +++ b/tools/include/asm/alternative.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_ASM_ALTERNATIVE_ASM_H +#define _TOOLS_ASM_ALTERNATIVE_ASM_H + +/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ + +#define altinstruction_entry # +#define ALTERNATIVE_2 # + +#endif -- cgit v1.2.3 From 3916329309eace19e8c32bc821064a119474c309 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 10:21:33 -0300 Subject: tools include UAPI powerpc: Sync errno.h with the kernel headers To pick the change in: 7de21e679e6a789f ("powerpc: fix EDEADLOCK redefinition error in uapi/asm/errno.h") That will make the errno number -> string tables to pick this change on powerpc. Silencing this perf build warning: Warning: Kernel ABI header at 'tools/arch/powerpc/include/uapi/asm/errno.h' differs from latest version at 'arch/powerpc/include/uapi/asm/errno.h' diff -u tools/arch/powerpc/include/uapi/asm/errno.h arch/powerpc/include/uapi/asm/errno.h Cc: Michael Ellerman Cc: Tony Ambardar Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/powerpc/include/uapi/asm/errno.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/arch/powerpc/include/uapi/asm/errno.h b/tools/arch/powerpc/include/uapi/asm/errno.h index cc79856896a1..4ba87de32be0 100644 --- a/tools/arch/powerpc/include/uapi/asm/errno.h +++ b/tools/arch/powerpc/include/uapi/asm/errno.h @@ -2,6 +2,7 @@ #ifndef _ASM_POWERPC_ERRNO_H #define _ASM_POWERPC_ERRNO_H +#undef EDEADLOCK #include #undef EDEADLOCK -- cgit v1.2.3 From 6faf64f5248166ecaf50107e883c383e0b66bb70 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 10:24:29 -0300 Subject: tools headers cpufeatures: Sync with the kernel sources To pick the changes from: 4e6292114c741221 ("x86/paravirt: Add new features for paravirt patching") a161545ab53b174c ("x86/cpufeatures: Enumerate Intel Hybrid Technology feature bit") a89dfde3dc3c2dbf ("x86: Remove dynamic NOP selection") b8921dccf3b25798 ("x86/cpufeatures: Add SGX1 and SGX2 sub-features") f21d4d3b97a86035 ("x86/cpufeatures: Enumerate #DB for bus lock detection") f333374e108e7e4c ("x86/cpufeatures: Add the Virtual SPEC_CTRL feature") This only causes these perf files to be rebuilt: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Babu Moger Cc: Borislav Petkov Cc: Fenghua Yu Cc: Juergen Gross Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ricardo Neri Cc: Sean Christopherson Cc: Thomas Gleixner Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index cc96e26d69f7..ac37830ae941 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -84,7 +84,7 @@ /* CPU types for specific tunings: */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ @@ -236,6 +236,8 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ +#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -290,6 +292,8 @@ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -336,6 +340,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ @@ -354,6 +359,7 @@ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ @@ -374,6 +380,7 @@ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ +#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */ #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ -- cgit v1.2.3 From 71d7924b3e8acaca6a3b0fc3261170031ada3b70 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 10:29:10 -0300 Subject: tools headers UAPI: Sync perf_event.h with the kernel sources To pick up the changes in: 2b26f0aa004995f4 ("perf: Support only inheriting events if cloned with CLONE_THREAD") 2e498d0a74e5b88a ("perf: Add support for event removal on exec") 547b60988e631f74 ("perf: aux: Add flags for the buffer format") 55bcf6ef314ae8ba ("perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE") 7dde51767ca5339e ("perf: aux: Add CoreSight PMU buffer formats") 97ba62b278674293 ("perf: Add support for SIGTRAP on perf events") d0d1dd628527c77d ("perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event") Also change the expected sizeof(struct perf_event_attr) from 120 to 128 due to fields being added for the SIGTRAP changes. Addressing this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Cc: Kan Liang Cc: Marco Elver Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki K Poulose Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 26 +++++++++++++++++++++----- tools/perf/tests/attr/base-record | 2 +- tools/perf/tests/attr/base-stat | 2 +- tools/perf/tests/attr/system-wide-dummy | 2 +- 4 files changed, 24 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 14332f4cf816..bf8143505c49 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -127,6 +127,7 @@ enum perf_sw_ids { PERF_COUNT_SW_EMULATION_FAULTS = 8, PERF_COUNT_SW_DUMMY = 9, PERF_COUNT_SW_BPF_OUTPUT = 10, + PERF_COUNT_SW_CGROUP_SWITCHES = 11, PERF_COUNT_SW_MAX, /* non-ABI */ }; @@ -326,6 +327,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ #define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ #define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ +#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -404,7 +406,10 @@ struct perf_event_attr { cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ build_id : 1, /* use build id in mmap2 events */ - __reserved_1 : 29; + inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ + remove_on_exec : 1, /* event is removed from task on exec */ + sigtrap : 1, /* send synchronous SIGTRAP on event */ + __reserved_1 : 26; union { __u32 wakeup_events; /* wakeup every n events */ @@ -456,6 +461,12 @@ struct perf_event_attr { __u16 __reserved_2; __u32 aux_sample_size; __u32 __reserved_3; + + /* + * User provided data if sigtrap=1, passed back to user via + * siginfo_t::si_perf, e.g. to permit user to identify the event. + */ + __u64 sig_data; }; /* @@ -1171,10 +1182,15 @@ enum perf_callchain_context { /** * PERF_RECORD_AUX::flags bits */ -#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ -#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ -#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ -#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ +#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ + +/* CoreSight PMU AUX buffer formats */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record index 645009c08b3c..4a7b8deef3fd 100644 --- a/tools/perf/tests/attr/base-record +++ b/tools/perf/tests/attr/base-record @@ -5,7 +5,7 @@ group_fd=-1 flags=0|8 cpu=* type=0|1 -size=120 +size=128 config=0 sample_period=* sample_type=263 diff --git a/tools/perf/tests/attr/base-stat b/tools/perf/tests/attr/base-stat index b0f42c34882e..408164456530 100644 --- a/tools/perf/tests/attr/base-stat +++ b/tools/perf/tests/attr/base-stat @@ -5,7 +5,7 @@ group_fd=-1 flags=0|8 cpu=* type=0 -size=120 +size=128 config=0 sample_period=0 sample_type=65536 diff --git a/tools/perf/tests/attr/system-wide-dummy b/tools/perf/tests/attr/system-wide-dummy index eba723cc0d38..86a15dd359d9 100644 --- a/tools/perf/tests/attr/system-wide-dummy +++ b/tools/perf/tests/attr/system-wide-dummy @@ -7,7 +7,7 @@ cpu=* pid=-1 flags=8 type=1 -size=120 +size=128 config=9 sample_period=4000 sample_type=455 -- cgit v1.2.3 From 67e7ec0bd4535fc6e6d3f5d174f80e10a8a80c6e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 8 May 2021 12:22:12 -0300 Subject: libbpf: Provide GELF_ST_VISIBILITY() define for older libelf Where that macro isn't available. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/YJaspEh0qZr4LYOc@kernel.org --- tools/lib/bpf/libbpf_internal.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index ee426226928f..acbcf6c7bdf8 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -41,6 +41,11 @@ #define ELF_C_READ_MMAP ELF_C_READ #endif +/* Older libelf all end up in this expression, for both 32 and 64 bit */ +#ifndef GELF_ST_VISIBILITY +#define GELF_ST_VISIBILITY(o) ((o) & 0x03) +#endif + #define BTF_INFO_ENC(kind, kind_flag, vlen) \ ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) #define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type) -- cgit v1.2.3 From 096eccdef0b32f47e9354231ddc3aaaf9527d51c Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Wed, 5 May 2021 08:59:25 +0000 Subject: selftests/bpf: Rewrite test_tc_redirect.sh as prog_tests/tc_redirect.c As discussed in [0], this ports test_tc_redirect.sh to the test_progs framework and removes the old test. This makes it more in line with rest of the tests and makes it possible to run this test case with vmtest.sh and under the bpf CI. The upcoming skb_change_head() helper fix in [0] is depending on it and extending the test case to redirect a packet from L3 device to veth. [0] https://lore.kernel.org/bpf/20210427135550.807355-1-joamaki@gmail.com Signed-off-by: Jussi Maki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210505085925.783985-1-joamaki@gmail.com --- tools/testing/selftests/bpf/network_helpers.c | 2 +- tools/testing/selftests/bpf/network_helpers.h | 1 + .../testing/selftests/bpf/prog_tests/tc_redirect.c | 589 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/test_tc_neigh.c | 33 +- .../selftests/bpf/progs/test_tc_neigh_fib.c | 9 +- tools/testing/selftests/bpf/progs/test_tc_peer.c | 33 +- tools/testing/selftests/bpf/test_tc_redirect.sh | 216 -------- 7 files changed, 617 insertions(+), 266 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_redirect.c delete mode 100755 tools/testing/selftests/bpf/test_tc_redirect.sh (limited to 'tools') diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 12ee40284da0..2060bc122c53 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -40,7 +40,7 @@ struct ipv6_packet pkt_v6 = { .tcp.doff = 5, }; -static int settimeo(int fd, int timeout_ms) +int settimeo(int fd, int timeout_ms) { struct timeval timeout = { .tv_sec = 3 }; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 7205f8afdba1..5e0d51c07b63 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -33,6 +33,7 @@ struct ipv6_packet { } __packed; extern struct ipv6_packet pkt_v6; +int settimeo(int fd, int timeout_ms); int start_server(int family, int type, const char *addr, __u16 port, int timeout_ms); int connect_to_fd(int server_fd, int timeout_ms); diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c new file mode 100644 index 000000000000..95ef9fcd31d8 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -0,0 +1,589 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* + * This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link + * between src and dst. The netns fwd has veth links to each src and dst. The + * client is in src and server in dst. The test installs a TC BPF program to each + * host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the + * neigh addr population and redirect or ii) bpf_redirect_peer() for namespace + * switch from ingress side; it also installs a checker prog on the egress side + * to drop unexpected traffic. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_progs.h" +#include "network_helpers.h" +#include "test_tc_neigh_fib.skel.h" +#include "test_tc_neigh.skel.h" +#include "test_tc_peer.skel.h" + +#define NS_SRC "ns_src" +#define NS_FWD "ns_fwd" +#define NS_DST "ns_dst" + +#define IP4_SRC "172.16.1.100" +#define IP4_DST "172.16.2.100" +#define IP4_PORT 9004 + +#define IP6_SRC "::1:dead:beef:cafe" +#define IP6_DST "::2:dead:beef:cafe" +#define IP6_PORT 9006 + +#define IP4_SLL "169.254.0.1" +#define IP4_DLL "169.254.0.2" +#define IP4_NET "169.254.0.0" + +#define IFADDR_STR_LEN 18 +#define PING_ARGS "-c 3 -w 10 -q" + +#define SRC_PROG_PIN_FILE "/sys/fs/bpf/test_tc_src" +#define DST_PROG_PIN_FILE "/sys/fs/bpf/test_tc_dst" +#define CHK_PROG_PIN_FILE "/sys/fs/bpf/test_tc_chk" + +#define TIMEOUT_MILLIS 10000 + +#define MAX_PROC_MODS 128 +#define MAX_PROC_VALUE_LEN 16 + +#define log_err(MSG, ...) \ + fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \ + __FILE__, __LINE__, strerror(errno), ##__VA_ARGS__) + +struct proc_mod { + char path[PATH_MAX]; + char oldval[MAX_PROC_VALUE_LEN]; + int oldlen; +}; + +static const char * const namespaces[] = {NS_SRC, NS_FWD, NS_DST, NULL}; +static int root_netns_fd = -1; +static int num_proc_mods; +static struct proc_mod proc_mods[MAX_PROC_MODS]; + +/** + * modify_proc() - Modify entry in /proc + * + * Modifies an entry in /proc and saves the original value for later + * restoration with restore_proc(). + */ +static int modify_proc(const char *path, const char *newval) +{ + struct proc_mod *mod; + FILE *f; + + if (num_proc_mods + 1 > MAX_PROC_MODS) + return -1; + + f = fopen(path, "r+"); + if (!f) + return -1; + + mod = &proc_mods[num_proc_mods]; + num_proc_mods++; + + strncpy(mod->path, path, PATH_MAX); + + if (!fread(mod->oldval, 1, MAX_PROC_VALUE_LEN, f)) { + log_err("reading from %s failed", path); + goto fail; + } + rewind(f); + if (fwrite(newval, strlen(newval), 1, f) != 1) { + log_err("writing to %s failed", path); + goto fail; + } + + fclose(f); + return 0; + +fail: + fclose(f); + num_proc_mods--; + return -1; +} + +/** + * restore_proc() - Restore all /proc modifications + */ +static void restore_proc(void) +{ + int i; + + for (i = 0; i < num_proc_mods; i++) { + struct proc_mod *mod = &proc_mods[i]; + FILE *f; + + f = fopen(mod->path, "w"); + if (!f) { + log_err("fopen of %s failed", mod->path); + continue; + } + + if (fwrite(mod->oldval, mod->oldlen, 1, f) != 1) + log_err("fwrite to %s failed", mod->path); + + fclose(f); + } + num_proc_mods = 0; +} + +/** + * setns_by_name() - Set networks namespace by name + */ +static int setns_by_name(const char *name) +{ + int nsfd; + char nspath[PATH_MAX]; + int err; + + snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name); + nsfd = open(nspath, O_RDONLY | O_CLOEXEC); + if (nsfd < 0) + return nsfd; + + err = setns(nsfd, CLONE_NEWNET); + close(nsfd); + + return err; +} + +/** + * setns_root() - Set network namespace to original (root) namespace + * + * Not expected to ever fail, so error not returned, but failure logged + * and test marked as failed. + */ +static void setns_root(void) +{ + ASSERT_OK(setns(root_netns_fd, CLONE_NEWNET), "setns root"); +} + +static int netns_setup_namespaces(const char *verb) +{ + const char * const *ns = namespaces; + char cmd[128]; + + while (*ns) { + snprintf(cmd, sizeof(cmd), "ip netns %s %s", verb, *ns); + if (!ASSERT_OK(system(cmd), cmd)) + return -1; + ns++; + } + return 0; +} + +struct netns_setup_result { + int ifindex_veth_src_fwd; + int ifindex_veth_dst_fwd; +}; + +static int get_ifaddr(const char *name, char *ifaddr) +{ + char path[PATH_MAX]; + FILE *f; + int ret; + + snprintf(path, PATH_MAX, "/sys/class/net/%s/address", name); + f = fopen(path, "r"); + if (!ASSERT_OK_PTR(f, path)) + return -1; + + ret = fread(ifaddr, 1, IFADDR_STR_LEN, f); + if (!ASSERT_EQ(ret, IFADDR_STR_LEN, "fread ifaddr")) { + fclose(f); + return -1; + } + fclose(f); + return 0; +} + +static int get_ifindex(const char *name) +{ + char path[PATH_MAX]; + char buf[32]; + FILE *f; + int ret; + + snprintf(path, PATH_MAX, "/sys/class/net/%s/ifindex", name); + f = fopen(path, "r"); + if (!ASSERT_OK_PTR(f, path)) + return -1; + + ret = fread(buf, 1, sizeof(buf), f); + if (!ASSERT_GT(ret, 0, "fread ifindex")) { + fclose(f); + return -1; + } + fclose(f); + return atoi(buf); +} + +#define SYS(fmt, ...) \ + ({ \ + char cmd[1024]; \ + snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \ + if (!ASSERT_OK(system(cmd), cmd)) \ + goto fail; \ + }) + +static int netns_setup_links_and_routes(struct netns_setup_result *result) +{ + char veth_src_fwd_addr[IFADDR_STR_LEN+1] = {}; + char veth_dst_fwd_addr[IFADDR_STR_LEN+1] = {}; + + SYS("ip link add veth_src type veth peer name veth_src_fwd"); + SYS("ip link add veth_dst type veth peer name veth_dst_fwd"); + if (get_ifaddr("veth_src_fwd", veth_src_fwd_addr)) + goto fail; + if (get_ifaddr("veth_dst_fwd", veth_dst_fwd_addr)) + goto fail; + + result->ifindex_veth_src_fwd = get_ifindex("veth_src_fwd"); + if (result->ifindex_veth_src_fwd < 0) + goto fail; + result->ifindex_veth_dst_fwd = get_ifindex("veth_dst_fwd"); + if (result->ifindex_veth_dst_fwd < 0) + goto fail; + + SYS("ip link set veth_src netns " NS_SRC); + SYS("ip link set veth_src_fwd netns " NS_FWD); + SYS("ip link set veth_dst_fwd netns " NS_FWD); + SYS("ip link set veth_dst netns " NS_DST); + + /** setup in 'src' namespace */ + if (!ASSERT_OK(setns_by_name(NS_SRC), "setns src")) + goto fail; + + SYS("ip addr add " IP4_SRC "/32 dev veth_src"); + SYS("ip addr add " IP6_SRC "/128 dev veth_src nodad"); + SYS("ip link set dev veth_src up"); + + SYS("ip route add " IP4_DST "/32 dev veth_src scope global"); + SYS("ip route add " IP4_NET "/16 dev veth_src scope global"); + SYS("ip route add " IP6_DST "/128 dev veth_src scope global"); + + SYS("ip neigh add " IP4_DST " dev veth_src lladdr %s", + veth_src_fwd_addr); + SYS("ip neigh add " IP6_DST " dev veth_src lladdr %s", + veth_src_fwd_addr); + + /** setup in 'fwd' namespace */ + if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd")) + goto fail; + + /* The fwd netns automatically gets a v6 LL address / routes, but also + * needs v4 one in order to start ARP probing. IP4_NET route is added + * to the endpoints so that the ARP processing will reply. + */ + SYS("ip addr add " IP4_SLL "/32 dev veth_src_fwd"); + SYS("ip addr add " IP4_DLL "/32 dev veth_dst_fwd"); + SYS("ip link set dev veth_src_fwd up"); + SYS("ip link set dev veth_dst_fwd up"); + + SYS("ip route add " IP4_SRC "/32 dev veth_src_fwd scope global"); + SYS("ip route add " IP6_SRC "/128 dev veth_src_fwd scope global"); + SYS("ip route add " IP4_DST "/32 dev veth_dst_fwd scope global"); + SYS("ip route add " IP6_DST "/128 dev veth_dst_fwd scope global"); + + /** setup in 'dst' namespace */ + if (!ASSERT_OK(setns_by_name(NS_DST), "setns dst")) + goto fail; + + SYS("ip addr add " IP4_DST "/32 dev veth_dst"); + SYS("ip addr add " IP6_DST "/128 dev veth_dst nodad"); + SYS("ip link set dev veth_dst up"); + + SYS("ip route add " IP4_SRC "/32 dev veth_dst scope global"); + SYS("ip route add " IP4_NET "/16 dev veth_dst scope global"); + SYS("ip route add " IP6_SRC "/128 dev veth_dst scope global"); + + SYS("ip neigh add " IP4_SRC " dev veth_dst lladdr %s", + veth_dst_fwd_addr); + SYS("ip neigh add " IP6_SRC " dev veth_dst lladdr %s", + veth_dst_fwd_addr); + + setns_root(); + return 0; +fail: + setns_root(); + return -1; +} + +static int netns_load_bpf(void) +{ + if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd")) + return -1; + + SYS("tc qdisc add dev veth_src_fwd clsact"); + SYS("tc filter add dev veth_src_fwd ingress bpf da object-pinned " + SRC_PROG_PIN_FILE); + SYS("tc filter add dev veth_src_fwd egress bpf da object-pinned " + CHK_PROG_PIN_FILE); + + SYS("tc qdisc add dev veth_dst_fwd clsact"); + SYS("tc filter add dev veth_dst_fwd ingress bpf da object-pinned " + DST_PROG_PIN_FILE); + SYS("tc filter add dev veth_dst_fwd egress bpf da object-pinned " + CHK_PROG_PIN_FILE); + + setns_root(); + return -1; +fail: + setns_root(); + return -1; +} + +static int netns_unload_bpf(void) +{ + if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd")) + goto fail; + SYS("tc qdisc delete dev veth_src_fwd clsact"); + SYS("tc qdisc delete dev veth_dst_fwd clsact"); + + setns_root(); + return 0; +fail: + setns_root(); + return -1; +} + + +static void test_tcp(int family, const char *addr, __u16 port) +{ + int listen_fd = -1, accept_fd = -1, client_fd = -1; + char buf[] = "testing testing"; + int n; + + if (!ASSERT_OK(setns_by_name(NS_DST), "setns dst")) + return; + + listen_fd = start_server(family, SOCK_STREAM, addr, port, 0); + if (!ASSERT_GE(listen_fd, 0, "listen")) + goto done; + + if (!ASSERT_OK(setns_by_name(NS_SRC), "setns src")) + goto done; + + client_fd = connect_to_fd(listen_fd, TIMEOUT_MILLIS); + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) + goto done; + + accept_fd = accept(listen_fd, NULL, NULL); + if (!ASSERT_GE(accept_fd, 0, "accept")) + goto done; + + if (!ASSERT_OK(settimeo(accept_fd, TIMEOUT_MILLIS), "settimeo")) + goto done; + + n = write(client_fd, buf, sizeof(buf)); + if (!ASSERT_EQ(n, sizeof(buf), "send to server")) + goto done; + + n = read(accept_fd, buf, sizeof(buf)); + ASSERT_EQ(n, sizeof(buf), "recv from server"); + +done: + setns_root(); + if (listen_fd >= 0) + close(listen_fd); + if (accept_fd >= 0) + close(accept_fd); + if (client_fd >= 0) + close(client_fd); +} + +static int test_ping(int family, const char *addr) +{ + const char *ping = family == AF_INET6 ? "ping6" : "ping"; + + SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s", ping, addr); + return 0; +fail: + return -1; +} + +static void test_connectivity(void) +{ + test_tcp(AF_INET, IP4_DST, IP4_PORT); + test_ping(AF_INET, IP4_DST); + test_tcp(AF_INET6, IP6_DST, IP6_PORT); + test_ping(AF_INET6, IP6_DST); +} + +static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result) +{ + struct test_tc_neigh_fib *skel; + int err; + + skel = test_tc_neigh_fib__open(); + if (!ASSERT_OK_PTR(skel, "test_tc_neigh_fib__open")) + return; + + if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load")) { + test_tc_neigh_fib__destroy(skel); + return; + } + + err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE)) + goto done; + + err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE)) + goto done; + + err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE)) + goto done; + + if (netns_load_bpf()) + goto done; + + /* bpf_fib_lookup() checks if forwarding is enabled */ + if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd")) + goto done; + + err = modify_proc("/proc/sys/net/ipv4/ip_forward", "1"); + if (!ASSERT_OK(err, "set ipv4.ip_forward")) + goto done; + + err = modify_proc("/proc/sys/net/ipv6/conf/all/forwarding", "1"); + if (!ASSERT_OK(err, "set ipv6.forwarding")) + goto done; + setns_root(); + + test_connectivity(); +done: + bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE); + bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); + bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE); + test_tc_neigh_fib__destroy(skel); + netns_unload_bpf(); + setns_root(); + restore_proc(); +} + +static void test_tc_redirect_neigh(struct netns_setup_result *setup_result) +{ + struct test_tc_neigh *skel; + int err; + + skel = test_tc_neigh__open(); + if (!ASSERT_OK_PTR(skel, "test_tc_neigh__open")) + return; + + skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; + skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; + + err = test_tc_neigh__load(skel); + if (!ASSERT_OK(err, "test_tc_neigh__load")) { + test_tc_neigh__destroy(skel); + return; + } + + err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE)) + goto done; + + err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE)) + goto done; + + err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE)) + goto done; + + if (netns_load_bpf()) + goto done; + + test_connectivity(); + +done: + bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE); + bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); + bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE); + test_tc_neigh__destroy(skel); + netns_unload_bpf(); + setns_root(); +} + +static void test_tc_redirect_peer(struct netns_setup_result *setup_result) +{ + struct test_tc_peer *skel; + int err; + + skel = test_tc_peer__open(); + if (!ASSERT_OK_PTR(skel, "test_tc_peer__open")) + return; + + skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; + skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; + + err = test_tc_peer__load(skel); + if (!ASSERT_OK(err, "test_tc_peer__load")) { + test_tc_peer__destroy(skel); + return; + } + + err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE)) + goto done; + + err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE)) + goto done; + + err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE)) + goto done; + + if (netns_load_bpf()) + goto done; + + test_connectivity(); + +done: + bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE); + bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); + bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE); + test_tc_peer__destroy(skel); + netns_unload_bpf(); + setns_root(); +} + +void test_tc_redirect(void) +{ + struct netns_setup_result setup_result; + + root_netns_fd = open("/proc/self/ns/net", O_RDONLY); + if (!ASSERT_GE(root_netns_fd, 0, "open /proc/self/ns/net")) + return; + + if (netns_setup_namespaces("add")) + goto done; + + if (netns_setup_links_and_routes(&setup_result)) + goto done; + + if (test__start_subtest("tc_redirect_peer")) + test_tc_redirect_peer(&setup_result); + + if (test__start_subtest("tc_redirect_neigh")) + test_tc_redirect_neigh(&setup_result); + + if (test__start_subtest("tc_redirect_neigh_fib")) + test_tc_redirect_neigh_fib(&setup_result); + +done: + close(root_netns_fd); + netns_setup_namespaces("delete"); +} diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh.c b/tools/testing/selftests/bpf/progs/test_tc_neigh.c index b985ac4e7a81..90f64a85998f 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_neigh.c +++ b/tools/testing/selftests/bpf/progs/test_tc_neigh.c @@ -33,17 +33,8 @@ a.s6_addr32[3] == b.s6_addr32[3]) #endif -enum { - dev_src, - dev_dst, -}; - -struct bpf_map_def SEC("maps") ifindex_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 2, -}; +static volatile const __u32 IFINDEX_SRC; +static volatile const __u32 IFINDEX_DST; static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb, __be32 addr) @@ -79,14 +70,8 @@ static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb, return v6_equal(ip6h->daddr, addr); } -static __always_inline int get_dev_ifindex(int which) -{ - int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); - - return ifindex ? *ifindex : 0; -} - -SEC("chk_egress") int tc_chk(struct __sk_buff *skb) +SEC("classifier/chk_egress") +int tc_chk(struct __sk_buff *skb) { void *data_end = ctx_ptr(skb->data_end); void *data = ctx_ptr(skb->data); @@ -98,7 +83,8 @@ SEC("chk_egress") int tc_chk(struct __sk_buff *skb) return !raw[0] && !raw[1] && !raw[2] ? TC_ACT_SHOT : TC_ACT_OK; } -SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) +SEC("classifier/dst_ingress") +int tc_dst(struct __sk_buff *skb) { __u8 zero[ETH_ALEN * 2]; bool redirect = false; @@ -119,10 +105,11 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(get_dev_ifindex(dev_src), NULL, 0, 0); + return bpf_redirect_neigh(IFINDEX_SRC, NULL, 0, 0); } -SEC("src_ingress") int tc_src(struct __sk_buff *skb) +SEC("classifier/src_ingress") +int tc_src(struct __sk_buff *skb) { __u8 zero[ETH_ALEN * 2]; bool redirect = false; @@ -143,7 +130,7 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb) if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) return TC_ACT_SHOT; - return bpf_redirect_neigh(get_dev_ifindex(dev_dst), NULL, 0, 0); + return bpf_redirect_neigh(IFINDEX_DST, NULL, 0, 0); } char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c b/tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c index d82ed3457030..f7ab69cf018e 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c +++ b/tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c @@ -75,7 +75,8 @@ static __always_inline int fill_fib_params_v6(struct __sk_buff *skb, return 0; } -SEC("chk_egress") int tc_chk(struct __sk_buff *skb) +SEC("classifier/chk_egress") +int tc_chk(struct __sk_buff *skb) { void *data_end = ctx_ptr(skb->data_end); void *data = ctx_ptr(skb->data); @@ -142,12 +143,14 @@ static __always_inline int tc_redir(struct __sk_buff *skb) /* these are identical, but keep them separate for compatibility with the * section names expected by test_tc_redirect.sh */ -SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) +SEC("classifier/dst_ingress") +int tc_dst(struct __sk_buff *skb) { return tc_redir(skb); } -SEC("src_ingress") int tc_src(struct __sk_buff *skb) +SEC("classifier/src_ingress") +int tc_src(struct __sk_buff *skb) { return tc_redir(skb); } diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c index fc84a7685aa2..72c72950c3bb 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_peer.c +++ b/tools/testing/selftests/bpf/progs/test_tc_peer.c @@ -8,38 +8,25 @@ #include -enum { - dev_src, - dev_dst, -}; +static volatile const __u32 IFINDEX_SRC; +static volatile const __u32 IFINDEX_DST; -struct bpf_map_def SEC("maps") ifindex_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 2, -}; - -static __always_inline int get_dev_ifindex(int which) -{ - int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which); - - return ifindex ? *ifindex : 0; -} - -SEC("chk_egress") int tc_chk(struct __sk_buff *skb) +SEC("classifier/chk_egress") +int tc_chk(struct __sk_buff *skb) { return TC_ACT_SHOT; } -SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) +SEC("classifier/dst_ingress") +int tc_dst(struct __sk_buff *skb) { - return bpf_redirect_peer(get_dev_ifindex(dev_src), 0); + return bpf_redirect_peer(IFINDEX_SRC, 0); } -SEC("src_ingress") int tc_src(struct __sk_buff *skb) +SEC("classifier/src_ingress") +int tc_src(struct __sk_buff *skb) { - return bpf_redirect_peer(get_dev_ifindex(dev_dst), 0); + return bpf_redirect_peer(IFINDEX_DST, 0); } char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tc_redirect.sh b/tools/testing/selftests/bpf/test_tc_redirect.sh deleted file mode 100755 index 8868aa1ca902..000000000000 --- a/tools/testing/selftests/bpf/test_tc_redirect.sh +++ /dev/null @@ -1,216 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link -# between src and dst. The netns fwd has veth links to each src and dst. The -# client is in src and server in dst. The test installs a TC BPF program to each -# host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the -# neigh addr population and redirect or ii) bpf_redirect_peer() for namespace -# switch from ingress side; it also installs a checker prog on the egress side -# to drop unexpected traffic. - -if [[ $EUID -ne 0 ]]; then - echo "This script must be run as root" - echo "FAIL" - exit 1 -fi - -# check that needed tools are present -command -v nc >/dev/null 2>&1 || \ - { echo >&2 "nc is not available"; exit 1; } -command -v dd >/dev/null 2>&1 || \ - { echo >&2 "dd is not available"; exit 1; } -command -v timeout >/dev/null 2>&1 || \ - { echo >&2 "timeout is not available"; exit 1; } -command -v ping >/dev/null 2>&1 || \ - { echo >&2 "ping is not available"; exit 1; } -if command -v ping6 >/dev/null 2>&1; then PING6=ping6; else PING6=ping; fi -command -v perl >/dev/null 2>&1 || \ - { echo >&2 "perl is not available"; exit 1; } -command -v jq >/dev/null 2>&1 || \ - { echo >&2 "jq is not available"; exit 1; } -command -v bpftool >/dev/null 2>&1 || \ - { echo >&2 "bpftool is not available"; exit 1; } - -readonly GREEN='\033[0;92m' -readonly RED='\033[0;31m' -readonly NC='\033[0m' # No Color - -readonly PING_ARG="-c 3 -w 10 -q" - -readonly TIMEOUT=10 - -readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)" -readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)" -readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)" - -readonly IP4_SRC="172.16.1.100" -readonly IP4_DST="172.16.2.100" - -readonly IP6_SRC="::1:dead:beef:cafe" -readonly IP6_DST="::2:dead:beef:cafe" - -readonly IP4_SLL="169.254.0.1" -readonly IP4_DLL="169.254.0.2" -readonly IP4_NET="169.254.0.0" - -netns_cleanup() -{ - ip netns del ${NS_SRC} - ip netns del ${NS_FWD} - ip netns del ${NS_DST} -} - -netns_setup() -{ - ip netns add "${NS_SRC}" - ip netns add "${NS_FWD}" - ip netns add "${NS_DST}" - - ip link add veth_src type veth peer name veth_src_fwd - ip link add veth_dst type veth peer name veth_dst_fwd - - ip link set veth_src netns ${NS_SRC} - ip link set veth_src_fwd netns ${NS_FWD} - - ip link set veth_dst netns ${NS_DST} - ip link set veth_dst_fwd netns ${NS_FWD} - - ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src - ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst - - # The fwd netns automatically get a v6 LL address / routes, but also - # needs v4 one in order to start ARP probing. IP4_NET route is added - # to the endpoints so that the ARP processing will reply. - - ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd - ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd - - ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad - ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad - - ip -netns ${NS_SRC} link set dev veth_src up - ip -netns ${NS_FWD} link set dev veth_src_fwd up - - ip -netns ${NS_DST} link set dev veth_dst up - ip -netns ${NS_FWD} link set dev veth_dst_fwd up - - ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global - ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global - ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global - - ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global - ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global - - ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global - ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global - ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global - - ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global - ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global - - fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address) - fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address) - - ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src - ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst - - ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src - ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst -} - -netns_test_connectivity() -{ - set +e - - ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &" - ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &" - - TEST="TCPv4 connectivity test" - ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004" - if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 - fi - echo -e "${TEST}: ${GREEN}PASS${NC}" - - TEST="TCPv6 connectivity test" - ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006" - if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 - fi - echo -e "${TEST}: ${GREEN}PASS${NC}" - - TEST="ICMPv4 connectivity test" - ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST} - if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 - fi - echo -e "${TEST}: ${GREEN}PASS${NC}" - - TEST="ICMPv6 connectivity test" - ip netns exec ${NS_SRC} $PING6 $PING_ARG ${IP6_DST} - if [ $? -ne 0 ]; then - echo -e "${TEST}: ${RED}FAIL${NC}" - exit 1 - fi - echo -e "${TEST}: ${GREEN}PASS${NC}" - - set -e -} - -hex_mem_str() -{ - perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1 -} - -netns_setup_bpf() -{ - local obj=$1 - local use_forwarding=${2:-0} - - ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj $obj sec src_ingress - ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj $obj sec chk_egress - - ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj $obj sec dst_ingress - ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj $obj sec chk_egress - - if [ "$use_forwarding" -eq "1" ]; then - # bpf_fib_lookup() checks if forwarding is enabled - ip netns exec ${NS_FWD} sysctl -w net.ipv4.ip_forward=1 - ip netns exec ${NS_FWD} sysctl -w net.ipv6.conf.veth_dst_fwd.forwarding=1 - ip netns exec ${NS_FWD} sysctl -w net.ipv6.conf.veth_src_fwd.forwarding=1 - return 0 - fi - - veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex) - veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex) - - progs=$(ip netns exec ${NS_FWD} bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]') - for prog in $progs; do - map=$(bpftool prog show id $prog --json | jq -r '.map_ids | .? | .[]') - if [ ! -z "$map" ]; then - bpftool map update id $map key hex $(hex_mem_str 0) value hex $(hex_mem_str $veth_src) - bpftool map update id $map key hex $(hex_mem_str 1) value hex $(hex_mem_str $veth_dst) - fi - done -} - -trap netns_cleanup EXIT -set -e - -netns_setup -netns_setup_bpf test_tc_neigh.o -netns_test_connectivity -netns_cleanup -netns_setup -netns_setup_bpf test_tc_neigh_fib.o 1 -netns_test_connectivity -netns_cleanup -netns_setup -netns_setup_bpf test_tc_peer.o -netns_test_connectivity -- cgit v1.2.3 From 46c7405df7de8deb97229eacebcee96d61415f3f Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 12 May 2021 19:42:10 +0200 Subject: objtool: Fix elf_create_undef_symbol() endianness Currently x86 cross-compilation fails on big endian system with: x86_64-cross-ld: init/main.o: invalid string offset 488112128 >= 6229 for section `.strtab' Mark new ELF data in elf_create_undef_symbol() as symbol, so that libelf does endianness handling correctly. Fixes: 2f2f7e47f052 ("objtool: Add elf_create_undef_symbol()") Signed-off-by: Vasily Gorbik Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/patch-1.thread-6c9df9.git-d39264656387.your-ad-here.call-01620841104-ext-2554@work.hours --- tools/objtool/elf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index d08f5f3670f8..743c2e9d0f56 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -762,6 +762,7 @@ struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name) data->d_buf = &sym->sym; data->d_size = sizeof(sym->sym); data->d_align = 1; + data->d_type = ELF_T_SYM; sym->idx = symtab->len / sizeof(sym->sym); -- cgit v1.2.3 From f66c05d6baf36069c01a02f869bebb75586f2318 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 12 May 2021 19:42:13 +0200 Subject: objtool/x86: Fix elf_add_alternative() endianness Currently x86 kernel cross-compiled on big endian system fails at boot with: kernel BUG at arch/x86/kernel/alternative.c:258! Corresponding bug condition look like the following: BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); Fix that by converting alternative feature/cpuid to target endianness. Fixes: 9bc0bb50727c ("objtool/x86: Rewrite retpoline thunk calls") Signed-off-by: Vasily Gorbik Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/patch-2.thread-6c9df9.git-6c9df9a8098d.your-ad-here.call-01620841104-ext-2554@work.hours --- tools/objtool/arch/x86/decode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index cedf3ede7545..24295d39713b 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -19,6 +19,7 @@ #include #include #include +#include #include static int is_x86_64(const struct elf *elf) @@ -725,7 +726,7 @@ static int elf_add_alternative(struct elf *elf, return -1; } - alt->cpuid = cpuid; + alt->cpuid = bswap_if_needed(cpuid); alt->instrlen = orig_len; alt->replacementlen = repl_len; -- cgit v1.2.3 From 7ddb4cc2b885c740523e6ea54a1f4434acfa3368 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Tue, 20 Apr 2021 15:47:47 +0800 Subject: tools/testing/nvdimm: Make symbol '__nfit_test_ioremap' static The sparse tool complains as follows: tools/testing/nvdimm/test/iomap.c:65:14: warning: symbol '__nfit_test_ioremap' was not declared. Should it be static? This symbol is not used outside of iomap.c, so this commit marks it static. Reported-by: Hulk Robot Signed-off-by: Zou Wei Link: https://lore.kernel.org/r/1618904867-25275-1-git-send-email-zou_wei@huawei.com Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index c62d372d426f..ed563bdd88f3 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -62,7 +62,7 @@ struct nfit_test_resource *get_nfit_res(resource_size_t resource) } EXPORT_SYMBOL(get_nfit_res); -void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, +static void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, void __iomem *(*fallback_fn)(resource_size_t, unsigned long)) { struct nfit_test_resource *nfit_res = get_nfit_res(offset); -- cgit v1.2.3 From e9cfd259c6d386f6235395a13bd4f357a979b2d0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 7 May 2021 00:33:50 -0700 Subject: ACPI: NFIT: Fix support for variable 'SPA' structure size ACPI 6.4 introduced the "SpaLocationCookie" to the NFIT "System Physical Address (SPA) Range Structure". The presence of that new field is indicated by the ACPI_NFIT_LOCATION_COOKIE_VALID flag. Pre-ACPI-6.4 firmware implementations omit the flag and maintain the original size of the structure. Update the implementation to check that flag to determine the size rather than the ACPI 6.4 compliant definition of 'struct acpi_nfit_system_address' from the Linux ACPICA definitions. Update the test infrastructure for the new expectations as well, i.e. continue to emulate the ACPI 6.3 definition of that structure. Without this fix the kernel fails to validate 'SPA' structures and this leads to a crash in nfit_get_smbios_id() since that routine assumes that SPAs are valid if it finds valid SMBIOS tables. BUG: unable to handle page fault for address: ffffffffffffffa8 [..] Call Trace: skx_get_nvdimm_info+0x56/0x130 [skx_edac] skx_get_dimm_config+0x1f5/0x213 [skx_edac] skx_register_mci+0x132/0x1c0 [skx_edac] Cc: Bob Moore Cc: Erik Kaneda Fixes: cf16b05c607b ("ACPICA: ACPI 6.4: NFIT: add Location Cookie field") Reported-by: Yi Zhang Tested-by: Yi Zhang Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/162037273007.1195827.10907249070709169329.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 15 ++++++++++---- tools/testing/nvdimm/test/nfit.c | 42 ++++++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 21 deletions(-) (limited to 'tools') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 958aaac869e8..23d9a09d7060 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -686,6 +686,13 @@ int nfit_spa_type(struct acpi_nfit_system_address *spa) return -1; } +static size_t sizeof_spa(struct acpi_nfit_system_address *spa) +{ + if (spa->flags & ACPI_NFIT_LOCATION_COOKIE_VALID) + return sizeof(*spa); + return sizeof(*spa) - 8; +} + static bool add_spa(struct acpi_nfit_desc *acpi_desc, struct nfit_table_prev *prev, struct acpi_nfit_system_address *spa) @@ -693,22 +700,22 @@ static bool add_spa(struct acpi_nfit_desc *acpi_desc, struct device *dev = acpi_desc->dev; struct nfit_spa *nfit_spa; - if (spa->header.length != sizeof(*spa)) + if (spa->header.length != sizeof_spa(spa)) return false; list_for_each_entry(nfit_spa, &prev->spas, list) { - if (memcmp(nfit_spa->spa, spa, sizeof(*spa)) == 0) { + if (memcmp(nfit_spa->spa, spa, sizeof_spa(spa)) == 0) { list_move_tail(&nfit_spa->list, &acpi_desc->spas); return true; } } - nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa) + sizeof(*spa), + nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa) + sizeof_spa(spa), GFP_KERNEL); if (!nfit_spa) return false; INIT_LIST_HEAD(&nfit_spa->list); - memcpy(nfit_spa->spa, spa, sizeof(*spa)); + memcpy(nfit_spa->spa, spa, sizeof_spa(spa)); list_add_tail(&nfit_spa->list, &acpi_desc->spas); dev_dbg(dev, "spa index: %d type: %s\n", spa->range_index, diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 9b185bf82da8..54f367cbadae 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -1871,9 +1871,16 @@ static void smart_init(struct nfit_test *t) } } +static size_t sizeof_spa(struct acpi_nfit_system_address *spa) +{ + /* until spa location cookie support is added... */ + return sizeof(*spa) - 8; +} + static int nfit_test0_alloc(struct nfit_test *t) { - size_t nfit_size = sizeof(struct acpi_nfit_system_address) * NUM_SPA + struct acpi_nfit_system_address *spa = NULL; + size_t nfit_size = sizeof_spa(spa) * NUM_SPA + sizeof(struct acpi_nfit_memory_map) * NUM_MEM + sizeof(struct acpi_nfit_control_region) * NUM_DCR + offsetof(struct acpi_nfit_control_region, @@ -1937,7 +1944,8 @@ static int nfit_test0_alloc(struct nfit_test *t) static int nfit_test1_alloc(struct nfit_test *t) { - size_t nfit_size = sizeof(struct acpi_nfit_system_address) * 2 + struct acpi_nfit_system_address *spa = NULL; + size_t nfit_size = sizeof_spa(spa) * 2 + sizeof(struct acpi_nfit_memory_map) * 2 + offsetof(struct acpi_nfit_control_region, window_size) * 2; int i; @@ -2000,7 +2008,7 @@ static void nfit_test0_setup(struct nfit_test *t) */ spa = nfit_buf; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); spa->range_index = 0+1; spa->address = t->spa_set_dma[0]; @@ -2014,7 +2022,7 @@ static void nfit_test0_setup(struct nfit_test *t) */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); spa->range_index = 1+1; spa->address = t->spa_set_dma[1]; @@ -2024,7 +2032,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa2 (dcr0) dimm0 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); spa->range_index = 2+1; spa->address = t->dcr_dma[0]; @@ -2034,7 +2042,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa3 (dcr1) dimm1 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); spa->range_index = 3+1; spa->address = t->dcr_dma[1]; @@ -2044,7 +2052,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa4 (dcr2) dimm2 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); spa->range_index = 4+1; spa->address = t->dcr_dma[2]; @@ -2054,7 +2062,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa5 (dcr3) dimm3 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); spa->range_index = 5+1; spa->address = t->dcr_dma[3]; @@ -2064,7 +2072,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa6 (bdw for dcr0) dimm0 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); spa->range_index = 6+1; spa->address = t->dimm_dma[0]; @@ -2074,7 +2082,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa7 (bdw for dcr1) dimm1 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); spa->range_index = 7+1; spa->address = t->dimm_dma[1]; @@ -2084,7 +2092,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa8 (bdw for dcr2) dimm2 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); spa->range_index = 8+1; spa->address = t->dimm_dma[2]; @@ -2094,7 +2102,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa9 (bdw for dcr3) dimm3 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); spa->range_index = 9+1; spa->address = t->dimm_dma[3]; @@ -2581,7 +2589,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa10 (dcr4) dimm4 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); spa->range_index = 10+1; spa->address = t->dcr_dma[4]; @@ -2595,7 +2603,7 @@ static void nfit_test0_setup(struct nfit_test *t) */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); spa->range_index = 11+1; spa->address = t->spa_set_dma[2]; @@ -2605,7 +2613,7 @@ static void nfit_test0_setup(struct nfit_test *t) /* spa12 (bdw for dcr4) dimm4 */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); spa->range_index = 12+1; spa->address = t->dimm_dma[4]; @@ -2739,7 +2747,7 @@ static void nfit_test1_setup(struct nfit_test *t) /* spa0 (flat range with no bdw aliasing) */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); spa->range_index = 0+1; spa->address = t->spa_set_dma[0]; @@ -2749,7 +2757,7 @@ static void nfit_test1_setup(struct nfit_test *t) /* virtual cd region */ spa = nfit_buf + offset; spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; - spa->header.length = sizeof(*spa); + spa->header.length = sizeof_spa(spa); memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_VCD), 16); spa->range_index = 0; spa->address = t->spa_set_dma[1]; -- cgit v1.2.3 From c6de37dd5e48b883db032aa4dc0547a4858b9f20 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 Apr 2021 11:58:48 -0700 Subject: tools build: Fix quiet cmd indentation The tools quiet cmd output has mismatched indentation (and extra space character between cmd name and target name) compared to the rest of kbuild out: HOSTCC scripts/insert-sys-cert LD /srv/code/tools/objtool/arch/x86/objtool-in.o LD /srv/code/tools/objtool/libsubcmd-in.o AR /srv/code/tools/objtool/libsubcmd.a HOSTLD scripts/genksyms/genksyms CC scripts/mod/empty.o HOSTCC scripts/mod/mk_elfconfig CC scripts/mod/devicetable-offsets.s MKELF scripts/mod/elfconfig.h HOSTCC scripts/mod/modpost.o HOSTCC scripts/mod/file2alias.o HOSTCC scripts/mod/sumversion.o LD /srv/code/tools/objtool/objtool-in.o LINK /srv/code/tools/objtool/objtool HOSTLD scripts/mod/modpost CC kernel/bounds.s Adjust to match the rest of kbuild. Signed-off-by: Kees Cook Signed-off-by: Masahiro Yamada --- tools/build/Makefile.build | 22 +++++++++++----------- tools/scripts/Makefile.include | 30 +++++++++++++++--------------- 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'tools') diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build index cd72016c3cfa..715092fc6a23 100644 --- a/tools/build/Makefile.build +++ b/tools/build/Makefile.build @@ -51,39 +51,39 @@ subdir-obj-y := build-file := $(dir)/Build -include $(build-file) -quiet_cmd_flex = FLEX $@ -quiet_cmd_bison = BISON $@ +quiet_cmd_flex = FLEX $@ +quiet_cmd_bison = BISON $@ # Create directory unless it exists -quiet_cmd_mkdir = MKDIR $(dir $@) +quiet_cmd_mkdir = MKDIR $(dir $@) cmd_mkdir = mkdir -p $(dir $@) rule_mkdir = $(if $(wildcard $(dir $@)),,@$(call echo-cmd,mkdir) $(cmd_mkdir)) # Compile command -quiet_cmd_cc_o_c = CC $@ +quiet_cmd_cc_o_c = CC $@ cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< -quiet_cmd_host_cc_o_c = HOSTCC $@ +quiet_cmd_host_cc_o_c = HOSTCC $@ cmd_host_cc_o_c = $(HOSTCC) $(host_c_flags) -c -o $@ $< -quiet_cmd_cxx_o_c = CXX $@ +quiet_cmd_cxx_o_c = CXX $@ cmd_cxx_o_c = $(CXX) $(cxx_flags) -c -o $@ $< -quiet_cmd_cpp_i_c = CPP $@ +quiet_cmd_cpp_i_c = CPP $@ cmd_cpp_i_c = $(CC) $(c_flags) -E -o $@ $< -quiet_cmd_cc_s_c = AS $@ +quiet_cmd_cc_s_c = AS $@ cmd_cc_s_c = $(CC) $(c_flags) -S -o $@ $< -quiet_cmd_gen = GEN $@ +quiet_cmd_gen = GEN $@ # Link agregate command # If there's nothing to link, create empty $@ object. -quiet_cmd_ld_multi = LD $@ +quiet_cmd_ld_multi = LD $@ cmd_ld_multi = $(if $(strip $(obj-y)),\ $(LD) -r -o $@ $(filter $(obj-y),$^),rm -f $@; $(AR) rcs $@) -quiet_cmd_host_ld_multi = HOSTLD $@ +quiet_cmd_host_ld_multi = HOSTLD $@ cmd_host_ld_multi = $(if $(strip $(obj-y)),\ $(HOSTLD) -r -o $@ $(filter $(obj-y),$^),rm -f $@; $(HOSTAR) rcs $@) diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index f9271f3ea912..071312f5eb92 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -131,29 +131,29 @@ QUIET_SUBDIR1 = ifneq ($(silent),1) ifneq ($(V),1) - QUIET_CC = @echo ' CC '$@; - QUIET_CC_FPIC = @echo ' CC FPIC '$@; - QUIET_CLANG = @echo ' CLANG '$@; - QUIET_AR = @echo ' AR '$@; - QUIET_LINK = @echo ' LINK '$@; - QUIET_MKDIR = @echo ' MKDIR '$@; - QUIET_GEN = @echo ' GEN '$@; + QUIET_CC = @echo ' CC '$@; + QUIET_CC_FPIC = @echo ' CC FPIC '$@; + QUIET_CLANG = @echo ' CLANG '$@; + QUIET_AR = @echo ' AR '$@; + QUIET_LINK = @echo ' LINK '$@; + QUIET_MKDIR = @echo ' MKDIR '$@; + QUIET_GEN = @echo ' GEN '$@; QUIET_SUBDIR0 = +@subdir= QUIET_SUBDIR1 = ;$(NO_SUBDIR) \ - echo ' SUBDIR '$$subdir; \ + echo ' SUBDIR '$$subdir; \ $(MAKE) $(PRINT_DIR) -C $$subdir - QUIET_FLEX = @echo ' FLEX '$@; - QUIET_BISON = @echo ' BISON '$@; - QUIET_GENSKEL = @echo ' GEN-SKEL '$@; + QUIET_FLEX = @echo ' FLEX '$@; + QUIET_BISON = @echo ' BISON '$@; + QUIET_GENSKEL = @echo ' GENSKEL '$@; descend = \ - +@echo ' DESCEND '$(1); \ + +@echo ' DESCEND '$(1); \ mkdir -p $(OUTPUT)$(1) && \ $(MAKE) $(COMMAND_O) subdir=$(if $(subdir),$(subdir)/$(1),$(1)) $(PRINT_DIR) -C $(1) $(2) - QUIET_CLEAN = @printf ' CLEAN %s\n' $1; - QUIET_INSTALL = @printf ' INSTALL %s\n' $1; - QUIET_UNINST = @printf ' UNINST %s\n' $1; + QUIET_CLEAN = @printf ' CLEAN %s\n' $1; + QUIET_INSTALL = @printf ' INSTALL %s\n' $1; + QUIET_UNINST = @printf ' UNINST %s\n' $1; endif endif -- cgit v1.2.3 From c67d734975a25ba7b6e8f820c13e0d8eb4a2a77c Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Thu, 29 Apr 2021 20:57:59 +0200 Subject: perf buildid-list: Initialize zstd_data Fixes segmentation fault when trying to obtain buildid list (e.g. via perf-archive) from a zstd-compressed `perf.data` file: ``` $ perf record -z ls ... [ perf record: Captured and wrote 0,010 MB perf.data, compressed (original 0,001 MB, ratio is 2,190) ] $ memcheck perf buildid-list ... ==57268== Invalid read of size 4 ==57268== at 0x5260D88: ZSTD_decompressStream (in /usr/lib/libzstd.so.1.4.9) ==57268== by 0x4BB51B: zstd_decompress_stream (zstd.c:100) ==57268== by 0x425C6C: perf_session__process_compressed_event (session.c:73) ==57268== by 0x427450: perf_session__process_user_event (session.c:1631) ==57268== by 0x42A609: reader__process_events (session.c:2207) ==57268== by 0x42A609: __perf_session__process_events (session.c:2264) ==57268== by 0x42A609: perf_session__process_events (session.c:2297) ==57268== by 0x343A62: perf_session__list_build_ids (builtin-buildid-list.c:88) ==57268== by 0x343A62: cmd_buildid_list (builtin-buildid-list.c:120) ==57268== by 0x3C7732: run_builtin (perf.c:313) ==57268== by 0x331157: handle_internal_command (perf.c:365) ==57268== by 0x331157: run_argv (perf.c:409) ==57268== by 0x331157: main (perf.c:539) ==57268== Address 0x7470 is not stack'd, malloc'd or (recently) free'd ``` Signed-off-by: Milian Wolff Cc: Alexey Budankov Link: http://lore.kernel.org/lkml/20210429185759.59870-1-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-buildid-list.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index 87f5b1a4a7fa..833405c27dae 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -80,6 +80,9 @@ static int perf_session__list_build_ids(bool force, bool with_hits) if (!perf_header__has_feat(&session->header, HEADER_BUILD_ID)) with_hits = true; + if (zstd_init(&(session->zstd_data), 0) < 0) + pr_warning("Decompression initialization failed. Reported data may be incomplete.\n"); + /* * in pipe-mode, the only way to get the buildids is to parse * the record stream. Buildids are stored as RECORD_HEADER_BUILD_ID -- cgit v1.2.3 From 3c91e8efaf4838e4c8e465656e9707b5de26f3db Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 May 2021 12:35:07 -0300 Subject: tools arch kvm: Sync kvm headers with the kernel sources To pick up the changes from: 70f094f4f01dc4d6 ("KVM: nVMX: Properly pad 'struct kvm_vmx_nested_state_hdr'") That don't entail changes in tooling. This silences these tools/perf build warnings: Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/kvm.h' differs from latest version at 'arch/x86/include/uapi/asm/kvm.h' diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h Cc: Paolo Bonzini Cc: Vitaly Kuznetsov Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/kvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools') diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 5a3022c8af82..0662f644aad9 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -437,6 +437,8 @@ struct kvm_vmx_nested_state_hdr { __u16 flags; } smm; + __u16 pad; + __u32 flags; __u64 preemption_timer_deadline; }; -- cgit v1.2.3 From 0683b53197b55343a166f1507086823030809a19 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 2 May 2021 17:28:31 -0500 Subject: signal: Deliver all of the siginfo perf data in _perf Don't abuse si_errno and deliver all of the perf data in _perf member of siginfo_t. Note: The data field in the perf data structures in a u64 to allow a pointer to be encoded without needed to implement a 32bit and 64bit version of the same structure. There already exists a 32bit and 64bit versions siginfo_t, and the 32bit version can not include a 64bit member as it only has 32bit alignment. So unsigned long is used in siginfo_t instead of a u64 as unsigned long can encode a pointer on all architectures linux supports. v1: https://lkml.kernel.org/r/m11rarqqx2.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210503203814.25487-10-ebiederm@xmission.com v3: https://lkml.kernel.org/r/20210505141101.11519-11-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20210517195748.8880-4-ebiederm@xmission.com Reviewed-by: Marco Elver Signed-off-by: "Eric W. Biederman" --- arch/m68k/kernel/signal.c | 3 ++- arch/x86/kernel/signal_compat.c | 6 ++++-- fs/signalfd.c | 3 ++- include/linux/compat.h | 5 ++++- include/uapi/asm-generic/siginfo.h | 8 ++++++-- include/uapi/linux/perf_event.h | 2 +- include/uapi/linux/signalfd.h | 4 ++-- kernel/signal.c | 21 +++++++++++++-------- .../testing/selftests/perf_events/sigtrap_threads.c | 14 +++++++------- 9 files changed, 41 insertions(+), 25 deletions(-) (limited to 'tools') diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index a4b7ee1df211..8f215e79e70e 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -623,7 +623,8 @@ static inline void siginfo_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x12); /* _sigfault._perf */ - BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x14); /* _sigpoll */ BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x0c); diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index a9fcabd8a5e5..06743ec054d2 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -141,8 +141,10 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14); - BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14); CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); diff --git a/fs/signalfd.c b/fs/signalfd.c index e87e59581653..373df2f12415 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -134,7 +134,8 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, break; case SIL_PERF_EVENT: new.ssi_addr = (long) kinfo->si_addr; - new.ssi_perf = kinfo->si_perf; + new.ssi_perf_type = kinfo->si_perf_type; + new.ssi_perf_data = kinfo->si_perf_data; break; case SIL_CHLD: new.ssi_pid = kinfo->si_pid; diff --git a/include/linux/compat.h b/include/linux/compat.h index 6af7bef15e94..a27fffaae121 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -236,7 +236,10 @@ typedef struct compat_siginfo { u32 _pkey; } _addr_pkey; /* used when si_code=TRAP_PERF */ - compat_ulong_t _perf; + struct { + compat_ulong_t _data; + u32 _type; + } _perf; }; } _sigfault; diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index e663bf117b46..5a3c221f4c9d 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -91,7 +91,10 @@ union __sifields { __u32 _pkey; } _addr_pkey; /* used when si_code=TRAP_PERF */ - unsigned long _perf; + struct { + unsigned long _data; + __u32 _type; + } _perf; }; } _sigfault; @@ -154,7 +157,8 @@ typedef struct siginfo { #define si_lower _sifields._sigfault._addr_bnd._lower #define si_upper _sifields._sigfault._addr_bnd._upper #define si_pkey _sifields._sigfault._addr_pkey._pkey -#define si_perf _sifields._sigfault._perf +#define si_perf_data _sifields._sigfault._perf._data +#define si_perf_type _sifields._sigfault._perf._type #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd #define si_call_addr _sifields._sigsys._call_addr diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e54e639248c8..7b14753b3d38 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -464,7 +464,7 @@ struct perf_event_attr { /* * User provided data if sigtrap=1, passed back to user via - * siginfo_t::si_perf, e.g. to permit user to identify the event. + * siginfo_t::si_perf_data, e.g. to permit user to identify the event. */ __u64 sig_data; }; diff --git a/include/uapi/linux/signalfd.h b/include/uapi/linux/signalfd.h index 7e333042c7e3..e78dddf433fc 100644 --- a/include/uapi/linux/signalfd.h +++ b/include/uapi/linux/signalfd.h @@ -39,8 +39,8 @@ struct signalfd_siginfo { __s32 ssi_syscall; __u64 ssi_call_addr; __u32 ssi_arch; - __u32 __pad3; - __u64 ssi_perf; + __u32 ssi_perf_type; + __u64 ssi_perf_data; /* * Pad strcture to 128 bytes. Remember to update the diff --git a/kernel/signal.c b/kernel/signal.c index 3a18d13c39b2..dca53515ae3f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1768,11 +1768,13 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) struct kernel_siginfo info; clear_siginfo(&info); - info.si_signo = SIGTRAP; - info.si_errno = type; - info.si_code = TRAP_PERF; - info.si_addr = addr; - info.si_perf = sig_data; + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_PERF; + info.si_addr = addr; + info.si_perf_data = sig_data; + info.si_perf_type = type; + return force_sig_info(&info); } @@ -3356,7 +3358,8 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, break; case SIL_PERF_EVENT: to->si_addr = ptr_to_compat(from->si_addr); - to->si_perf = from->si_perf; + to->si_perf_data = from->si_perf_data; + to->si_perf_type = from->si_perf_type; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -3432,7 +3435,8 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, break; case SIL_PERF_EVENT: to->si_addr = compat_ptr(from->si_addr); - to->si_perf = from->si_perf; + to->si_perf_data = from->si_perf_data; + to->si_perf_type = from->si_perf_type; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -4615,7 +4619,8 @@ static inline void siginfo_buildtime_checks(void) CHECK_OFFSET(si_lower); CHECK_OFFSET(si_upper); CHECK_OFFSET(si_pkey); - CHECK_OFFSET(si_perf); + CHECK_OFFSET(si_perf_data); + CHECK_OFFSET(si_perf_type); /* sigpoll */ CHECK_OFFSET(si_band); diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c b/tools/testing/selftests/perf_events/sigtrap_threads.c index 78ddf5e11625..8e83cf91513a 100644 --- a/tools/testing/selftests/perf_events/sigtrap_threads.c +++ b/tools/testing/selftests/perf_events/sigtrap_threads.c @@ -43,7 +43,7 @@ static struct { siginfo_t first_siginfo; /* First observed siginfo_t. */ } ctx; -/* Unique value to check si_perf is correctly set from perf_event_attr::sig_data. */ +/* Unique value to check si_perf_data is correctly set from perf_event_attr::sig_data. */ #define TEST_SIG_DATA(addr) (~(unsigned long)(addr)) static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr) @@ -164,8 +164,8 @@ TEST_F(sigtrap_threads, enable_event) EXPECT_EQ(ctx.signal_count, NUM_THREADS); EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); - EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); /* Check enabled for parent. */ ctx.iterate_on = 0; @@ -183,8 +183,8 @@ TEST_F(sigtrap_threads, modify_and_enable_event) EXPECT_EQ(ctx.signal_count, NUM_THREADS); EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); - EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); /* Check enabled for parent. */ ctx.iterate_on = 0; @@ -203,8 +203,8 @@ TEST_F(sigtrap_threads, signal_stress) EXPECT_EQ(ctx.signal_count, NUM_THREADS * ctx.iterate_on); EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); - EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); } TEST_HARNESS_MAIN -- cgit v1.2.3 From 316a76a58c3f30735e5e416a6dc304d6bb86312d Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 17 May 2021 16:09:31 +0200 Subject: perf test: Fix libpfm4 support (63) test error for nested event groups Compiling perf with make LIBPFM4=1 includes libpfm support and enables test case 63 'Test libpfm4 support'. This test reports an error on all platforms for subtest 63.2 'test groups of --pfm-events'. The reported error message is 'nested event groups not supported' # ./perf test -F 63 63: Test libpfm4 support : 63.1: test of individual --pfm-events : Error: failed to parse event stereolab : event not found Error: failed to parse event stereolab,instructions : event not found Error: failed to parse event instructions,stereolab : event not found Ok 63.2: test groups of --pfm-events : Error: nested event groups not supported <------ Error message here Error: failed to parse event {stereolab} : event not found Error: failed to parse event {instructions,cycles},{instructions,stereolab} :\ event not found Ok # This patch addresses the error message 'nested event groups not supported'. The root cause is function parse_libpfm_events_option() which parses the event string '{},{instructions}' and can not handle a leading empty group notation '{},...'. The code detects the first (empty) group indicator '{' but does not terminate group processing on the following group closing character '}'. So when the second group indicator '{' is detected, the code assumes a nested group and returns an error. With the error message fixed, also change the expected event number to one for the test case to succeed. While at it also fix a memory leak. In good case the function does not free the duplicated string given as first parameter. Output after: # ./perf test -F 63 63: Test libpfm4 support : 63.1: test of individual --pfm-events : Error: failed to parse event stereolab : event not found Error: failed to parse event stereolab,instructions : event not found Error: failed to parse event instructions,stereolab : event not found Ok 63.2: test groups of --pfm-events : Error: failed to parse event {stereolab} : event not found Error: failed to parse event {instructions,cycles},{instructions,stereolab} : \ event not found Ok # Error message 'nested event groups not supported' is gone. Signed-off-by: Thomas Richter Acked-By: Ian Rogers Acked-by: Sumanth Korikkar Cc: Heiko Carstens Cc: Stephane Eranian Cc: Sven Schnelle Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/20210517140931.2559364-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/pfm.c | 4 ++-- tools/perf/util/pfm.c | 11 ++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/pfm.c b/tools/perf/tests/pfm.c index 76a53126efdf..d4b0ef74defc 100644 --- a/tools/perf/tests/pfm.c +++ b/tools/perf/tests/pfm.c @@ -131,8 +131,8 @@ static int test__pfm_group(void) }, { .events = "{},{instructions}", - .nr_events = 0, - .nr_groups = 0, + .nr_events = 1, + .nr_groups = 1, }, { .events = "{instructions},{instructions}", diff --git a/tools/perf/util/pfm.c b/tools/perf/util/pfm.c index d735acb6c29c..6eef6dfeaa57 100644 --- a/tools/perf/util/pfm.c +++ b/tools/perf/util/pfm.c @@ -62,8 +62,16 @@ int parse_libpfm_events_option(const struct option *opt, const char *str, } /* no event */ - if (*q == '\0') + if (*q == '\0') { + if (*sep == '}') { + if (grp_evt < 0) { + ui__error("cannot close a non-existing event group\n"); + goto error; + } + grp_evt--; + } continue; + } memset(&attr, 0, sizeof(attr)); event_attr_init(&attr); @@ -107,6 +115,7 @@ int parse_libpfm_events_option(const struct option *opt, const char *str, grp_evt = -1; } } + free(p_orig); return 0; error: free(p_orig); -- cgit v1.2.3 From cb7987837c31b217b28089bbc78922d5c9187869 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 19 May 2021 10:45:13 +0300 Subject: perf intel-pt: Fix transaction abort handling When adding support for power events, some handling of FUP packets was unified. That resulted in breaking reporting of TSX aborts, by not considering the associated TIP packet. Fix that. Example: A machine that supports TSX is required. It will have flag "rtm". Kernel parameter tsx=on may be required. # for w in `cat /proc/cpuinfo | grep -m1 flags `;do echo $w | grep rtm ; done rtm Test program: #include #include int main() { int x = 0; if (_xbegin() == _XBEGIN_STARTED) { x = 1; _xabort(1); } else { printf("x = %d\n", x); } return 0; } Compile with -mrtm i.e. gcc -Wall -Wextra -mrtm xabort.c -o xabort Record: perf record -e intel_pt/cyc/u --filter 'filter main @ ./xabort' ./xabort Before: # perf script --itrace=be -F+flags,+addr,-period,-event --ns xabort 1478 [007] 92161.431348552: tr strt 0 [unknown] ([unknown]) => 400b6d main+0x0 (/root/xabort) xabort 1478 [007] 92161.431348624: jmp 400b96 main+0x29 (/root/xabort) => 400bae main+0x41 (/root/xabort) xabort 1478 [007] 92161.431348624: return 400bb4 main+0x47 (/root/xabort) => 400b87 main+0x1a (/root/xabort) xabort 1478 [007] 92161.431348637: jcc 400b8a main+0x1d (/root/xabort) => 400b98 main+0x2b (/root/xabort) xabort 1478 [007] 92161.431348644: tr end call 400ba9 main+0x3c (/root/xabort) => 40f690 printf+0x0 (/root/xabort) xabort 1478 [007] 92161.431360859: tr strt 0 [unknown] ([unknown]) => 400bae main+0x41 (/root/xabort) xabort 1478 [007] 92161.431360882: tr end return 400bb4 main+0x47 (/root/xabort) => 401139 __libc_start_main+0x309 (/root/xabort) After: # perf script --itrace=be -F+flags,+addr,-period,-event --ns xabort 1478 [007] 92161.431348552: tr strt 0 [unknown] ([unknown]) => 400b6d main+0x0 (/root/xabort) xabort 1478 [007] 92161.431348624: tx abrt 400b93 main+0x26 (/root/xabort) => 400b87 main+0x1a (/root/xabort) xabort 1478 [007] 92161.431348637: jcc 400b8a main+0x1d (/root/xabort) => 400b98 main+0x2b (/root/xabort) xabort 1478 [007] 92161.431348644: tr end call 400ba9 main+0x3c (/root/xabort) => 40f690 printf+0x0 (/root/xabort) xabort 1478 [007] 92161.431360859: tr strt 0 [unknown] ([unknown]) => 400bae main+0x41 (/root/xabort) xabort 1478 [007] 92161.431360882: tr end return 400bb4 main+0x47 (/root/xabort) => 401139 __libc_start_main+0x309 (/root/xabort) Fixes: a472e65fc490a ("perf intel-pt: Add decoder support for ptwrite and power event packets") Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: stable@vger.kernel.org Link: http://lore.kernel.org/lkml/20210519074515.9262-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 8c59677bee13..20ad663978cc 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -1146,6 +1146,8 @@ static bool intel_pt_fup_event(struct intel_pt_decoder *decoder) decoder->set_fup_tx_flags = false; decoder->tx_flags = decoder->fup_tx_flags; decoder->state.type = INTEL_PT_TRANSACTION; + if (decoder->fup_tx_flags & INTEL_PT_ABORT_TX) + decoder->state.type |= INTEL_PT_BRANCH; decoder->state.from_ip = decoder->ip; decoder->state.to_ip = 0; decoder->state.flags = decoder->fup_tx_flags; @@ -1220,8 +1222,10 @@ static int intel_pt_walk_fup(struct intel_pt_decoder *decoder) return 0; if (err == -EAGAIN || intel_pt_fup_with_nlip(decoder, &intel_pt_insn, ip, err)) { + bool no_tip = decoder->pkt_state != INTEL_PT_STATE_FUP; + decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; - if (intel_pt_fup_event(decoder)) + if (intel_pt_fup_event(decoder) && no_tip) return 0; return -EAGAIN; } -- cgit v1.2.3 From c954eb72b31a9dc56c99b450253ec5b121add320 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 19 May 2021 10:45:14 +0300 Subject: perf intel-pt: Fix sample instruction bytes The decoder reports the current instruction if it was decoded. In some cases the current instruction is not decoded, in which case the instruction bytes length must be set to zero. Ensure that is always done. Note perf script can anyway get the instruction bytes for any samples where they are not present. Also note, that there is a redundant "ptq->insn_len = 0" statement which is not removed until a subsequent patch in order to make this patch apply cleanly to stable branches. Example: A machne that supports TSX is required. It will have flag "rtm". Kernel parameter tsx=on may be required. # for w in `cat /proc/cpuinfo | grep -m1 flags `;do echo $w | grep rtm ; done rtm Test program: #include #include int main() { int x = 0; if (_xbegin() == _XBEGIN_STARTED) { x = 1; _xabort(1); } else { printf("x = %d\n", x); } return 0; } Compile with -mrtm i.e. gcc -Wall -Wextra -mrtm xabort.c -o xabort Record: perf record -e intel_pt/cyc/u --filter 'filter main @ ./xabort' ./xabort Before: # perf script --itrace=xe -F+flags,+insn,-period --xed --ns xabort 1478 [007] 92161.431348581: transactions: x 400b81 main+0x14 (/root/xabort) mov $0xffffffff, %eax xabort 1478 [007] 92161.431348624: transactions: tx abrt 400b93 main+0x26 (/root/xabort) mov $0xffffffff, %eax After: # perf script --itrace=xe -F+flags,+insn,-period --xed --ns xabort 1478 [007] 92161.431348581: transactions: x 400b81 main+0x14 (/root/xabort) xbegin 0x6 xabort 1478 [007] 92161.431348624: transactions: tx abrt 400b93 main+0x26 (/root/xabort) xabort $0x1 Fixes: faaa87680b25d ("perf intel-pt/bts: Report instruction bytes and length in sample") Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: stable@vger.kernel.org Link: http://lore.kernel.org/lkml/20210519074515.9262-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-pt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 8658d42ce57a..beae5cbe9cc2 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -707,8 +707,10 @@ static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, *ip += intel_pt_insn->length; - if (to_ip && *ip == to_ip) + if (to_ip && *ip == to_ip) { + intel_pt_insn->length = 0; goto out_no_cache; + } if (*ip >= al.map->end) break; @@ -1198,6 +1200,7 @@ static void intel_pt_set_pid_tid_cpu(struct intel_pt *pt, static void intel_pt_sample_flags(struct intel_pt_queue *ptq) { + ptq->insn_len = 0; if (ptq->state->flags & INTEL_PT_ABORT_TX) { ptq->flags = PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_TX_ABORT; } else if (ptq->state->flags & INTEL_PT_ASYNC) { -- cgit v1.2.3 From 0a0c59724516fabf9705c0d9927fa12319908852 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 19 May 2021 10:45:15 +0300 Subject: perf intel-pt: Remove redundant setting of ptq->insn_len Remove redundant "ptq->insn_len = 0" statement. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20210519074515.9262-4-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/intel-pt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index beae5cbe9cc2..0dfec8761b9a 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1214,7 +1214,6 @@ static void intel_pt_sample_flags(struct intel_pt_queue *ptq) ptq->flags = PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_CALL | PERF_IP_FLAG_ASYNC | PERF_IP_FLAG_INTERRUPT; - ptq->insn_len = 0; } else { if (ptq->state->from_ip) ptq->flags = intel_pt_insn_type(ptq->state->insn_op); -- cgit v1.2.3 From fb6c79d7261afb7e942251254ea47951c2a9a706 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 Feb 2021 17:33:27 +0900 Subject: perf tools: Add 'cgroup-switches' software event It counts how often cgroups are changed actually during the context switches. # perf stat -a -e context-switches,cgroup-switches -a sleep 1 Performance counter stats for 'system wide': 11,267 context-switches 10,950 cgroup-switches 1.015634369 seconds time elapsed Committer notes: The kernel patches landed in v5.13, but this entry wasn't filled in perf's parse-events tables, which was leading to a segfault when running 'perf list' on a kernel with that feature, as reported by Thomas Richter. Also removed the part touching tools/include/uapi/linux/perf_event.h as it was updated in the usual sync with the kernel UAPI headers, in a previous, already upstream, patch. Signed-off-by: Namhyung Kim Cc: Alexander Shishkin Cc: Andi Kleen Cc: Heiko Carstens Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Richter Link: http://lore.kernel.org/lkml/20210210083327.22726-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 4 ++++ tools/perf/util/parse-events.l | 1 + 2 files changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 4dad14265b81..269997066f6e 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -150,6 +150,10 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { .symbol = "bpf-output", .alias = "", }, + [PERF_COUNT_SW_CGROUP_SWITCHES] = { + .symbol = "cgroup-switches", + .alias = "", + }, }; #define __PERF_EVENT_FIELD(config, name) \ diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index fb8646cc3e83..923849024b15 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -347,6 +347,7 @@ emulation-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EM dummy { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); } duration_time { return tool(yyscanner, PERF_TOOL_DURATION_TIME); } bpf-output { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); } +cgroup-switches { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CGROUP_SWITCHES); } /* * We have to handle the kernel PMU event cycles-ct/cycles-t/mem-loads/mem-stores separately. -- cgit v1.2.3 From 5665bc35c1ed917ac8fd06cb651317bb47a65b10 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 20 May 2021 21:19:30 +1000 Subject: powerpc/64s/syscall: Use pt_regs.trap to distinguish syscall ABI difference between sc and scv syscalls The sc and scv 0 system calls have different ABI conventions, and ptracers need to know which system call type is being used if they want to look at the syscall registers. Document that pt_regs.trap can be used for this, and fix one in-tree user to work with scv 0 syscalls. Fixes: 7fa95f9adaee ("powerpc/64s: system call support for scv/rfscv instructions") Cc: stable@vger.kernel.org # v5.9+ Reported-by: "Dmitry V. Levin" Suggested-by: "Dmitry V. Levin" Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210520111931.2597127-1-npiggin@gmail.com --- Documentation/powerpc/syscall64-abi.rst | 10 ++++++++++ tools/testing/selftests/seccomp/seccomp_bpf.c | 27 ++++++++++++++++++--------- 2 files changed, 28 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/Documentation/powerpc/syscall64-abi.rst b/Documentation/powerpc/syscall64-abi.rst index dabee3729e5a..56490c4c0c07 100644 --- a/Documentation/powerpc/syscall64-abi.rst +++ b/Documentation/powerpc/syscall64-abi.rst @@ -109,6 +109,16 @@ auxiliary vector. scv 0 syscalls will always behave as PPC_FEATURE2_HTM_NOSC. +ptrace +------ +When ptracing system calls (PTRACE_SYSCALL), the pt_regs.trap value contains +the system call type that can be used to distinguish between sc and scv 0 +system calls, and the different register conventions can be accounted for. + +If the value of (pt_regs.trap & 0xfff0) is 0xc00 then the system call was +performed with the sc instruction, if it is 0x3000 then the system call was +performed with the scv 0 instruction. + vsyscall ======== diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 98c3b647f54d..e3d5c77a8612 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1753,16 +1753,25 @@ TEST_F(TRACE_poke, getpid_runs_normally) # define SYSCALL_RET_SET(_regs, _val) \ do { \ typeof(_val) _result = (_val); \ - /* \ - * A syscall error is signaled by CR0 SO bit \ - * and the code is stored as a positive value. \ - */ \ - if (_result < 0) { \ - SYSCALL_RET(_regs) = -_result; \ - (_regs).ccr |= 0x10000000; \ - } else { \ + if ((_regs.trap & 0xfff0) == 0x3000) { \ + /* \ + * scv 0 system call uses -ve result \ + * for error, so no need to adjust. \ + */ \ SYSCALL_RET(_regs) = _result; \ - (_regs).ccr &= ~0x10000000; \ + } else { \ + /* \ + * A syscall error is signaled by the \ + * CR0 SO bit and the code is stored as \ + * a positive value. \ + */ \ + if (_result < 0) { \ + SYSCALL_RET(_regs) = -_result; \ + (_regs).ccr |= 0x10000000; \ + } else { \ + SYSCALL_RET(_regs) = _result; \ + (_regs).ccr &= ~0x10000000; \ + } \ } \ } while (0) # define SYSCALL_RET_SET_ON_PTRACE_EXIT -- cgit v1.2.3 From 8570e75a55430844a8e85e3458e5701556334ffd Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 19 May 2021 21:33:33 +0000 Subject: selftests: Add .gitignore for nci test suite Building the nci test suite produces a binary, nci_dev, that git then tries to track. Add a .gitignore file to tell git to ignore this binary. Signed-off-by: David Matlack Signed-off-by: David S. Miller --- tools/testing/selftests/nci/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/testing/selftests/nci/.gitignore (limited to 'tools') diff --git a/tools/testing/selftests/nci/.gitignore b/tools/testing/selftests/nci/.gitignore new file mode 100644 index 000000000000..448eeb4590fc --- /dev/null +++ b/tools/testing/selftests/nci/.gitignore @@ -0,0 +1 @@ +/nci_dev -- cgit v1.2.3 From 8f1634b82189e715b0f82f16ce54fab43cfedd8a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 14 May 2021 10:05:28 -0700 Subject: selftests/bpf: Convert static to global in tc_redirect progs Both IFINDEX_SRC and IFINDEX_DST are set from the userspace and it won't work once bpf merges with bpf-next. Fixes: 096eccdef0b3 ("selftests/bpf: Rewrite test_tc_redirect.sh as prog_tests/tc_redirect.c") Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210514170528.3750250-1-sdf@google.com --- tools/testing/selftests/bpf/progs/test_tc_neigh.c | 4 ++-- tools/testing/selftests/bpf/progs/test_tc_peer.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh.c b/tools/testing/selftests/bpf/progs/test_tc_neigh.c index 90f64a85998f..0c93d326a663 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_neigh.c +++ b/tools/testing/selftests/bpf/progs/test_tc_neigh.c @@ -33,8 +33,8 @@ a.s6_addr32[3] == b.s6_addr32[3]) #endif -static volatile const __u32 IFINDEX_SRC; -static volatile const __u32 IFINDEX_DST; +volatile const __u32 IFINDEX_SRC; +volatile const __u32 IFINDEX_DST; static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb, __be32 addr) diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c index 72c72950c3bb..ef264bced0e6 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_peer.c +++ b/tools/testing/selftests/bpf/progs/test_tc_peer.c @@ -8,8 +8,8 @@ #include -static volatile const __u32 IFINDEX_SRC; -static volatile const __u32 IFINDEX_DST; +volatile const __u32 IFINDEX_SRC; +volatile const __u32 IFINDEX_DST; SEC("classifier/chk_egress") int tc_chk(struct __sk_buff *skb) -- cgit v1.2.3 From 704e2beba23c45eaa056b1c03b5e1fb221e03f80 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 14 May 2021 11:07:26 -0700 Subject: selftests/bpf: Test ringbuf mmap read-only and read-write restrictions Extend ringbuf selftest to validate read/write and read-only restrictions on memory mapping consumer/producer/data pages. Ensure no "escalations" from PROT_READ to PROT_WRITE/PROT_EXEC is allowed. And test that mremap() fails to expand mmap()'ed area. Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210514180726.843157-1-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/ringbuf.c | 49 +++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index de78617f6550..f9a8ae331963 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -86,8 +86,9 @@ void test_ringbuf(void) const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample); pthread_t thread; long bg_ret = -1; - int err, cnt; + int err, cnt, rb_fd; int page_size = getpagesize(); + void *mmap_ptr, *tmp_ptr; skel = test_ringbuf__open(); if (CHECK(!skel, "skel_open", "skeleton open failed\n")) @@ -101,6 +102,52 @@ void test_ringbuf(void) if (CHECK(err != 0, "skel_load", "skeleton load failed\n")) goto cleanup; + rb_fd = bpf_map__fd(skel->maps.ringbuf); + /* good read/write cons_pos */ + mmap_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rb_fd, 0); + ASSERT_OK_PTR(mmap_ptr, "rw_cons_pos"); + tmp_ptr = mremap(mmap_ptr, page_size, 2 * page_size, MREMAP_MAYMOVE); + if (!ASSERT_ERR_PTR(tmp_ptr, "rw_extend")) + goto cleanup; + ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_cons_pos_protect"); + ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_rw"); + + /* bad writeable prod_pos */ + mmap_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd, page_size); + err = -errno; + ASSERT_ERR_PTR(mmap_ptr, "wr_prod_pos"); + ASSERT_EQ(err, -EPERM, "wr_prod_pos_err"); + + /* bad writeable data pages */ + mmap_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd, 2 * page_size); + err = -errno; + ASSERT_ERR_PTR(mmap_ptr, "wr_data_page_one"); + ASSERT_EQ(err, -EPERM, "wr_data_page_one_err"); + mmap_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd, 3 * page_size); + ASSERT_ERR_PTR(mmap_ptr, "wr_data_page_two"); + mmap_ptr = mmap(NULL, 2 * page_size, PROT_WRITE, MAP_SHARED, rb_fd, 2 * page_size); + ASSERT_ERR_PTR(mmap_ptr, "wr_data_page_all"); + + /* good read-only pages */ + mmap_ptr = mmap(NULL, 4 * page_size, PROT_READ, MAP_SHARED, rb_fd, 0); + if (!ASSERT_OK_PTR(mmap_ptr, "ro_prod_pos")) + goto cleanup; + + ASSERT_ERR(mprotect(mmap_ptr, 4 * page_size, PROT_WRITE), "write_protect"); + ASSERT_ERR(mprotect(mmap_ptr, 4 * page_size, PROT_EXEC), "exec_protect"); + ASSERT_ERR_PTR(mremap(mmap_ptr, 0, 4 * page_size, MREMAP_MAYMOVE), "ro_remap"); + ASSERT_OK(munmap(mmap_ptr, 4 * page_size), "unmap_ro"); + + /* good read-only pages with initial offset */ + mmap_ptr = mmap(NULL, page_size, PROT_READ, MAP_SHARED, rb_fd, page_size); + if (!ASSERT_OK_PTR(mmap_ptr, "ro_prod_pos")) + goto cleanup; + + ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_WRITE), "write_protect"); + ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_protect"); + ASSERT_ERR_PTR(mremap(mmap_ptr, 0, 3 * page_size, MREMAP_MAYMOVE), "ro_remap"); + ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_ro"); + /* only trigger BPF program for current process */ skel->bss->pid = getpid(); -- cgit v1.2.3 From 3b2f17ad1770e51b8b4e68b5069c4f1ee477eff8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 19 May 2021 13:50:31 -0300 Subject: perf parse-events: Check if the software events array slots are populated To avoid a NULL pointer dereference when the kernel supports the new feature but the tooling still hasn't an entry for it. This happened with the recently added PERF_COUNT_SW_CGROUP_SWITCHES software event. Reported-by: Thomas Richter Cc: Adrian Hunter Cc: Heiko Carstens Cc: Jiri Olsa Cc: Namhyung Kim Cc: Sumanth Korikkar Link: https://lore.kernel.org/linux-perf-users/YKVESEKRjKtILhog@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 269997066f6e..84108c17f48d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2932,9 +2932,14 @@ restart: } for (i = 0; i < max; i++, syms++) { + /* + * New attr.config still not supported here, the latest + * example was PERF_COUNT_SW_CGROUP_SWITCHES + */ + if (syms->symbol == NULL) + continue; - if (event_glob != NULL && syms->symbol != NULL && - !(strglobmatch(syms->symbol, event_glob) || + if (event_glob != NULL && !(strglobmatch(syms->symbol, event_glob) || (syms->alias && strglobmatch(syms->alias, event_glob)))) continue; -- cgit v1.2.3 From ec347b7c319156c3b488681d1813d08d88499cc6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 21 May 2021 16:00:31 -0300 Subject: tools headers UAPI: Sync linux/fs.h with the kernel sources To pick the trivial change in: 63c8af5687f6b1b7 ("block: uapi: fix comment about block device ioctl") This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/fs.h' differs from latest version at 'include/uapi/linux/fs.h' diff -u tools/include/uapi/linux/fs.h include/uapi/linux/fs.h Cc: Damien Le Moal Cc: Jens Axboe Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index f44eb0a04afd..4c32e97dcdf0 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -185,7 +185,7 @@ struct fsxattr { #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) /* - * A jump here: 130-131 are reserved for zoned block devices + * A jump here: 130-136 are reserved for zoned block devices * (see uapi/linux/blkzoned.h) */ -- cgit v1.2.3 From 4224680ee7aaf0f13ab762ffb2a77373737dce5e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 21 May 2021 16:00:31 -0300 Subject: tools headers UAPI: Sync linux/perf_event.h with the kernel sources To pick the trivial change in: 0683b53197b55343 ("signal: Deliver all of the siginfo perf data in _perf") This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Cc: Eric W. Biederman Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index bf8143505c49..f92880a15645 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -464,7 +464,7 @@ struct perf_event_attr { /* * User provided data if sigtrap=1, passed back to user via - * siginfo_t::si_perf, e.g. to permit user to identify the event. + * siginfo_t::si_perf_data, e.g. to permit user to identify the event. */ __u64 sig_data; }; -- cgit v1.2.3 From bffcbe79370e8fda7f1d19899de83aa2a833bf69 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 21 May 2021 16:14:00 -0300 Subject: tools headers UAPI: Sync files changed by the quotactl_path unwiring To pick the changes in this csets: 5b9fedb31e476693 ("quota: Disable quotactl_path syscall") That silences these perf build warnings: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Warning: Kernel ABI header at 'tools/perf/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl' diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/s390/entry/syscalls/syscall.tbl' differs from latest version at 'arch/s390/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl' differs from latest version at 'arch/mips/kernel/syscalls/syscall_n64.tbl' diff -u tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl Cc: Jan Kara Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 2 +- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 2 +- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 2 +- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index 9974f5f8e49b..9cd1c34f31b5 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -357,7 +357,7 @@ 440 n64 process_madvise sys_process_madvise 441 n64 epoll_pwait2 sys_epoll_pwait2 442 n64 mount_setattr sys_mount_setattr -443 n64 quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 445 n64 landlock_add_rule sys_landlock_add_rule 446 n64 landlock_restrict_self sys_landlock_restrict_self diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 2e68fbb57cc6..8f052ff4058c 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -522,7 +522,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index 7e4a2aba366d..0690263df1dd 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -445,7 +445,7 @@ 440 common process_madvise sys_process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index ecd551b08d05..ce18119ea0d0 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -364,7 +364,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self -- cgit v1.2.3 From a6172059758ba1b496ae024cece7d5bdc8d017db Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 21 May 2021 12:20:51 +0300 Subject: perf scripts python: exported-sql-viewer.py: Fix copy to clipboard from Top Calls by elapsed Time report Provide missing argument to prevent following error when copying a selection to the clipboard: Traceback (most recent call last): File "tools/perf/scripts/python/exported-sql-viewer.py", line 4041, in menu.addAction(CreateAction("&Copy selection", "Copy to clipboard", lambda: CopyCellsToClipboardHdr(self.view), self.view)) File "tools/perf/scripts/python/exported-sql-viewer.py", line 4021, in CopyCellsToClipboardHdr CopyCellsToClipboard(view, False, True) File "tools/perf/scripts/python/exported-sql-viewer.py", line 4018, in CopyCellsToClipboard view.CopyCellsToClipboard(view, as_csv, with_hdr) File "tools/perf/scripts/python/exported-sql-viewer.py", line 3871, in CopyTableCellsToClipboard val = model.headerData(col, Qt.Horizontal) TypeError: headerData() missing 1 required positional argument: 'role' Fixes: 96c43b9a7ab3b ("perf scripts python: exported-sql-viewer.py: Add copy to clipboard") Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: stable@vger.kernel.org Link: http://lore.kernel.org/lkml/20210521092053.25683-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/exported-sql-viewer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py index 7daa8bb70a5a..b5078d65704e 100755 --- a/tools/perf/scripts/python/exported-sql-viewer.py +++ b/tools/perf/scripts/python/exported-sql-viewer.py @@ -3868,7 +3868,7 @@ def CopyTableCellsToClipboard(view, as_csv=False, with_hdr=False): if with_hdr: model = indexes[0].model() for col in range(min_col, max_col + 1): - val = model.headerData(col, Qt.Horizontal) + val = model.headerData(col, Qt.Horizontal, Qt.DisplayRole) if as_csv: text += sep + ToCSValue(val) sep = "," -- cgit v1.2.3 From fd931b2e234a7cc451a7bbb1965d6ce623189158 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 21 May 2021 12:20:52 +0300 Subject: perf scripts python: exported-sql-viewer.py: Fix Array TypeError The 'Array' class is present in more than one python standard library. In some versions of Python 3, the following error occurs: Traceback (most recent call last): File "tools/perf/scripts/python/exported-sql-viewer.py", line 4702, in reports_menu.addAction(CreateAction(label, "Create a new window displaying branch events", lambda a=None,x=dbid: self.NewBranchView(x), self)) File "tools/perf/scripts/python/exported-sql-viewer.py", line 4727, in NewBranchView BranchWindow(self.glb, event_id, ReportVars(), self) File "tools/perf/scripts/python/exported-sql-viewer.py", line 3208, in __init__ self.model = LookupCreateModel(model_name, lambda: BranchModel(glb, event_id, report_vars.where_clause)) File "tools/perf/scripts/python/exported-sql-viewer.py", line 343, in LookupCreateModel model = create_fn() File "tools/perf/scripts/python/exported-sql-viewer.py", line 3208, in self.model = LookupCreateModel(model_name, lambda: BranchModel(glb, event_id, report_vars.where_clause)) File "tools/perf/scripts/python/exported-sql-viewer.py", line 3124, in __init__ self.fetcher = SQLFetcher(glb, sql, prep, self.AddSample) File "tools/perf/scripts/python/exported-sql-viewer.py", line 2658, in __init__ self.buffer = Array(c_char, self.buffer_size, lock=False) TypeError: abstract class This apparently happens because Python can be inconsistent about which class of the name 'Array' gets imported. Fix by importing explicitly by name so that only the desired 'Array' gets imported. Fixes: 8392b74b575c3 ("perf scripts python: exported-sql-viewer.py: Add ability to display all the database tables") Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: stable@vger.kernel.org Link: http://lore.kernel.org/lkml/20210521092053.25683-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/exported-sql-viewer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py index b5078d65704e..4a63843f623c 100755 --- a/tools/perf/scripts/python/exported-sql-viewer.py +++ b/tools/perf/scripts/python/exported-sql-viewer.py @@ -125,8 +125,9 @@ if pyside_version_1: from PySide.QtGui import * from PySide.QtSql import * -from decimal import * -from ctypes import * +from decimal import Decimal, ROUND_HALF_UP +from ctypes import CDLL, Structure, create_string_buffer, addressof, sizeof, \ + c_void_p, c_bool, c_byte, c_char, c_int, c_uint, c_longlong, c_ulonglong from multiprocessing import Process, Array, Value, Event # xrange is range in Python3 -- cgit v1.2.3 From f56299a9c998e0bfbd4ab07cafe9eb8444512448 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 21 May 2021 12:20:53 +0300 Subject: perf scripts python: exported-sql-viewer.py: Fix warning display Deprecation warnings are useful only for the developer, not an end user. Display warnings only when requested using the python -W option. This stops the display of warnings like: tools/perf/scripts/python/exported-sql-viewer.py:5102: DeprecationWarning: an integer is required (got type PySide2.QtCore.Qt.AlignmentFlag). Implicit conversion to integers using __int__ is deprecated, and may be removed in a future version of Python. err = app.exec_() Since the warning can be fixed only in PySide2, we must wait for it to be finally fixed there. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: stable@vger.kernel.org # v5.3+ Link: http://lore.kernel.org/lkml/20210521092053.25683-4-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/exported-sql-viewer.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/perf/scripts/python/exported-sql-viewer.py b/tools/perf/scripts/python/exported-sql-viewer.py index 4a63843f623c..711d4f9f5645 100755 --- a/tools/perf/scripts/python/exported-sql-viewer.py +++ b/tools/perf/scripts/python/exported-sql-viewer.py @@ -91,6 +91,11 @@ from __future__ import print_function import sys +# Only change warnings if the python -W option was not used +if not sys.warnoptions: + import warnings + # PySide2 causes deprecation warnings, ignore them. + warnings.filterwarnings("ignore", category=DeprecationWarning) import argparse import weakref import threading -- cgit v1.2.3 From f42907e8a4515635615a6ffd44242454ef843c04 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 21 May 2021 20:51:27 +0300 Subject: perf script: Add missing PERF_IP_FLAG_CHARS for VM-Entry and VM-Exit Add 'g' (guest) for VM-Entry and 'h' (host) for VM-Exit. Fixes: c025d46cd932c ("perf script: Add branch types for VM-Entry and VM-Exit") Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20210521175127.27264-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-intel-pt.txt | 6 +++--- tools/perf/Documentation/perf-script.txt | 7 ++++--- tools/perf/util/event.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt index 1dcec73c910c..bcf3eca5afbe 100644 --- a/tools/perf/Documentation/perf-intel-pt.txt +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -108,9 +108,9 @@ displayed as follows: perf script --itrace=ibxwpe -F+flags -The flags are "bcrosyiABEx" which stand for branch, call, return, conditional, -system, asynchronous, interrupt, transaction abort, trace begin, trace end, and -in transaction, respectively. +The flags are "bcrosyiABExgh" which stand for branch, call, return, conditional, +system, asynchronous, interrupt, transaction abort, trace begin, trace end, +in transaction, VM-entry, and VM-exit respectively. perf script also supports higher level ways to dump instruction traces: diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 5b8b61075039..48a5f5b26dd4 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -183,14 +183,15 @@ OPTIONS At this point usage is displayed, and perf-script exits. The flags field is synthesized and may have a value when Instruction - Trace decoding. The flags are "bcrosyiABEx" which stand for branch, + Trace decoding. The flags are "bcrosyiABExgh" which stand for branch, call, return, conditional, system, asynchronous, interrupt, - transaction abort, trace begin, trace end, and in transaction, + transaction abort, trace begin, trace end, in transaction, VM-Entry, and VM-Exit respectively. Known combinations of flags are printed more nicely e.g. "call" for "bc", "return" for "br", "jcc" for "bo", "jmp" for "b", "int" for "bci", "iret" for "bri", "syscall" for "bcs", "sysret" for "brs", "async" for "by", "hw int" for "bcyi", "tx abrt" for "bA", "tr strt" for "bB", - "tr end" for "bE". However the "x" flag will be display separately in those + "tr end" for "bE", "vmentry" for "bcg", "vmexit" for "bch". + However the "x" flag will be displayed separately in those cases e.g. "jcc (x)" for a condition branch within a transaction. The callindent field is synthesized and may have a value when diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 8a62fb39e365..19ad64f2bd83 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -100,7 +100,7 @@ enum { PERF_IP_FLAG_VMEXIT = 1ULL << 12, }; -#define PERF_IP_FLAG_CHARS "bcrosyiABEx" +#define PERF_IP_FLAG_CHARS "bcrosyiABExgh" #define PERF_BRANCH_MASK (\ PERF_IP_FLAG_BRANCH |\ -- cgit v1.2.3 From f8b61bd20479c094fb421da42fef6b4ff22a589e Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 11 May 2021 23:51:16 -0700 Subject: perf stat: Skip evlist__[enable|disable] when all events uses BPF When all events of a perf-stat session use BPF, it is not necessary to call evlist__enable() and evlist__disable(). Skip them when all_counters_use_bpf is true. Signed-off-by: Song Liu Reported-by: Jiri Olsa Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 13 ++++++++++--- tools/perf/util/evlist.c | 3 --- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5a830ae09418..f9f74a514315 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -572,7 +572,8 @@ static int enable_counters(void) * - we have initial delay configured */ if (!target__none(&target) || stat_config.initial_delay) { - evlist__enable(evsel_list); + if (!all_counters_use_bpf) + evlist__enable(evsel_list); if (stat_config.initial_delay > 0) pr_info(EVLIST_ENABLED_MSG); } @@ -581,13 +582,19 @@ static int enable_counters(void) static void disable_counters(void) { + struct evsel *counter; + /* * If we don't have tracee (attaching to task or cpu), counters may * still be running. To get accurate group ratios, we must stop groups * from counting before reading their constituent counters. */ - if (!target__none(&target)) - evlist__disable(evsel_list); + if (!target__none(&target)) { + evlist__for_each_entry(evsel_list, counter) + bpf_counter__disable(counter); + if (!all_counters_use_bpf) + evlist__disable(evsel_list); + } } static volatile int workload_exec_errno; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 6e5c41528c7d..6ea3e677dc1e 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -425,9 +425,6 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name) if (affinity__setup(&affinity) < 0) return; - evlist__for_each_entry(evlist, pos) - bpf_counter__disable(pos); - /* Disable 'immediate' events last */ for (imm = 0; imm <= 1; imm++) { evlist__for_each_cpu(evlist, i, cpu) { -- cgit v1.2.3 From 4d1cd3b2c5c1c32826454de3a18c6183238d47ed Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Sat, 22 May 2021 17:41:53 -0700 Subject: tools/testing/selftests/exec: fix link error Fix the link error by adding '-static': gcc -Wall -Wl,-z,max-page-size=0x1000 -pie load_address.c -o /home/yang/linux/tools/testing/selftests/exec/load_address_4096 /usr/bin/ld: /tmp/ccopEGun.o: relocation R_AARCH64_ADR_PREL_PG_HI21 against symbol `stderr@@GLIBC_2.17' which may bind externally can not be used when making a shared object; recompile with -fPIC /usr/bin/ld: /tmp/ccopEGun.o(.text+0x158): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `stderr@@GLIBC_2.17' /usr/bin/ld: final link failed: bad value collect2: error: ld returned 1 exit status make: *** [Makefile:25: tools/testing/selftests/exec/load_address_4096] Error 1 Link: https://lkml.kernel.org/r/20210514092422.2367367-1-yangyingliang@huawei.com Fixes: 206e22f01941 ("tools/testing/selftests: add self-test for verifying load alignment") Signed-off-by: Yang Yingliang Cc: Chris Kennelly Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/exec/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index cf69b2fcce59..dd61118df66e 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -28,8 +28,8 @@ $(OUTPUT)/execveat.denatured: $(OUTPUT)/execveat cp $< $@ chmod -x $@ $(OUTPUT)/load_address_4096: load_address.c - $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000 -pie $< -o $@ + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000 -pie -static $< -o $@ $(OUTPUT)/load_address_2097152: load_address.c - $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x200000 -pie $< -o $@ + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x200000 -pie -static $< -o $@ $(OUTPUT)/load_address_16777216: load_address.c - $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000000 -pie $< -o $@ + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000000 -pie -static $< -o $@ -- cgit v1.2.3 From f747e6667ebb2ffb8133486c9cd19800d72b0d98 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sat, 22 May 2021 17:42:02 -0700 Subject: linux/bits.h: fix compilation error with GENMASK GENMASK() has an input check which uses __builtin_choose_expr() to enable a compile time sanity check of its inputs if they are known at compile time. However, it turns out that __builtin_constant_p() does not always return a compile time constant [0]. It was thought this problem was fixed with gcc 4.9 [1], but apparently this is not the case [2]. Switch to use __is_constexpr() instead which always returns a compile time constant, regardless of its inputs. Link: https://lore.kernel.org/lkml/42b4342b-aefc-a16a-0d43-9f9c0d63ba7a@rasmusvillemoes.dk [0] Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=19449 [1] Link: https://lore.kernel.org/lkml/1ac7bbc2-45d9-26ed-0b33-bf382b8d858b@I-love.SAKURA.ne.jp [2] Link: https://lkml.kernel.org/r/20210511203716.117010-1-rikard.falkeborn@gmail.com Signed-off-by: Rikard Falkeborn Reported-by: Tetsuo Handa Acked-by: Arnd Bergmann Reviewed-by: Andy Shevchenko Cc: Ard Biesheuvel Cc: Yury Norov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bits.h | 2 +- include/linux/const.h | 8 ++++++++ include/linux/minmax.h | 10 ++-------- tools/include/linux/bits.h | 2 +- tools/include/linux/const.h | 8 ++++++++ 5 files changed, 20 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/include/linux/bits.h b/include/linux/bits.h index 7f475d59a097..87d112650dfb 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -22,7 +22,7 @@ #include #define GENMASK_INPUT_CHECK(h, l) \ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ - __builtin_constant_p((l) > (h)), (l) > (h), 0))) + __is_constexpr((l) > (h)), (l) > (h), 0))) #else /* * BUILD_BUG_ON_ZERO is not available in h files included from asm files, diff --git a/include/linux/const.h b/include/linux/const.h index 81b8aae5a855..435ddd72d2c4 100644 --- a/include/linux/const.h +++ b/include/linux/const.h @@ -3,4 +3,12 @@ #include +/* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. + * Glory to Martin Uecker + */ +#define __is_constexpr(x) \ + (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) + #endif /* _LINUX_CONST_H */ diff --git a/include/linux/minmax.h b/include/linux/minmax.h index c0f57b0c64d9..5433c08fcc68 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -2,6 +2,8 @@ #ifndef _LINUX_MINMAX_H #define _LINUX_MINMAX_H +#include + /* * min()/max()/clamp() macros must accomplish three things: * @@ -17,14 +19,6 @@ #define __typecheck(x, y) \ (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) -/* - * This returns a constant expression while determining if an argument is - * a constant expression, most importantly without evaluating the argument. - * Glory to Martin Uecker - */ -#define __is_constexpr(x) \ - (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) - #define __no_side_effects(x, y) \ (__is_constexpr(x) && __is_constexpr(y)) diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 7f475d59a097..87d112650dfb 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -22,7 +22,7 @@ #include #define GENMASK_INPUT_CHECK(h, l) \ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ - __builtin_constant_p((l) > (h)), (l) > (h), 0))) + __is_constexpr((l) > (h)), (l) > (h), 0))) #else /* * BUILD_BUG_ON_ZERO is not available in h files included from asm files, diff --git a/tools/include/linux/const.h b/tools/include/linux/const.h index 81b8aae5a855..435ddd72d2c4 100644 --- a/tools/include/linux/const.h +++ b/tools/include/linux/const.h @@ -3,4 +3,12 @@ #include +/* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. + * Glory to Martin Uecker + */ +#define __is_constexpr(x) \ + (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) + #endif /* _LINUX_CONST_H */ -- cgit v1.2.3 From 3a62fed2fd7b6fea96d720e779cafc30dfb3a22e Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 22 May 2021 15:14:45 +0200 Subject: net/sched: fq_pie: re-factor fix for fq_pie endless loop the patch that fixed an endless loop in_fq_pie_init() was not considering that 65535 is a valid class id. The correct bugfix for this infinite loop is to change 'idx' to become an u32, like Colin proposed in the past [1]. Fix this as follows: - restore 65536 as maximum possible values of 'flows_cnt' - use u32 'idx' when iterating on 'q->flows' - fix the TDC selftest This reverts commit bb2f930d6dd708469a587dc9ed1efe1ef969c0bf. [1] https://lore.kernel.org/netdev/20210407163808.499027-1-colin.king@canonical.com/ CC: Colin Ian King CC: stable@vger.kernel.org Fixes: bb2f930d6dd7 ("net/sched: fix infinite loop in sch_fq_pie") Fixes: ec97ecf1ebe4 ("net: sched: add Flow Queue PIE packet scheduler") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/sch_fq_pie.c | 10 +++++----- tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'tools') diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 949163fe68af..266c7c1869d9 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -297,9 +297,9 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, goto flow_error; } q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]); - if (!q->flows_cnt || q->flows_cnt >= 65536) { + if (!q->flows_cnt || q->flows_cnt > 65536) { NL_SET_ERR_MSG_MOD(extack, - "Number of flows must range in [1..65535]"); + "Number of flows must range in [1..65536]"); goto flow_error; } } @@ -367,7 +367,7 @@ static void fq_pie_timer(struct timer_list *t) struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; /* to lock qdisc for probability calculations */ - u16 idx; + u32 idx; root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); @@ -388,7 +388,7 @@ static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt, { struct fq_pie_sched_data *q = qdisc_priv(sch); int err; - u16 idx; + u32 idx; pie_params_init(&q->p_params); sch->limit = 10 * 1024; @@ -500,7 +500,7 @@ static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) static void fq_pie_reset(struct Qdisc *sch) { struct fq_pie_sched_data *q = qdisc_priv(sch); - u16 idx; + u32 idx; INIT_LIST_HEAD(&q->new_flows); INIT_LIST_HEAD(&q->old_flows); diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json index 1cda2e11b3ad..773c5027553d 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json @@ -9,11 +9,11 @@ "setup": [ "$IP link add dev $DUMMY type dummy || /bin/true" ], - "cmdUnderTest": "$TC qdisc add dev $DUMMY root fq_pie flows 65536", - "expExitCode": "2", + "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root fq_pie flows 65536", + "expExitCode": "0", "verifyCmd": "$TC qdisc show dev $DUMMY", - "matchPattern": "qdisc", - "matchCount": "0", + "matchPattern": "qdisc fq_pie 1: root refcnt 2 limit 10240p flows 65536", + "matchCount": "1", "teardown": [ "$IP link del dev $DUMMY" ] -- cgit v1.2.3 From a8deba8547e39f26440101164a3bbc2899c5b305 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Tue, 25 May 2021 09:41:39 +0800 Subject: bpftool: Add sock_release help info for cgroup attach/prog load command The help information was not added at the time when the function got added. Fix this and add the missing information to its cli, documentation and bash completion. Fixes: db94cc0b4805 ("bpftool: Add support for BPF_CGROUP_INET_SOCK_RELEASE") Signed-off-by: Liu Jian Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20210525014139.323859-1-liujian56@huawei.com --- tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 4 +++- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 2 +- tools/bpf/bpftool/bash-completion/bpftool | 6 +++--- tools/bpf/bpftool/cgroup.c | 3 ++- tools/bpf/bpftool/prog.c | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) (limited to 'tools') diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 790944c35602..baee8591ac76 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -30,7 +30,8 @@ CGROUP COMMANDS | *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** | | **bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** | | **getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** | -| **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** } +| **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** | +| **sock_release** } | *ATTACH_FLAGS* := { **multi** | **override** } DESCRIPTION @@ -106,6 +107,7 @@ DESCRIPTION **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). + **sock_release** closing an userspace inet socket (since 5.9). **bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* Detach *PROG* from the cgroup *CGROUP* and attach type diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 358c7309d419..fe1b38e7e887 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -44,7 +44,7 @@ PROG COMMANDS | **cgroup/connect4** | **cgroup/connect6** | **cgroup/getpeername4** | **cgroup/getpeername6** | | **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | | **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** | -| **cgroup/getsockopt** | **cgroup/setsockopt** | +| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | | **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** | } | *ATTACH_TYPE* := { diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index d67518bcbd44..cc33c5824a2f 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -478,7 +478,7 @@ _bpftool() cgroup/recvmsg4 cgroup/recvmsg6 \ cgroup/post_bind4 cgroup/post_bind6 \ cgroup/sysctl cgroup/getsockopt \ - cgroup/setsockopt struct_ops \ + cgroup/setsockopt cgroup/sock_release struct_ops \ fentry fexit freplace sk_lookup" -- \ "$cur" ) ) return 0 @@ -1021,7 +1021,7 @@ _bpftool() device bind4 bind6 post_bind4 post_bind6 connect4 connect6 \ getpeername4 getpeername6 getsockname4 getsockname6 \ sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \ - setsockopt' + setsockopt sock_release' local ATTACH_FLAGS='multi override' local PROG_TYPE='id pinned tag name' case $prev in @@ -1032,7 +1032,7 @@ _bpftool() ingress|egress|sock_create|sock_ops|device|bind4|bind6|\ post_bind4|post_bind6|connect4|connect6|getpeername4|\ getpeername6|getsockname4|getsockname6|sendmsg4|sendmsg6|\ - recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt) + recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt|sock_release) COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \ "$cur" ) ) return 0 diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index d901cc1b904a..6e53b1d393f4 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -28,7 +28,8 @@ " connect6 | getpeername4 | getpeername6 |\n" \ " getsockname4 | getsockname6 | sendmsg4 |\n" \ " sendmsg6 | recvmsg4 | recvmsg6 |\n" \ - " sysctl | getsockopt | setsockopt }" + " sysctl | getsockopt | setsockopt |\n" \ + " sock_release }" static unsigned int query_flags; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 3f067d2d7584..da4846c9856a 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -2138,7 +2138,7 @@ static int do_help(int argc, char **argv) " cgroup/getpeername4 | cgroup/getpeername6 |\n" " cgroup/getsockname4 | cgroup/getsockname6 | cgroup/sendmsg4 |\n" " cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n" - " cgroup/getsockopt | cgroup/setsockopt |\n" + " cgroup/getsockopt | cgroup/setsockopt | cgroup/sock_release |\n" " struct_ops | fentry | fexit | freplace | sk_lookup }\n" " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" " flow_dissector }\n" -- cgit v1.2.3 From 6fd5fb63820a9a1146aba0bba2fdbc1db4b903e7 Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Tue, 25 May 2021 10:29:55 +0000 Subject: selftests/bpf: Add test for l3 use of bpf_redirect_peer Add a test case for using bpf_skb_change_head() in combination with bpf_redirect_peer() to redirect a packet from a L3 device to veth and back. The test uses a BPF program that adds L2 headers to the packet coming from a L3 device and then calls bpf_redirect_peer() to redirect the packet to a veth device. The test fails as skb->mac_len is not set properly and thus the ethernet headers are not properly skb_pull'd in cls_bpf_classify(), causing tcp_v4_rcv() to point the TCP header into middle of the IP header. Signed-off-by: Jussi Maki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210525102955.2811090-1-joamaki@gmail.com --- .../testing/selftests/bpf/prog_tests/tc_redirect.c | 552 ++++++++++++++------- tools/testing/selftests/bpf/progs/test_tc_peer.c | 31 ++ 2 files changed, 405 insertions(+), 178 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 95ef9fcd31d8..5703c918812b 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -11,14 +11,17 @@ */ #define _GNU_SOURCE -#include + +#include #include #include +#include +#include #include #include #include #include -#include +#include #include "test_progs.h" #include "network_helpers.h" @@ -32,18 +35,25 @@ #define IP4_SRC "172.16.1.100" #define IP4_DST "172.16.2.100" +#define IP4_TUN_SRC "172.17.1.100" +#define IP4_TUN_FWD "172.17.1.200" #define IP4_PORT 9004 -#define IP6_SRC "::1:dead:beef:cafe" -#define IP6_DST "::2:dead:beef:cafe" +#define IP6_SRC "0::1:dead:beef:cafe" +#define IP6_DST "0::2:dead:beef:cafe" +#define IP6_TUN_SRC "1::1:dead:beef:cafe" +#define IP6_TUN_FWD "1::2:dead:beef:cafe" #define IP6_PORT 9006 #define IP4_SLL "169.254.0.1" #define IP4_DLL "169.254.0.2" #define IP4_NET "169.254.0.0" +#define MAC_DST_FWD "00:11:22:33:44:55" +#define MAC_DST "00:22:33:44:55:66" + #define IFADDR_STR_LEN 18 -#define PING_ARGS "-c 3 -w 10 -q" +#define PING_ARGS "-i 0.2 -c 3 -w 10 -q" #define SRC_PROG_PIN_FILE "/sys/fs/bpf/test_tc_src" #define DST_PROG_PIN_FILE "/sys/fs/bpf/test_tc_dst" @@ -51,120 +61,104 @@ #define TIMEOUT_MILLIS 10000 -#define MAX_PROC_MODS 128 -#define MAX_PROC_VALUE_LEN 16 - #define log_err(MSG, ...) \ fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \ __FILE__, __LINE__, strerror(errno), ##__VA_ARGS__) -struct proc_mod { - char path[PATH_MAX]; - char oldval[MAX_PROC_VALUE_LEN]; - int oldlen; -}; - static const char * const namespaces[] = {NS_SRC, NS_FWD, NS_DST, NULL}; -static int root_netns_fd = -1; -static int num_proc_mods; -static struct proc_mod proc_mods[MAX_PROC_MODS]; -/** - * modify_proc() - Modify entry in /proc - * - * Modifies an entry in /proc and saves the original value for later - * restoration with restore_proc(). - */ -static int modify_proc(const char *path, const char *newval) +static int write_file(const char *path, const char *newval) { - struct proc_mod *mod; FILE *f; - if (num_proc_mods + 1 > MAX_PROC_MODS) - return -1; - f = fopen(path, "r+"); if (!f) return -1; - - mod = &proc_mods[num_proc_mods]; - num_proc_mods++; - - strncpy(mod->path, path, PATH_MAX); - - if (!fread(mod->oldval, 1, MAX_PROC_VALUE_LEN, f)) { - log_err("reading from %s failed", path); - goto fail; - } - rewind(f); if (fwrite(newval, strlen(newval), 1, f) != 1) { log_err("writing to %s failed", path); - goto fail; + fclose(f); + return -1; } - fclose(f); return 0; - -fail: - fclose(f); - num_proc_mods--; - return -1; } -/** - * restore_proc() - Restore all /proc modifications - */ -static void restore_proc(void) +struct nstoken { + int orig_netns_fd; +}; + +static int setns_by_fd(int nsfd) { - int i; + int err; - for (i = 0; i < num_proc_mods; i++) { - struct proc_mod *mod = &proc_mods[i]; - FILE *f; + err = setns(nsfd, CLONE_NEWNET); + close(nsfd); - f = fopen(mod->path, "w"); - if (!f) { - log_err("fopen of %s failed", mod->path); - continue; - } + if (!ASSERT_OK(err, "setns")) + return err; - if (fwrite(mod->oldval, mod->oldlen, 1, f) != 1) - log_err("fwrite to %s failed", mod->path); + /* Switch /sys to the new namespace so that e.g. /sys/class/net + * reflects the devices in the new namespace. + */ + err = unshare(CLONE_NEWNS); + if (!ASSERT_OK(err, "unshare")) + return err; - fclose(f); - } - num_proc_mods = 0; + err = umount2("/sys", MNT_DETACH); + if (!ASSERT_OK(err, "umount2 /sys")) + return err; + + err = mount("sysfs", "/sys", "sysfs", 0, NULL); + if (!ASSERT_OK(err, "mount /sys")) + return err; + + err = mount("bpffs", "/sys/fs/bpf", "bpf", 0, NULL); + if (!ASSERT_OK(err, "mount /sys/fs/bpf")) + return err; + + return 0; } /** - * setns_by_name() - Set networks namespace by name + * open_netns() - Switch to specified network namespace by name. + * + * Returns token with which to restore the original namespace + * using close_netns(). */ -static int setns_by_name(const char *name) +static struct nstoken *open_netns(const char *name) { int nsfd; char nspath[PATH_MAX]; int err; + struct nstoken *token; + + token = malloc(sizeof(struct nstoken)); + if (!ASSERT_OK_PTR(token, "malloc token")) + return NULL; + + token->orig_netns_fd = open("/proc/self/ns/net", O_RDONLY); + if (!ASSERT_GE(token->orig_netns_fd, 0, "open /proc/self/ns/net")) + goto fail; snprintf(nspath, sizeof(nspath), "%s/%s", "/var/run/netns", name); nsfd = open(nspath, O_RDONLY | O_CLOEXEC); - if (nsfd < 0) - return nsfd; + if (!ASSERT_GE(nsfd, 0, "open netns fd")) + goto fail; - err = setns(nsfd, CLONE_NEWNET); - close(nsfd); + err = setns_by_fd(nsfd); + if (!ASSERT_OK(err, "setns_by_fd")) + goto fail; - return err; + return token; +fail: + free(token); + return NULL; } -/** - * setns_root() - Set network namespace to original (root) namespace - * - * Not expected to ever fail, so error not returned, but failure logged - * and test marked as failed. - */ -static void setns_root(void) +static void close_netns(struct nstoken *token) { - ASSERT_OK(setns(root_netns_fd, CLONE_NEWNET), "setns root"); + ASSERT_OK(setns_by_fd(token->orig_netns_fd), "setns_by_fd"); + free(token); } static int netns_setup_namespaces(const char *verb) @@ -237,15 +231,17 @@ static int get_ifindex(const char *name) static int netns_setup_links_and_routes(struct netns_setup_result *result) { + struct nstoken *nstoken = NULL; char veth_src_fwd_addr[IFADDR_STR_LEN+1] = {}; - char veth_dst_fwd_addr[IFADDR_STR_LEN+1] = {}; SYS("ip link add veth_src type veth peer name veth_src_fwd"); SYS("ip link add veth_dst type veth peer name veth_dst_fwd"); + + SYS("ip link set veth_dst_fwd address " MAC_DST_FWD); + SYS("ip link set veth_dst address " MAC_DST); + if (get_ifaddr("veth_src_fwd", veth_src_fwd_addr)) goto fail; - if (get_ifaddr("veth_dst_fwd", veth_dst_fwd_addr)) - goto fail; result->ifindex_veth_src_fwd = get_ifindex("veth_src_fwd"); if (result->ifindex_veth_src_fwd < 0) @@ -260,7 +256,8 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result) SYS("ip link set veth_dst netns " NS_DST); /** setup in 'src' namespace */ - if (!ASSERT_OK(setns_by_name(NS_SRC), "setns src")) + nstoken = open_netns(NS_SRC); + if (!ASSERT_OK_PTR(nstoken, "setns src")) goto fail; SYS("ip addr add " IP4_SRC "/32 dev veth_src"); @@ -276,8 +273,11 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result) SYS("ip neigh add " IP6_DST " dev veth_src lladdr %s", veth_src_fwd_addr); + close_netns(nstoken); + /** setup in 'fwd' namespace */ - if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd")) + nstoken = open_netns(NS_FWD); + if (!ASSERT_OK_PTR(nstoken, "setns fwd")) goto fail; /* The fwd netns automatically gets a v6 LL address / routes, but also @@ -294,8 +294,11 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result) SYS("ip route add " IP4_DST "/32 dev veth_dst_fwd scope global"); SYS("ip route add " IP6_DST "/128 dev veth_dst_fwd scope global"); + close_netns(nstoken); + /** setup in 'dst' namespace */ - if (!ASSERT_OK(setns_by_name(NS_DST), "setns dst")) + nstoken = open_netns(NS_DST); + if (!ASSERT_OK_PTR(nstoken, "setns dst")) goto fail; SYS("ip addr add " IP4_DST "/32 dev veth_dst"); @@ -306,23 +309,20 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result) SYS("ip route add " IP4_NET "/16 dev veth_dst scope global"); SYS("ip route add " IP6_SRC "/128 dev veth_dst scope global"); - SYS("ip neigh add " IP4_SRC " dev veth_dst lladdr %s", - veth_dst_fwd_addr); - SYS("ip neigh add " IP6_SRC " dev veth_dst lladdr %s", - veth_dst_fwd_addr); + SYS("ip neigh add " IP4_SRC " dev veth_dst lladdr " MAC_DST_FWD); + SYS("ip neigh add " IP6_SRC " dev veth_dst lladdr " MAC_DST_FWD); + + close_netns(nstoken); - setns_root(); return 0; fail: - setns_root(); + if (nstoken) + close_netns(nstoken); return -1; } static int netns_load_bpf(void) { - if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd")) - return -1; - SYS("tc qdisc add dev veth_src_fwd clsact"); SYS("tc filter add dev veth_src_fwd ingress bpf da object-pinned " SRC_PROG_PIN_FILE); @@ -335,42 +335,29 @@ static int netns_load_bpf(void) SYS("tc filter add dev veth_dst_fwd egress bpf da object-pinned " CHK_PROG_PIN_FILE); - setns_root(); - return -1; -fail: - setns_root(); - return -1; -} - -static int netns_unload_bpf(void) -{ - if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd")) - goto fail; - SYS("tc qdisc delete dev veth_src_fwd clsact"); - SYS("tc qdisc delete dev veth_dst_fwd clsact"); - - setns_root(); return 0; fail: - setns_root(); return -1; } - static void test_tcp(int family, const char *addr, __u16 port) { int listen_fd = -1, accept_fd = -1, client_fd = -1; char buf[] = "testing testing"; int n; + struct nstoken *nstoken; - if (!ASSERT_OK(setns_by_name(NS_DST), "setns dst")) + nstoken = open_netns(NS_DST); + if (!ASSERT_OK_PTR(nstoken, "setns dst")) return; listen_fd = start_server(family, SOCK_STREAM, addr, port, 0); if (!ASSERT_GE(listen_fd, 0, "listen")) goto done; - if (!ASSERT_OK(setns_by_name(NS_SRC), "setns src")) + close_netns(nstoken); + nstoken = open_netns(NS_SRC); + if (!ASSERT_OK_PTR(nstoken, "setns src")) goto done; client_fd = connect_to_fd(listen_fd, TIMEOUT_MILLIS); @@ -392,7 +379,8 @@ static void test_tcp(int family, const char *addr, __u16 port) ASSERT_EQ(n, sizeof(buf), "recv from server"); done: - setns_root(); + if (nstoken) + close_netns(nstoken); if (listen_fd >= 0) close(listen_fd); if (accept_fd >= 0) @@ -405,7 +393,7 @@ static int test_ping(int family, const char *addr) { const char *ping = family == AF_INET6 ? "ping6" : "ping"; - SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s", ping, addr); + SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s > /dev/null", ping, addr); return 0; fail: return -1; @@ -419,19 +407,37 @@ static void test_connectivity(void) test_ping(AF_INET6, IP6_DST); } +static int set_forwarding(bool enable) +{ + int err; + + err = write_file("/proc/sys/net/ipv4/ip_forward", enable ? "1" : "0"); + if (!ASSERT_OK(err, "set ipv4.ip_forward=0")) + return err; + + err = write_file("/proc/sys/net/ipv6/conf/all/forwarding", enable ? "1" : "0"); + if (!ASSERT_OK(err, "set ipv6.forwarding=0")) + return err; + + return 0; +} + static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result) { - struct test_tc_neigh_fib *skel; + struct nstoken *nstoken = NULL; + struct test_tc_neigh_fib *skel = NULL; int err; + nstoken = open_netns(NS_FWD); + if (!ASSERT_OK_PTR(nstoken, "setns fwd")) + return; + skel = test_tc_neigh_fib__open(); if (!ASSERT_OK_PTR(skel, "test_tc_neigh_fib__open")) - return; + goto done; - if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load")) { - test_tc_neigh_fib__destroy(skel); - return; - } + if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load")) + goto done; err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE); if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE)) @@ -449,46 +455,37 @@ static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result) goto done; /* bpf_fib_lookup() checks if forwarding is enabled */ - if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd")) + if (!ASSERT_OK(set_forwarding(true), "enable forwarding")) goto done; - err = modify_proc("/proc/sys/net/ipv4/ip_forward", "1"); - if (!ASSERT_OK(err, "set ipv4.ip_forward")) - goto done; - - err = modify_proc("/proc/sys/net/ipv6/conf/all/forwarding", "1"); - if (!ASSERT_OK(err, "set ipv6.forwarding")) - goto done; - setns_root(); - test_connectivity(); + done: - bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE); - bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); - bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE); - test_tc_neigh_fib__destroy(skel); - netns_unload_bpf(); - setns_root(); - restore_proc(); + if (skel) + test_tc_neigh_fib__destroy(skel); + close_netns(nstoken); } static void test_tc_redirect_neigh(struct netns_setup_result *setup_result) { - struct test_tc_neigh *skel; + struct nstoken *nstoken = NULL; + struct test_tc_neigh *skel = NULL; int err; + nstoken = open_netns(NS_FWD); + if (!ASSERT_OK_PTR(nstoken, "setns fwd")) + return; + skel = test_tc_neigh__open(); if (!ASSERT_OK_PTR(skel, "test_tc_neigh__open")) - return; + goto done; skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; err = test_tc_neigh__load(skel); - if (!ASSERT_OK(err, "test_tc_neigh__load")) { - test_tc_neigh__destroy(skel); - return; - } + if (!ASSERT_OK(err, "test_tc_neigh__load")) + goto done; err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE); if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE)) @@ -505,34 +502,37 @@ static void test_tc_redirect_neigh(struct netns_setup_result *setup_result) if (netns_load_bpf()) goto done; + if (!ASSERT_OK(set_forwarding(false), "disable forwarding")) + goto done; + test_connectivity(); done: - bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE); - bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); - bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE); - test_tc_neigh__destroy(skel); - netns_unload_bpf(); - setns_root(); + if (skel) + test_tc_neigh__destroy(skel); + close_netns(nstoken); } static void test_tc_redirect_peer(struct netns_setup_result *setup_result) { + struct nstoken *nstoken; struct test_tc_peer *skel; int err; + nstoken = open_netns(NS_FWD); + if (!ASSERT_OK_PTR(nstoken, "setns fwd")) + return; + skel = test_tc_peer__open(); if (!ASSERT_OK_PTR(skel, "test_tc_peer__open")) - return; + goto done; skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; err = test_tc_peer__load(skel); - if (!ASSERT_OK(err, "test_tc_peer__load")) { - test_tc_peer__destroy(skel); - return; - } + if (!ASSERT_OK(err, "test_tc_peer__load")) + goto done; err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE); if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE)) @@ -549,41 +549,237 @@ static void test_tc_redirect_peer(struct netns_setup_result *setup_result) if (netns_load_bpf()) goto done; + if (!ASSERT_OK(set_forwarding(false), "disable forwarding")) + goto done; + test_connectivity(); done: - bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE); - bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); - bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE); - test_tc_peer__destroy(skel); - netns_unload_bpf(); - setns_root(); + if (skel) + test_tc_peer__destroy(skel); + close_netns(nstoken); } -void test_tc_redirect(void) +static int tun_open(char *name) +{ + struct ifreq ifr; + int fd, err; + + fd = open("/dev/net/tun", O_RDWR); + if (!ASSERT_GE(fd, 0, "open /dev/net/tun")) + return -1; + + memset(&ifr, 0, sizeof(ifr)); + + ifr.ifr_flags = IFF_TUN | IFF_NO_PI; + if (*name) + strncpy(ifr.ifr_name, name, IFNAMSIZ); + + err = ioctl(fd, TUNSETIFF, &ifr); + if (!ASSERT_OK(err, "ioctl TUNSETIFF")) + goto fail; + + SYS("ip link set dev %s up", name); + + return fd; +fail: + close(fd); + return -1; +} + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +enum { + SRC_TO_TARGET = 0, + TARGET_TO_SRC = 1, +}; + +static int tun_relay_loop(int src_fd, int target_fd) { - struct netns_setup_result setup_result; + fd_set rfds, wfds; - root_netns_fd = open("/proc/self/ns/net", O_RDONLY); - if (!ASSERT_GE(root_netns_fd, 0, "open /proc/self/ns/net")) + FD_ZERO(&rfds); + FD_ZERO(&wfds); + + for (;;) { + char buf[1500]; + int direction, nread, nwrite; + + FD_SET(src_fd, &rfds); + FD_SET(target_fd, &rfds); + + if (select(1 + MAX(src_fd, target_fd), &rfds, NULL, NULL, NULL) < 0) { + log_err("select failed"); + return 1; + } + + direction = FD_ISSET(src_fd, &rfds) ? SRC_TO_TARGET : TARGET_TO_SRC; + + nread = read(direction == SRC_TO_TARGET ? src_fd : target_fd, buf, sizeof(buf)); + if (nread < 0) { + log_err("read failed"); + return 1; + } + + nwrite = write(direction == SRC_TO_TARGET ? target_fd : src_fd, buf, nread); + if (nwrite != nread) { + log_err("write failed"); + return 1; + } + } +} + +static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) +{ + struct test_tc_peer *skel = NULL; + struct nstoken *nstoken = NULL; + int err; + int tunnel_pid = -1; + int src_fd, target_fd; + int ifindex; + + /* Start a L3 TUN/TAP tunnel between the src and dst namespaces. + * This test is using TUN/TAP instead of e.g. IPIP or GRE tunnel as those + * expose the L2 headers encapsulating the IP packet to BPF and hence + * don't have skb in suitable state for this test. Alternative to TUN/TAP + * would be e.g. Wireguard which would appear as a pure L3 device to BPF, + * but that requires much more complicated setup. + */ + nstoken = open_netns(NS_SRC); + if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC)) return; - if (netns_setup_namespaces("add")) - goto done; + src_fd = tun_open("tun_src"); + if (!ASSERT_GE(src_fd, 0, "tun_open tun_src")) + goto fail; - if (netns_setup_links_and_routes(&setup_result)) - goto done; + close_netns(nstoken); + + nstoken = open_netns(NS_FWD); + if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD)) + goto fail; - if (test__start_subtest("tc_redirect_peer")) - test_tc_redirect_peer(&setup_result); + target_fd = tun_open("tun_fwd"); + if (!ASSERT_GE(target_fd, 0, "tun_open tun_fwd")) + goto fail; - if (test__start_subtest("tc_redirect_neigh")) - test_tc_redirect_neigh(&setup_result); + tunnel_pid = fork(); + if (!ASSERT_GE(tunnel_pid, 0, "fork tun_relay_loop")) + goto fail; - if (test__start_subtest("tc_redirect_neigh_fib")) - test_tc_redirect_neigh_fib(&setup_result); + if (tunnel_pid == 0) + exit(tun_relay_loop(src_fd, target_fd)); -done: - close(root_netns_fd); - netns_setup_namespaces("delete"); + skel = test_tc_peer__open(); + if (!ASSERT_OK_PTR(skel, "test_tc_peer__open")) + goto fail; + + ifindex = get_ifindex("tun_fwd"); + if (!ASSERT_GE(ifindex, 0, "get_ifindex tun_fwd")) + goto fail; + + skel->rodata->IFINDEX_SRC = ifindex; + skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; + + err = test_tc_peer__load(skel); + if (!ASSERT_OK(err, "test_tc_peer__load")) + goto fail; + + err = bpf_program__pin(skel->progs.tc_src_l3, SRC_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE)) + goto fail; + + err = bpf_program__pin(skel->progs.tc_dst_l3, DST_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE)) + goto fail; + + err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE); + if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE)) + goto fail; + + /* Load "tc_src_l3" to the tun_fwd interface to redirect packets + * towards dst, and "tc_dst" to redirect packets + * and "tc_chk" on veth_dst_fwd to drop non-redirected packets. + */ + SYS("tc qdisc add dev tun_fwd clsact"); + SYS("tc filter add dev tun_fwd ingress bpf da object-pinned " + SRC_PROG_PIN_FILE); + + SYS("tc qdisc add dev veth_dst_fwd clsact"); + SYS("tc filter add dev veth_dst_fwd ingress bpf da object-pinned " + DST_PROG_PIN_FILE); + SYS("tc filter add dev veth_dst_fwd egress bpf da object-pinned " + CHK_PROG_PIN_FILE); + + /* Setup route and neigh tables */ + SYS("ip -netns " NS_SRC " addr add dev tun_src " IP4_TUN_SRC "/24"); + SYS("ip -netns " NS_FWD " addr add dev tun_fwd " IP4_TUN_FWD "/24"); + + SYS("ip -netns " NS_SRC " addr add dev tun_src " IP6_TUN_SRC "/64 nodad"); + SYS("ip -netns " NS_FWD " addr add dev tun_fwd " IP6_TUN_FWD "/64 nodad"); + + SYS("ip -netns " NS_SRC " route del " IP4_DST "/32 dev veth_src scope global"); + SYS("ip -netns " NS_SRC " route add " IP4_DST "/32 via " IP4_TUN_FWD + " dev tun_src scope global"); + SYS("ip -netns " NS_DST " route add " IP4_TUN_SRC "/32 dev veth_dst scope global"); + SYS("ip -netns " NS_SRC " route del " IP6_DST "/128 dev veth_src scope global"); + SYS("ip -netns " NS_SRC " route add " IP6_DST "/128 via " IP6_TUN_FWD + " dev tun_src scope global"); + SYS("ip -netns " NS_DST " route add " IP6_TUN_SRC "/128 dev veth_dst scope global"); + + SYS("ip -netns " NS_DST " neigh add " IP4_TUN_SRC " dev veth_dst lladdr " MAC_DST_FWD); + SYS("ip -netns " NS_DST " neigh add " IP6_TUN_SRC " dev veth_dst lladdr " MAC_DST_FWD); + + if (!ASSERT_OK(set_forwarding(false), "disable forwarding")) + goto fail; + + test_connectivity(); + +fail: + if (tunnel_pid > 0) { + kill(tunnel_pid, SIGTERM); + waitpid(tunnel_pid, NULL, 0); + } + if (src_fd >= 0) + close(src_fd); + if (target_fd >= 0) + close(target_fd); + if (skel) + test_tc_peer__destroy(skel); + if (nstoken) + close_netns(nstoken); +} + +#define RUN_TEST(name) \ + ({ \ + struct netns_setup_result setup_result; \ + if (test__start_subtest(#name)) \ + if (ASSERT_OK(netns_setup_namespaces("add"), "setup namespaces")) { \ + if (ASSERT_OK(netns_setup_links_and_routes(&setup_result), \ + "setup links and routes")) \ + test_ ## name(&setup_result); \ + netns_setup_namespaces("delete"); \ + } \ + }) + +static void *test_tc_redirect_run_tests(void *arg) +{ + RUN_TEST(tc_redirect_peer); + RUN_TEST(tc_redirect_peer_l3); + RUN_TEST(tc_redirect_neigh); + RUN_TEST(tc_redirect_neigh_fib); + return NULL; +} + +void test_tc_redirect(void) +{ + pthread_t test_thread; + int err; + + /* Run the tests in their own thread to isolate the namespace changes + * so they do not affect the environment of other tests. + * (specifically needed because of unshare(CLONE_NEWNS) in open_netns()) + */ + err = pthread_create(&test_thread, NULL, &test_tc_redirect_run_tests, NULL); + if (ASSERT_OK(err, "pthread_create")) + ASSERT_OK(pthread_join(test_thread, NULL), "pthread_join"); } diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c index ef264bced0e6..fe818cd5f010 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_peer.c +++ b/tools/testing/selftests/bpf/progs/test_tc_peer.c @@ -5,12 +5,17 @@ #include #include #include +#include +#include #include volatile const __u32 IFINDEX_SRC; volatile const __u32 IFINDEX_DST; +static const __u8 src_mac[] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55}; +static const __u8 dst_mac[] = {0x00, 0x22, 0x33, 0x44, 0x55, 0x66}; + SEC("classifier/chk_egress") int tc_chk(struct __sk_buff *skb) { @@ -29,4 +34,30 @@ int tc_src(struct __sk_buff *skb) return bpf_redirect_peer(IFINDEX_DST, 0); } +SEC("classifier/dst_ingress_l3") +int tc_dst_l3(struct __sk_buff *skb) +{ + return bpf_redirect(IFINDEX_SRC, 0); +} + +SEC("classifier/src_ingress_l3") +int tc_src_l3(struct __sk_buff *skb) +{ + __u16 proto = skb->protocol; + + if (bpf_skb_change_head(skb, ETH_HLEN, 0) != 0) + return TC_ACT_SHOT; + + if (bpf_skb_store_bytes(skb, 0, &src_mac, ETH_ALEN, 0) != 0) + return TC_ACT_SHOT; + + if (bpf_skb_store_bytes(skb, ETH_ALEN, &dst_mac, ETH_ALEN, 0) != 0) + return TC_ACT_SHOT; + + if (bpf_skb_store_bytes(skb, ETH_ALEN + ETH_ALEN, &proto, sizeof(__u16), 0) != 0) + return TC_ACT_SHOT; + + return bpf_redirect_peer(IFINDEX_DST, 0); +} + char __license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 1bad6fd52be4ce12d207e2820ceb0f29ab31fc53 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 4 May 2021 08:58:25 +0000 Subject: bpf, selftests: Adjust few selftest result_unpriv outcomes Given we don't need to simulate the speculative domain for registers with immediates anymore since the verifier uses direct imm-based rewrites instead of having to mask, we can also lift a few cases that were previously rejected. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/stack_ptr.c | 2 -- tools/testing/selftests/bpf/verifier/value_ptr_arith.c | 8 -------- 2 files changed, 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/verifier/stack_ptr.c b/tools/testing/selftests/bpf/verifier/stack_ptr.c index 07eaa04412ae..8ab94d65f3d5 100644 --- a/tools/testing/selftests/bpf/verifier/stack_ptr.c +++ b/tools/testing/selftests/bpf/verifier/stack_ptr.c @@ -295,8 +295,6 @@ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), BPF_EXIT_INSN(), }, - .result_unpriv = REJECT, - .errstr_unpriv = "invalid write to stack R1 off=0 size=1", .result = ACCEPT, .retval = 42, }, diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index e5913fd3b903..7ae2859d495c 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -300,8 +300,6 @@ }, .fixup_map_array_48b = { 3 }, .result = ACCEPT, - .result_unpriv = REJECT, - .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .retval = 1, }, { @@ -371,8 +369,6 @@ }, .fixup_map_array_48b = { 3 }, .result = ACCEPT, - .result_unpriv = REJECT, - .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .retval = 1, }, { @@ -472,8 +468,6 @@ }, .fixup_map_array_48b = { 3 }, .result = ACCEPT, - .result_unpriv = REJECT, - .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .retval = 1, }, { @@ -766,8 +760,6 @@ }, .fixup_map_array_48b = { 3 }, .result = ACCEPT, - .result_unpriv = REJECT, - .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .retval = 1, }, { -- cgit v1.2.3 From 75ea44e356b5de8c817f821c9dd68ae329e82add Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 25 May 2021 18:07:58 +0200 Subject: perf jevents: Fix getting maximum number of fds On some hosts, rlim.rlim_max can be returned as RLIM_INFINITY. By casting it to int, it is interpreted as -1, which will cause get_maxfds to return 0, causing "Invalid argument" errors in nftw() calls. Fix this by casting the second argument of min() to rlim_t instead. Fixes: 80eeb67fe577 ("perf jevents: Program to convert JSON file") Signed-off-by: Felix Fietkau Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sukadev Bhattiprolu Link: http://lore.kernel.org/lkml/20210525160758.97829-1-nbd@nbd.name Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/jevents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index 7422b0ea8790..9604446f8360 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -960,7 +960,7 @@ static int get_maxfds(void) struct rlimit rlim; if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) - return min((int)rlim.rlim_max / 2, 512); + return min(rlim.rlim_max / 2, (rlim_t)512); return 512; } -- cgit v1.2.3 From 39fe2fc96694164723846fccf6caa42c3aee6ec4 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 12 May 2021 12:31:06 +0800 Subject: selftests: kvm: make allocation of extra memory take effect The extra memory pages is missed to be allocated during VM creating. perf_test_util and kvm_page_table_test use it to alloc extra memory currently. Fix it by adding extra_mem_pages to the total memory calculation before allocate. Signed-off-by: Zhenzhong Duan Message-Id: <20210512043107.30076-1-zhenzhong.duan@intel.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index fc83f6c5902d..159f4d62241d 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -295,7 +295,7 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, */ uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; - uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; + uint64_t pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages; struct kvm_vm *vm; int i; -- cgit v1.2.3 From a13534d6676d2f2a9aa286e27e482b4896ff90e3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 May 2021 14:27:38 +0200 Subject: selftests: kvm: fix potential issue with ELF loading vm_vaddr_alloc() sets up GVA to GPA mapping page by page; therefore, GPAs may not be continuous if same memslot is used for data and page table allocation. kvm_vm_elf_load() however expects a continuous range of HVAs (and thus GPAs) because it does not try to read file data page by page. Fix this mismatch by allocating memory in one step. Reported-by: Zhenzhong Duan Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 159f4d62241d..12d953d8ee35 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1099,6 +1099,9 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); virt_pgd_alloc(vm, pgd_memslot); + vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages, + KVM_UTIL_MIN_PFN * vm->page_size, + data_memslot); /* * Find an unused range of virtual page addresses of at least @@ -1108,11 +1111,7 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, /* Map the virtual pages. */ for (vm_vaddr_t vaddr = vaddr_start; pages > 0; - pages--, vaddr += vm->page_size) { - vm_paddr_t paddr; - - paddr = vm_phy_page_alloc(vm, - KVM_UTIL_MIN_PFN * vm->page_size, data_memslot); + pages--, vaddr += vm->page_size, paddr += vm->page_size) { virt_pg_map(vm, vaddr, paddr, pgd_memslot); -- cgit v1.2.3 From 22721a56109940f15b673d0f01907b7a7202275e Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 13 Apr 2021 16:08:27 +0200 Subject: KVM: selftests: Keep track of memslots more efficiently The KVM selftest framework was using a simple list for keeping track of the memslots currently in use. This resulted in lookups and adding a single memslot being O(n), the later due to linear scanning of the existing memslot set to check for the presence of any conflicting entries. Before this change, benchmarking high count of memslots was more or less impossible as pretty much all the benchmark time was spent in the selftest framework code. We can simply use a rbtree for keeping track of both of gfn and hva. We don't need an interval tree for hva here as we can't have overlapping memslots because we allocate a completely new memory chunk for each new memslot. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Andrew Jones Message-Id: Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 141 ++++++++++++++++----- .../testing/selftests/kvm/lib/kvm_util_internal.h | 15 ++- tools/testing/selftests/kvm/lib/rbtree.c | 1 + 4 files changed, 124 insertions(+), 35 deletions(-) create mode 100644 tools/testing/selftests/kvm/lib/rbtree.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index e439d027939d..a8c30f888d40 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -33,7 +33,7 @@ ifeq ($(ARCH),s390) UNAME_M := s390x endif -LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c +LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 12d953d8ee35..1255744758e3 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -203,7 +203,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) TEST_ASSERT(vm != NULL, "Insufficient Memory"); INIT_LIST_HEAD(&vm->vcpus); - INIT_LIST_HEAD(&vm->userspace_mem_regions); + vm->regions.gpa_tree = RB_ROOT; + vm->regions.hva_tree = RB_ROOT; + hash_init(vm->regions.slot_hash); vm->mode = mode; vm->type = 0; @@ -355,13 +357,14 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, */ void kvm_vm_restart(struct kvm_vm *vmp, int perm) { + int ctr; struct userspace_mem_region *region; vm_open(vmp, perm); if (vmp->has_irqchip) vm_create_irqchip(vmp); - list_for_each_entry(region, &vmp->userspace_mem_regions, list) { + hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) { int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i\n" @@ -424,14 +427,21 @@ uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) static struct userspace_mem_region * userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) { - struct userspace_mem_region *region; + struct rb_node *node; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + for (node = vm->regions.gpa_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, gpa_node); uint64_t existing_start = region->region.guest_phys_addr; uint64_t existing_end = region->region.guest_phys_addr + region->region.memory_size - 1; if (start <= existing_end && end >= existing_start) return region; + + if (start < existing_start) + node = node->rb_left; + else + node = node->rb_right; } return NULL; @@ -546,11 +556,16 @@ void kvm_vm_release(struct kvm_vm *vmp) } static void __vm_mem_region_delete(struct kvm_vm *vm, - struct userspace_mem_region *region) + struct userspace_mem_region *region, + bool unlink) { int ret; - list_del(®ion->list); + if (unlink) { + rb_erase(®ion->gpa_node, &vm->regions.gpa_tree); + rb_erase(®ion->hva_node, &vm->regions.hva_tree); + hash_del(®ion->slot_node); + } region->region.memory_size = 0; ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); @@ -569,14 +584,16 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, */ void kvm_vm_free(struct kvm_vm *vmp) { - struct userspace_mem_region *region, *tmp; + int ctr; + struct hlist_node *node; + struct userspace_mem_region *region; if (vmp == NULL) return; /* Free userspace_mem_regions. */ - list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list) - __vm_mem_region_delete(vmp, region); + hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) + __vm_mem_region_delete(vmp, region, false); /* Free sparsebit arrays. */ sparsebit_free(&vmp->vpages_valid); @@ -658,6 +675,57 @@ int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) return 0; } +static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), gpa_node); + parent = *cur; + if (region->region.guest_phys_addr < + cregion->region.guest_phys_addr) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->region.guest_phys_addr != + cregion->region.guest_phys_addr, + "Duplicate GPA in region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(®ion->gpa_node, parent, cur); + rb_insert_color(®ion->gpa_node, gpa_tree); +} + +static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), hva_node); + parent = *cur; + if (region->host_mem < cregion->host_mem) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->host_mem != + cregion->host_mem, + "Duplicate HVA in region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(®ion->hva_node, parent, cur); + rb_insert_color(®ion->hva_node, hva_tree); +} + /* * VM Userspace Memory Region Add * @@ -722,7 +790,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, (uint64_t) region->region.memory_size); /* Confirm no region with the requested slot already exists. */ - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + slot) { if (region->region.slot != slot) continue; @@ -793,8 +862,10 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, ret, errno, slot, flags, guest_paddr, (uint64_t) region->region.memory_size); - /* Add to linked-list of memory regions. */ - list_add(®ion->list, &vm->userspace_mem_regions); + /* Add to quick lookup data structures */ + vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); + vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region); + hash_add(vm->regions.slot_hash, ®ion->slot_node, slot); } /* @@ -817,10 +888,10 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot) { struct userspace_mem_region *region; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + memslot) if (region->region.slot == memslot) return region; - } fprintf(stderr, "No mem region with the requested slot found,\n" " requested slot: %u\n", memslot); @@ -905,7 +976,7 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) */ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) { - __vm_mem_region_delete(vm, memslot2region(vm, slot)); + __vm_mem_region_delete(vm, memslot2region(vm, slot), true); } /* @@ -1176,16 +1247,14 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) { struct userspace_mem_region *region; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { - if ((gpa >= region->region.guest_phys_addr) - && (gpa <= (region->region.guest_phys_addr - + region->region.memory_size - 1))) - return (void *) ((uintptr_t) region->host_mem - + (gpa - region->region.guest_phys_addr)); + region = userspace_mem_region_find(vm, gpa, gpa); + if (!region) { + TEST_FAIL("No vm physical memory at 0x%lx", gpa); + return NULL; } - TEST_FAIL("No vm physical memory at 0x%lx", gpa); - return NULL; + return (void *)((uintptr_t)region->host_mem + + (gpa - region->region.guest_phys_addr)); } /* @@ -1207,15 +1276,22 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) */ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) { - struct userspace_mem_region *region; + struct rb_node *node; + + for (node = vm->regions.hva_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, hva_node); + + if (hva >= region->host_mem) { + if (hva <= (region->host_mem + + region->region.memory_size - 1)) + return (vm_paddr_t)((uintptr_t) + region->region.guest_phys_addr + + (hva - (uintptr_t)region->host_mem)); - list_for_each_entry(region, &vm->userspace_mem_regions, list) { - if ((hva >= region->host_mem) - && (hva <= (region->host_mem - + region->region.memory_size - 1))) - return (vm_paddr_t) ((uintptr_t) - region->region.guest_phys_addr - + (hva - (uintptr_t) region->host_mem)); + node = node->rb_right; + } else + node = node->rb_left; } TEST_FAIL("No mapping to a guest physical address, hva: %p", hva); @@ -1821,6 +1897,7 @@ int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, */ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { + int ctr; struct userspace_mem_region *region; struct vcpu *vcpu; @@ -1828,7 +1905,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); fprintf(stream, "%*sMem Regions:\n", indent, ""); - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) { fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " "host_virt: %p\n", indent + 2, "", (uint64_t) region->region.guest_phys_addr, diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 91ce1b5d480b..b30e8c7b119b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -8,6 +8,9 @@ #ifndef SELFTEST_KVM_UTIL_INTERNAL_H #define SELFTEST_KVM_UTIL_INTERNAL_H +#include "linux/hashtable.h" +#include "linux/rbtree.h" + #include "sparsebit.h" struct userspace_mem_region { @@ -18,7 +21,9 @@ struct userspace_mem_region { void *host_mem; void *mmap_start; size_t mmap_size; - struct list_head list; + struct rb_node gpa_node; + struct rb_node hva_node; + struct hlist_node slot_node; }; struct vcpu { @@ -31,6 +36,12 @@ struct vcpu { uint32_t dirty_gfns_count; }; +struct userspace_mem_regions { + struct rb_root gpa_tree; + struct rb_root hva_tree; + DECLARE_HASHTABLE(slot_hash, 9); +}; + struct kvm_vm { int mode; unsigned long type; @@ -43,7 +54,7 @@ struct kvm_vm { unsigned int va_bits; uint64_t max_gfn; struct list_head vcpus; - struct list_head userspace_mem_regions; + struct userspace_mem_regions regions; struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; bool has_irqchip; diff --git a/tools/testing/selftests/kvm/lib/rbtree.c b/tools/testing/selftests/kvm/lib/rbtree.c new file mode 100644 index 000000000000..a703f0194ea3 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/rbtree.c @@ -0,0 +1 @@ +#include "../../../../lib/rbtree.c" -- cgit v1.2.3 From cad347fab142bcb9bebc125b5ba0c1e52ce74fdc Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 13 Apr 2021 16:08:28 +0200 Subject: KVM: selftests: add a memslot-related performance benchmark This benchmark contains the following tests: * Map test, where the host unmaps guest memory while the guest writes to it (maps it). The test is designed in a way to make the unmap operation on the host take a negligible amount of time in comparison with the mapping operation in the guest. The test area is actually split in two: the first half is being mapped by the guest while the second half in being unmapped by the host. Then a guest <-> host sync happens and the areas are reversed. * Unmap test which is broadly similar to the above map test, but it is designed in an opposite way: to make the mapping operation in the guest take a negligible amount of time in comparison with the unmap operation on the host. This test is available in two variants: with per-page unmap operation or a chunked one (using 2 MiB chunk size). * Move active area test which involves moving the last (highest gfn) memslot a bit back and forth on the host while the guest is concurrently writing around the area being moved (including over the moved memslot). * Move inactive area test which is similar to the previous move active area test, but now guest writes all happen outside of the area being moved. * Read / write test in which the guest writes to the beginning of each page of the test area while the host writes to the middle of each such page. Then each side checks the values the other side has written. This particular test is not expected to give different results depending on particular memslots implementation, it is meant as a rough sanity check and to provide insight on the spread of test results expected. Each test performs its operation in a loop until a test period ends (this is 5 seconds by default, but it is configurable). Then the total count of loops done is divided by the actual elapsed time to give the test result. The tests have a configurable memslot cap with the "-s" test option, by default the system maximum is used. Each test is repeated a particular number of times (by default 20 times), the best result achieved is printed. The test memory area is divided equally between memslots, the reminder is added to the last memslot. The test area size does not depend on the number of memslots in use. The tests also measure the time that it took to add all these memslots. The best result from the tests that use the whole test area is printed after all the requested tests are done. In general, these tests are designed to use as much memory as possible (within reason) while still doing 100+ loops even on high memslot counts with the default test length. Increasing the test runtime makes it increasingly more likely that some event will happen on the system during the test run, which might lower the test result. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Andrew Jones Message-Id: <8d31bb3d92bc8fa33a9756fa802ee14266ab994e.1618253574.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/memslot_perf_test.c | 1037 +++++++++++++++++++++++ 3 files changed, 1039 insertions(+) create mode 100644 tools/testing/selftests/kvm/memslot_perf_test.c (limited to 'tools') diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index bd83158e0e0b..524c857a049c 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -41,5 +41,6 @@ /kvm_create_max_vcpus /kvm_page_table_test /memslot_modification_stress_test +/memslot_perf_test /set_memory_region_test /steal_time diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a8c30f888d40..daaee1888b12 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -74,6 +74,7 @@ TEST_GEN_PROGS_x86_64 += hardware_disable_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += kvm_page_table_test TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test +TEST_GEN_PROGS_x86_64 += memslot_perf_test TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c new file mode 100644 index 000000000000..4ae0e5ec0f74 --- /dev/null +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -0,0 +1,1037 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A memslot-related performance benchmark. + * + * Copyright (C) 2021 Oracle and/or its affiliates. + * + * Basic guest setup / host vCPU thread code lifted from set_memory_region_test. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#define VCPU_ID 0 + +#define MEM_SIZE ((512U << 20) + 4096) +#define MEM_SIZE_PAGES (MEM_SIZE / 4096) +#define MEM_GPA 0x10000000UL +#define MEM_AUX_GPA MEM_GPA +#define MEM_SYNC_GPA MEM_AUX_GPA +#define MEM_TEST_GPA (MEM_AUX_GPA + 4096) +#define MEM_TEST_SIZE (MEM_SIZE - 4096) +static_assert(MEM_SIZE % 4096 == 0, "invalid mem size"); +static_assert(MEM_TEST_SIZE % 4096 == 0, "invalid mem test size"); + +/* + * 32 MiB is max size that gets well over 100 iterations on 509 slots. + * Considering that each slot needs to have at least one page up to + * 8194 slots in use can then be tested (although with slightly + * limited resolution). + */ +#define MEM_SIZE_MAP ((32U << 20) + 4096) +#define MEM_SIZE_MAP_PAGES (MEM_SIZE_MAP / 4096) +#define MEM_TEST_MAP_SIZE (MEM_SIZE_MAP - 4096) +#define MEM_TEST_MAP_SIZE_PAGES (MEM_TEST_MAP_SIZE / 4096) +static_assert(MEM_SIZE_MAP % 4096 == 0, "invalid map test region size"); +static_assert(MEM_TEST_MAP_SIZE % 4096 == 0, "invalid map test region size"); +static_assert(MEM_TEST_MAP_SIZE_PAGES % 2 == 0, "invalid map test region size"); +static_assert(MEM_TEST_MAP_SIZE_PAGES > 2, "invalid map test region size"); + +/* + * 128 MiB is min size that fills 32k slots with at least one page in each + * while at the same time gets 100+ iterations in such test + */ +#define MEM_TEST_UNMAP_SIZE (128U << 20) +#define MEM_TEST_UNMAP_SIZE_PAGES (MEM_TEST_UNMAP_SIZE / 4096) +/* 2 MiB chunk size like a typical huge page */ +#define MEM_TEST_UNMAP_CHUNK_PAGES (2U << (20 - 12)) +static_assert(MEM_TEST_UNMAP_SIZE <= MEM_TEST_SIZE, + "invalid unmap test region size"); +static_assert(MEM_TEST_UNMAP_SIZE % 4096 == 0, + "invalid unmap test region size"); +static_assert(MEM_TEST_UNMAP_SIZE_PAGES % + (2 * MEM_TEST_UNMAP_CHUNK_PAGES) == 0, + "invalid unmap test region size"); + +/* + * For the move active test the middle of the test area is placed on + * a memslot boundary: half lies in the memslot being moved, half in + * other memslot(s). + * + * When running this test with 32k memslots (32764, really) each memslot + * contains 4 pages. + * The last one additionally contains the remaining 21 pages of memory, + * for the total size of 25 pages. + * Hence, the maximum size here is 50 pages. + */ +#define MEM_TEST_MOVE_SIZE_PAGES (50) +#define MEM_TEST_MOVE_SIZE (MEM_TEST_MOVE_SIZE_PAGES * 4096) +#define MEM_TEST_MOVE_GPA_DEST (MEM_GPA + MEM_SIZE) +static_assert(MEM_TEST_MOVE_SIZE <= MEM_TEST_SIZE, + "invalid move test region size"); + +#define MEM_TEST_VAL_1 0x1122334455667788 +#define MEM_TEST_VAL_2 0x99AABBCCDDEEFF00 + +struct vm_data { + struct kvm_vm *vm; + pthread_t vcpu_thread; + uint32_t nslots; + uint64_t npages; + uint64_t pages_per_slot; + void **hva_slots; + bool mmio_ok; + uint64_t mmio_gpa_min; + uint64_t mmio_gpa_max; +}; + +struct sync_area { + atomic_bool start_flag; + atomic_bool exit_flag; + atomic_bool sync_flag; + void *move_area_ptr; +}; + +/* + * Technically, we need also for the atomic bool to be address-free, which + * is recommended, but not strictly required, by C11 for lockless + * implementations. + * However, in practice both GCC and Clang fulfill this requirement on + * all KVM-supported platforms. + */ +static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless"); + +static sem_t vcpu_ready; + +static bool map_unmap_verify; + +static bool verbose; +#define pr_info_v(...) \ + do { \ + if (verbose) \ + pr_info(__VA_ARGS__); \ + } while (0) + +static void *vcpu_worker(void *data) +{ + struct vm_data *vm = data; + struct kvm_run *run; + struct ucall uc; + uint64_t cmd; + + run = vcpu_state(vm->vm, VCPU_ID); + while (1) { + vcpu_run(vm->vm, VCPU_ID); + + if (run->exit_reason == KVM_EXIT_IO) { + cmd = get_ucall(vm->vm, VCPU_ID, &uc); + if (cmd != UCALL_SYNC) + break; + + sem_post(&vcpu_ready); + continue; + } + + if (run->exit_reason != KVM_EXIT_MMIO) + break; + + TEST_ASSERT(vm->mmio_ok, "Unexpected mmio exit"); + TEST_ASSERT(run->mmio.is_write, "Unexpected mmio read"); + TEST_ASSERT(run->mmio.len == 8, + "Unexpected exit mmio size = %u", run->mmio.len); + TEST_ASSERT(run->mmio.phys_addr >= vm->mmio_gpa_min && + run->mmio.phys_addr <= vm->mmio_gpa_max, + "Unexpected exit mmio address = 0x%llx", + run->mmio.phys_addr); + } + + if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT) + TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2]); + + return NULL; +} + +static void wait_for_vcpu(void) +{ + struct timespec ts; + + TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts), + "clock_gettime() failed: %d\n", errno); + + ts.tv_sec += 2; + TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts), + "sem_timedwait() failed: %d\n", errno); +} + +static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages) +{ + uint64_t gpage, pgoffs; + uint32_t slot, slotoffs; + void *base; + + TEST_ASSERT(gpa >= MEM_GPA, "Too low gpa to translate"); + TEST_ASSERT(gpa < MEM_GPA + data->npages * 4096, + "Too high gpa to translate"); + gpa -= MEM_GPA; + + gpage = gpa / 4096; + pgoffs = gpa % 4096; + slot = min(gpage / data->pages_per_slot, (uint64_t)data->nslots - 1); + slotoffs = gpage - (slot * data->pages_per_slot); + + if (rempages) { + uint64_t slotpages; + + if (slot == data->nslots - 1) + slotpages = data->npages - slot * data->pages_per_slot; + else + slotpages = data->pages_per_slot; + + TEST_ASSERT(!pgoffs, + "Asking for remaining pages in slot but gpa not page aligned"); + *rempages = slotpages - slotoffs; + } + + base = data->hva_slots[slot]; + return (uint8_t *)base + slotoffs * 4096 + pgoffs; +} + +static uint64_t vm_slot2gpa(struct vm_data *data, uint32_t slot) +{ + TEST_ASSERT(slot < data->nslots, "Too high slot number"); + + return MEM_GPA + slot * data->pages_per_slot * 4096; +} + +static struct vm_data *alloc_vm(void) +{ + struct vm_data *data; + + data = malloc(sizeof(*data)); + TEST_ASSERT(data, "malloc(vmdata) failed"); + + data->vm = NULL; + data->hva_slots = NULL; + + return data; +} + +static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, + void *guest_code, uint64_t mempages, + struct timespec *slot_runtime) +{ + uint32_t max_mem_slots; + uint64_t rempages; + uint64_t guest_addr; + uint32_t slot; + struct timespec tstart; + struct sync_area *sync; + + max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); + TEST_ASSERT(max_mem_slots > 1, + "KVM_CAP_NR_MEMSLOTS should be greater than 1"); + TEST_ASSERT(nslots > 1 || nslots == -1, + "Slot count cap should be greater than 1"); + if (nslots != -1) + max_mem_slots = min(max_mem_slots, (uint32_t)nslots); + pr_info_v("Allowed number of memory slots: %"PRIu32"\n", max_mem_slots); + + TEST_ASSERT(mempages > 1, + "Can't test without any memory"); + + data->npages = mempages; + data->nslots = max_mem_slots - 1; + data->pages_per_slot = mempages / data->nslots; + if (!data->pages_per_slot) { + *maxslots = mempages + 1; + return false; + } + + rempages = mempages % data->nslots; + data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); + TEST_ASSERT(data->hva_slots, "malloc() fail"); + + data->vm = vm_create_default(VCPU_ID, mempages, guest_code); + + pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", + max_mem_slots - 1, data->pages_per_slot, rempages); + + clock_gettime(CLOCK_MONOTONIC, &tstart); + for (slot = 1, guest_addr = MEM_GPA; slot < max_mem_slots; slot++) { + uint64_t npages; + + npages = data->pages_per_slot; + if (slot == max_mem_slots - 1) + npages += rempages; + + vm_userspace_mem_region_add(data->vm, VM_MEM_SRC_ANONYMOUS, + guest_addr, slot, npages, + 0); + guest_addr += npages * 4096; + } + *slot_runtime = timespec_elapsed(tstart); + + for (slot = 0, guest_addr = MEM_GPA; slot < max_mem_slots - 1; slot++) { + uint64_t npages; + uint64_t gpa; + + npages = data->pages_per_slot; + if (slot == max_mem_slots - 2) + npages += rempages; + + gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr, + slot + 1); + TEST_ASSERT(gpa == guest_addr, + "vm_phy_pages_alloc() failed\n"); + + data->hva_slots[slot] = addr_gpa2hva(data->vm, guest_addr); + memset(data->hva_slots[slot], 0, npages * 4096); + + guest_addr += npages * 4096; + } + + virt_map(data->vm, MEM_GPA, MEM_GPA, mempages, 0); + + sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); + atomic_init(&sync->start_flag, false); + atomic_init(&sync->exit_flag, false); + atomic_init(&sync->sync_flag, false); + + data->mmio_ok = false; + + return true; +} + +static void launch_vm(struct vm_data *data) +{ + pr_info_v("Launching the test VM\n"); + + pthread_create(&data->vcpu_thread, NULL, vcpu_worker, data); + + /* Ensure the guest thread is spun up. */ + wait_for_vcpu(); +} + +static void free_vm(struct vm_data *data) +{ + kvm_vm_free(data->vm); + free(data->hva_slots); + free(data); +} + +static void wait_guest_exit(struct vm_data *data) +{ + pthread_join(data->vcpu_thread, NULL); +} + +static void let_guest_run(struct sync_area *sync) +{ + atomic_store_explicit(&sync->start_flag, true, memory_order_release); +} + +static void guest_spin_until_start(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + while (!atomic_load_explicit(&sync->start_flag, memory_order_acquire)) + ; +} + +static void make_guest_exit(struct sync_area *sync) +{ + atomic_store_explicit(&sync->exit_flag, true, memory_order_release); +} + +static bool _guest_should_exit(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + return atomic_load_explicit(&sync->exit_flag, memory_order_acquire); +} + +#define guest_should_exit() unlikely(_guest_should_exit()) + +/* + * noinline so we can easily see how much time the host spends waiting + * for the guest. + * For the same reason use alarm() instead of polling clock_gettime() + * to implement a wait timeout. + */ +static noinline void host_perform_sync(struct sync_area *sync) +{ + alarm(2); + + atomic_store_explicit(&sync->sync_flag, true, memory_order_release); + while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire)) + ; + + alarm(0); +} + +static bool guest_perform_sync(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + bool expected; + + do { + if (guest_should_exit()) + return false; + + expected = true; + } while (!atomic_compare_exchange_weak_explicit(&sync->sync_flag, + &expected, false, + memory_order_acq_rel, + memory_order_relaxed)); + + return true; +} + +static void guest_code_test_memslot_move(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + uintptr_t base = (typeof(base))READ_ONCE(sync->move_area_ptr); + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (!guest_should_exit()) { + uintptr_t ptr; + + for (ptr = base; ptr < base + MEM_TEST_MOVE_SIZE; + ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + /* + * No host sync here since the MMIO exits are so expensive + * that the host would spend most of its time waiting for + * the guest and so instead of measuring memslot move + * performance we would measure the performance and + * likelihood of MMIO exits + */ + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_map(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr; + + for (ptr = MEM_TEST_GPA; + ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + for (ptr = MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; + ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE; ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_2; + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_unmap(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr = MEM_TEST_GPA; + + /* + * We can afford to access (map) just a small number of pages + * per host sync as otherwise the host will spend + * a significant amount of its time waiting for the guest + * (instead of doing unmap operations), so this will + * effectively turn this test into a map performance test. + * + * Just access a single page to be on the safe side. + */ + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + ptr += MEM_TEST_UNMAP_SIZE / 2; + *(uint64_t *)ptr = MEM_TEST_VAL_2; + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_rw(void) +{ + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr; + + for (ptr = MEM_TEST_GPA; + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + for (ptr = MEM_TEST_GPA + 4096 / 2; + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) { + uint64_t val = *(uint64_t *)ptr; + + GUEST_ASSERT_1(val == MEM_TEST_VAL_2, val); + *(uint64_t *)ptr = 0; + } + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static bool test_memslot_move_prepare(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots, bool isactive) +{ + uint64_t movesrcgpa, movetestgpa; + + movesrcgpa = vm_slot2gpa(data, data->nslots - 1); + + if (isactive) { + uint64_t lastpages; + + vm_gpa2hva(data, movesrcgpa, &lastpages); + if (lastpages < MEM_TEST_MOVE_SIZE_PAGES / 2) { + *maxslots = 0; + return false; + } + } + + movetestgpa = movesrcgpa - (MEM_TEST_MOVE_SIZE / (isactive ? 2 : 1)); + sync->move_area_ptr = (void *)movetestgpa; + + if (isactive) { + data->mmio_ok = true; + data->mmio_gpa_min = movesrcgpa; + data->mmio_gpa_max = movesrcgpa + MEM_TEST_MOVE_SIZE / 2 - 1; + } + + return true; +} + +static bool test_memslot_move_prepare_active(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots) +{ + return test_memslot_move_prepare(data, sync, maxslots, true); +} + +static bool test_memslot_move_prepare_inactive(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots) +{ + return test_memslot_move_prepare(data, sync, maxslots, false); +} + +static void test_memslot_move_loop(struct vm_data *data, struct sync_area *sync) +{ + uint64_t movesrcgpa; + + movesrcgpa = vm_slot2gpa(data, data->nslots - 1); + vm_mem_region_move(data->vm, data->nslots - 1 + 1, + MEM_TEST_MOVE_GPA_DEST); + vm_mem_region_move(data->vm, data->nslots - 1 + 1, movesrcgpa); +} + +static void test_memslot_do_unmap(struct vm_data *data, + uint64_t offsp, uint64_t count) +{ + uint64_t gpa, ctr; + + for (gpa = MEM_TEST_GPA + offsp * 4096, ctr = 0; ctr < count; ) { + uint64_t npages; + void *hva; + int ret; + + hva = vm_gpa2hva(data, gpa, &npages); + TEST_ASSERT(npages, "Empty memory slot at gptr 0x%"PRIx64, gpa); + npages = min(npages, count - ctr); + ret = madvise(hva, npages * 4096, MADV_DONTNEED); + TEST_ASSERT(!ret, + "madvise(%p, MADV_DONTNEED) on VM memory should not fail for gptr 0x%"PRIx64, + hva, gpa); + ctr += npages; + gpa += npages * 4096; + } + TEST_ASSERT(ctr == count, + "madvise(MADV_DONTNEED) should exactly cover all of the requested area"); +} + +static void test_memslot_map_unmap_check(struct vm_data *data, + uint64_t offsp, uint64_t valexp) +{ + uint64_t gpa; + uint64_t *val; + + if (!map_unmap_verify) + return; + + gpa = MEM_TEST_GPA + offsp * 4096; + val = (typeof(val))vm_gpa2hva(data, gpa, NULL); + TEST_ASSERT(*val == valexp, + "Guest written values should read back correctly before unmap (%"PRIu64" vs %"PRIu64" @ %"PRIx64")", + *val, valexp, gpa); + *val = 0; +} + +static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync) +{ + /* + * Unmap the second half of the test area while guest writes to (maps) + * the first half. + */ + test_memslot_do_unmap(data, MEM_TEST_MAP_SIZE_PAGES / 2, + MEM_TEST_MAP_SIZE_PAGES / 2); + + /* + * Wait for the guest to finish writing the first half of the test + * area, verify the written value on the first and the last page of + * this area and then unmap it. + * Meanwhile, the guest is writing to (mapping) the second half of + * the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); + test_memslot_map_unmap_check(data, + MEM_TEST_MAP_SIZE_PAGES / 2 - 1, + MEM_TEST_VAL_1); + test_memslot_do_unmap(data, 0, MEM_TEST_MAP_SIZE_PAGES / 2); + + + /* + * Wait for the guest to finish writing the second half of the test + * area and verify the written value on the first and the last page + * of this area. + * The area will be unmapped at the beginning of the next loop + * iteration. + * Meanwhile, the guest is writing to (mapping) the first half of + * the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES / 2, + MEM_TEST_VAL_2); + test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES - 1, + MEM_TEST_VAL_2); +} + +static void test_memslot_unmap_loop_common(struct vm_data *data, + struct sync_area *sync, + uint64_t chunk) +{ + uint64_t ctr; + + /* + * Wait for the guest to finish mapping page(s) in the first half + * of the test area, verify the written value and then perform unmap + * of this area. + * Meanwhile, the guest is writing to (mapping) page(s) in the second + * half of the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); + for (ctr = 0; ctr < MEM_TEST_UNMAP_SIZE_PAGES / 2; ctr += chunk) + test_memslot_do_unmap(data, ctr, chunk); + + /* Likewise, but for the opposite host / guest areas */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, MEM_TEST_UNMAP_SIZE_PAGES / 2, + MEM_TEST_VAL_2); + for (ctr = MEM_TEST_UNMAP_SIZE_PAGES / 2; + ctr < MEM_TEST_UNMAP_SIZE_PAGES; ctr += chunk) + test_memslot_do_unmap(data, ctr, chunk); +} + +static void test_memslot_unmap_loop(struct vm_data *data, + struct sync_area *sync) +{ + test_memslot_unmap_loop_common(data, sync, 1); +} + +static void test_memslot_unmap_loop_chunked(struct vm_data *data, + struct sync_area *sync) +{ + test_memslot_unmap_loop_common(data, sync, MEM_TEST_UNMAP_CHUNK_PAGES); +} + +static void test_memslot_rw_loop(struct vm_data *data, struct sync_area *sync) +{ + uint64_t gptr; + + for (gptr = MEM_TEST_GPA + 4096 / 2; + gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) + *(uint64_t *)vm_gpa2hva(data, gptr, NULL) = MEM_TEST_VAL_2; + + host_perform_sync(sync); + + for (gptr = MEM_TEST_GPA; + gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) { + uint64_t *vptr = (typeof(vptr))vm_gpa2hva(data, gptr, NULL); + uint64_t val = *vptr; + + TEST_ASSERT(val == MEM_TEST_VAL_1, + "Guest written values should read back correctly (is %"PRIu64" @ %"PRIx64")", + val, gptr); + *vptr = 0; + } + + host_perform_sync(sync); +} + +struct test_data { + const char *name; + uint64_t mem_size; + void (*guest_code)(void); + bool (*prepare)(struct vm_data *data, struct sync_area *sync, + uint64_t *maxslots); + void (*loop)(struct vm_data *data, struct sync_area *sync); +}; + +static bool test_execute(int nslots, uint64_t *maxslots, + unsigned int maxtime, + const struct test_data *tdata, + uint64_t *nloops, + struct timespec *slot_runtime, + struct timespec *guest_runtime) +{ + uint64_t mem_size = tdata->mem_size ? : MEM_SIZE_PAGES; + struct vm_data *data; + struct sync_area *sync; + struct timespec tstart; + bool ret = true; + + data = alloc_vm(); + if (!prepare_vm(data, nslots, maxslots, tdata->guest_code, + mem_size, slot_runtime)) { + ret = false; + goto exit_free; + } + + sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); + + if (tdata->prepare && + !tdata->prepare(data, sync, maxslots)) { + ret = false; + goto exit_free; + } + + launch_vm(data); + + clock_gettime(CLOCK_MONOTONIC, &tstart); + let_guest_run(sync); + + while (1) { + *guest_runtime = timespec_elapsed(tstart); + if (guest_runtime->tv_sec >= maxtime) + break; + + tdata->loop(data, sync); + + (*nloops)++; + } + + make_guest_exit(sync); + wait_guest_exit(data); + +exit_free: + free_vm(data); + + return ret; +} + +static const struct test_data tests[] = { + { + .name = "map", + .mem_size = MEM_SIZE_MAP_PAGES, + .guest_code = guest_code_test_memslot_map, + .loop = test_memslot_map_loop, + }, + { + .name = "unmap", + .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1, + .guest_code = guest_code_test_memslot_unmap, + .loop = test_memslot_unmap_loop, + }, + { + .name = "unmap chunked", + .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1, + .guest_code = guest_code_test_memslot_unmap, + .loop = test_memslot_unmap_loop_chunked, + }, + { + .name = "move active area", + .guest_code = guest_code_test_memslot_move, + .prepare = test_memslot_move_prepare_active, + .loop = test_memslot_move_loop, + }, + { + .name = "move inactive area", + .guest_code = guest_code_test_memslot_move, + .prepare = test_memslot_move_prepare_inactive, + .loop = test_memslot_move_loop, + }, + { + .name = "RW", + .guest_code = guest_code_test_memslot_rw, + .loop = test_memslot_rw_loop + }, +}; + +#define NTESTS ARRAY_SIZE(tests) + +struct test_args { + int tfirst; + int tlast; + int nslots; + int seconds; + int runs; +}; + +static void help(char *name, struct test_args *targs) +{ + int ctr; + + pr_info("usage: %s [-h] [-v] [-d] [-s slots] [-f first_test] [-e last_test] [-l test_length] [-r run_count]\n", + name); + pr_info(" -h: print this help screen.\n"); + pr_info(" -v: enable verbose mode (not for benchmarking).\n"); + pr_info(" -d: enable extra debug checks.\n"); + pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n", + targs->nslots); + pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n", + targs->tfirst, NTESTS - 1); + pr_info(" -e: specify the last test to run (currently: %i; max %zu)\n", + targs->tlast, NTESTS - 1); + pr_info(" -l: specify the test length in seconds (currently: %i)\n", + targs->seconds); + pr_info(" -r: specify the number of runs per test (currently: %i)\n", + targs->runs); + + pr_info("\nAvailable tests:\n"); + for (ctr = 0; ctr < NTESTS; ctr++) + pr_info("%d: %s\n", ctr, tests[ctr].name); +} + +static bool parse_args(int argc, char *argv[], + struct test_args *targs) +{ + int opt; + + while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) { + switch (opt) { + case 'h': + default: + help(argv[0], targs); + return false; + case 'v': + verbose = true; + break; + case 'd': + map_unmap_verify = true; + break; + case 's': + targs->nslots = atoi(optarg); + if (targs->nslots <= 0 && targs->nslots != -1) { + pr_info("Slot count cap has to be positive or -1 for no cap\n"); + return false; + } + break; + case 'f': + targs->tfirst = atoi(optarg); + if (targs->tfirst < 0) { + pr_info("First test to run has to be non-negative\n"); + return false; + } + break; + case 'e': + targs->tlast = atoi(optarg); + if (targs->tlast < 0 || targs->tlast >= NTESTS) { + pr_info("Last test to run has to be non-negative and less than %zu\n", + NTESTS); + return false; + } + break; + case 'l': + targs->seconds = atoi(optarg); + if (targs->seconds < 0) { + pr_info("Test length in seconds has to be non-negative\n"); + return false; + } + break; + case 'r': + targs->runs = atoi(optarg); + if (targs->runs <= 0) { + pr_info("Runs per test has to be positive\n"); + return false; + } + break; + } + } + + if (optind < argc) { + help(argv[0], targs); + return false; + } + + if (targs->tfirst > targs->tlast) { + pr_info("First test to run cannot be greater than the last test to run\n"); + return false; + } + + return true; +} + +struct test_result { + struct timespec slot_runtime, guest_runtime, iter_runtime; + int64_t slottimens, runtimens; + uint64_t nloops; +}; + +static bool test_loop(const struct test_data *data, + const struct test_args *targs, + struct test_result *rbestslottime, + struct test_result *rbestruntime) +{ + uint64_t maxslots; + struct test_result result; + + result.nloops = 0; + if (!test_execute(targs->nslots, &maxslots, targs->seconds, data, + &result.nloops, + &result.slot_runtime, &result.guest_runtime)) { + if (maxslots) + pr_info("Memslot count too high for this test, decrease the cap (max is %"PRIu64")\n", + maxslots); + else + pr_info("Memslot count may be too high for this test, try adjusting the cap\n"); + + return false; + } + + pr_info("Test took %ld.%.9lds for slot setup + %ld.%.9lds all iterations\n", + result.slot_runtime.tv_sec, result.slot_runtime.tv_nsec, + result.guest_runtime.tv_sec, result.guest_runtime.tv_nsec); + if (!result.nloops) { + pr_info("No full loops done - too short test time or system too loaded?\n"); + return true; + } + + result.iter_runtime = timespec_div(result.guest_runtime, + result.nloops); + pr_info("Done %"PRIu64" iterations, avg %ld.%.9lds each\n", + result.nloops, + result.iter_runtime.tv_sec, + result.iter_runtime.tv_nsec); + result.slottimens = timespec_to_ns(result.slot_runtime); + result.runtimens = timespec_to_ns(result.iter_runtime); + + /* + * Only rank the slot setup time for tests using the whole test memory + * area so they are comparable + */ + if (!data->mem_size && + (!rbestslottime->slottimens || + result.slottimens < rbestslottime->slottimens)) + *rbestslottime = result; + if (!rbestruntime->runtimens || + result.runtimens < rbestruntime->runtimens) + *rbestruntime = result; + + return true; +} + +int main(int argc, char *argv[]) +{ + struct test_args targs = { + .tfirst = 0, + .tlast = NTESTS - 1, + .nslots = -1, + .seconds = 5, + .runs = 20, + }; + struct test_result rbestslottime; + int tctr; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + if (!parse_args(argc, argv, &targs)) + return -1; + + rbestslottime.slottimens = 0; + for (tctr = targs.tfirst; tctr <= targs.tlast; tctr++) { + const struct test_data *data = &tests[tctr]; + unsigned int runctr; + struct test_result rbestruntime; + + if (tctr > targs.tfirst) + pr_info("\n"); + + pr_info("Testing %s performance with %i runs, %d seconds each\n", + data->name, targs.runs, targs.seconds); + + rbestruntime.runtimens = 0; + for (runctr = 0; runctr < targs.runs; runctr++) + if (!test_loop(data, &targs, + &rbestslottime, &rbestruntime)) + break; + + if (rbestruntime.runtimens) + pr_info("Best runtime result was %ld.%.9lds per iteration (with %"PRIu64" iterations)\n", + rbestruntime.iter_runtime.tv_sec, + rbestruntime.iter_runtime.tv_nsec, + rbestruntime.nloops); + } + + if (rbestslottime.slottimens) + pr_info("Best slot setup time for the whole test area was %ld.%.9lds\n", + rbestslottime.slot_runtime.tv_sec, + rbestslottime.slot_runtime.tv_nsec); + + return 0; +} -- cgit v1.2.3 From ef4c9f4f654622fa15b7a94a9bd1f19e76bb7feb Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 21 May 2021 17:38:28 +0000 Subject: KVM: selftests: Fix 32-bit truncation of vm_get_max_gfn() vm_get_max_gfn() casts vm->max_gfn from a uint64_t to an unsigned int, which causes the upper 32-bits of the max_gfn to get truncated. Nobody noticed until now likely because vm_get_max_gfn() is only used as a mechanism to create a memslot in an unused region of the guest physical address space (the top), and the top of the 32-bit physical address space was always good enough. This fix reveals a bug in memslot_modification_stress_test which was trying to create a dummy memslot past the end of guest physical memory. Fix that by moving the dummy memslot lower. Fixes: 52200d0d944e ("KVM: selftests: Remove duplicate guest mode handling") Reviewed-by: Venkatesh Srinivas Signed-off-by: David Matlack Message-Id: <20210521173828.1180619-1-dmatlack@google.com> Reviewed-by: Andrew Jones Reviewed-by: Peter Xu Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util.h | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- tools/testing/selftests/kvm/lib/perf_test_util.c | 4 +++- .../selftests/kvm/memslot_modification_stress_test.c | 18 +++++++++++------- 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index a8f022794ce3..2e0d253dabd6 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -302,7 +302,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm); unsigned int vm_get_page_size(struct kvm_vm *vm); unsigned int vm_get_page_shift(struct kvm_vm *vm); -unsigned int vm_get_max_gfn(struct kvm_vm *vm); +uint64_t vm_get_max_gfn(struct kvm_vm *vm); int vm_get_fd(struct kvm_vm *vm); unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1255744758e3..ea3f0db85b3e 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -2117,7 +2117,7 @@ unsigned int vm_get_page_shift(struct kvm_vm *vm) return vm->page_shift; } -unsigned int vm_get_max_gfn(struct kvm_vm *vm) +uint64_t vm_get_max_gfn(struct kvm_vm *vm) { return vm->max_gfn; } diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 81490b9b4e32..abf381800a59 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2020, Google LLC. */ +#include #include "kvm_util.h" #include "perf_test_util.h" @@ -80,7 +81,8 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, */ TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), "Requested more guest memory than address space allows.\n" - " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", + " guest pages: %" PRIx64 " max gfn: %" PRIx64 + " vcpus: %d wss: %" PRIx64 "]\n", guest_num_pages, vm_get_max_gfn(vm), vcpus, vcpu_memory_bytes); diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 6096bf0a5b34..98351ba0933c 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -71,14 +71,22 @@ struct memslot_antagonist_args { }; static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, - uint64_t nr_modifications, uint64_t gpa) + uint64_t nr_modifications) { + const uint64_t pages = 1; + uint64_t gpa; int i; + /* + * Add the dummy memslot just below the perf_test_util memslot, which is + * at the top of the guest physical address space. + */ + gpa = guest_test_phys_mem - pages * vm_get_page_size(vm); + for (i = 0; i < nr_modifications; i++) { usleep(delay); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, - DUMMY_MEMSLOT_INDEX, 1, 0); + DUMMY_MEMSLOT_INDEX, pages, 0); vm_mem_region_delete(vm, DUMMY_MEMSLOT_INDEX); } @@ -120,11 +128,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("Started all vCPUs\n"); add_remove_memslot(vm, p->memslot_modification_delay, - p->nr_memslot_modifications, - guest_test_phys_mem + - (guest_percpu_mem_size * nr_vcpus) + - perf_test_args.host_page_size + - perf_test_args.guest_page_size); + p->nr_memslot_modifications); run_vcpus = false; -- cgit v1.2.3 From 50bc913d526beb9937f1eb0159ec63c43234f961 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 19 May 2021 21:13:45 +0000 Subject: KVM: selftests: Ignore CPUID.0DH.1H in get_cpuid_test Similar to CPUID.0DH.0H this entry depends on the vCPU's XCR0 register and IA32_XSS MSR. Since this test does not control for either before assigning the vCPU's CPUID, these entries will not necessarily match the supported CPUID exposed by KVM. This fixes get_cpuid_test on Cascade Lake CPUs. Suggested-by: Jim Mattson Signed-off-by: David Matlack Message-Id: <20210519211345.3944063-1-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/get_cpuid_test.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c index 9b78e8889638..8c77537af5a1 100644 --- a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c @@ -19,7 +19,12 @@ struct { u32 function; u32 index; } mangled_cpuids[] = { + /* + * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR, + * which are not controlled for by this test. + */ {.function = 0xd, .index = 0}, + {.function = 0xd, .index = 1}, }; static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) -- cgit v1.2.3 From a10453c038a7e97169185405242d20d21de0bb91 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 14 May 2021 23:05:21 +0000 Subject: KVM: selftests: Fix hang in hardware_disable_test If /dev/kvm is not available then hardware_disable_test will hang indefinitely because the child process exits before posting to the semaphore for which the parent is waiting. Fix this by making the parent periodically check if the child has exited. We have to be careful to forward the child's exit status to preserve a KSFT_SKIP status. I considered just checking for /dev/kvm before creating the child process, but there are so many other reasons why the child could exit early that it seemed better to handle that as general case. Tested: $ ./hardware_disable_test /dev/kvm not available, skipping test $ echo $? 4 $ modprobe kvm_intel $ ./hardware_disable_test $ echo $? 0 Signed-off-by: David Matlack Message-Id: <20210514230521.2608768-1-dmatlack@google.com> Reviewed-by: Andrew Jones Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/hardware_disable_test.c | 32 +++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 5aadf84c91c0..4b8db3bce610 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -132,6 +132,36 @@ static void run_test(uint32_t run) TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run); } +void wait_for_child_setup(pid_t pid) +{ + /* + * Wait for the child to post to the semaphore, but wake up periodically + * to check if the child exited prematurely. + */ + for (;;) { + const struct timespec wait_period = { .tv_sec = 1 }; + int status; + + if (!sem_timedwait(sem, &wait_period)) + return; + + /* Child is still running, keep waiting. */ + if (pid != waitpid(pid, &status, WNOHANG)) + continue; + + /* + * Child is no longer running, which is not expected. + * + * If it exited with a non-zero status, we explicitly forward + * the child's status in case it exited with KSFT_SKIP. + */ + if (WIFEXITED(status)) + exit(WEXITSTATUS(status)); + else + TEST_ASSERT(false, "Child exited unexpectedly"); + } +} + int main(int argc, char **argv) { uint32_t i; @@ -148,7 +178,7 @@ int main(int argc, char **argv) run_test(i); /* This function always exits */ pr_debug("%s: [%d] waiting semaphore\n", __func__, i); - sem_wait(sem); + wait_for_child_setup(pid); r = (rand() % DELAY_US_MAX) + 1; pr_debug("%s: [%d] waiting %dus\n", __func__, i, r); usleep(r); -- cgit v1.2.3 From c887d6a126dfc50b27872527615dd46cb3d96bc1 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 19 May 2021 13:03:30 -0700 Subject: KVM: selftests: trivial comment/logging fixes Some trivial fixes I found while touching related code in this series, factored out into a separate commit for easier reviewing: - s/gor/got/ and add a newline in demand_paging_test.c - s/backing_src/src_type/ in a comment to be consistent with the real function signature in kvm_util.c Signed-off-by: Axel Rasmussen Message-Id: <20210519200339.829146-2-axelrasmussen@google.com> Reviewed-by: Ben Gardon Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 5f7a229c3af1..9398ba6ef023 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -169,7 +169,7 @@ static void *uffd_handler_thread_fn(void *arg) if (r == -1) { if (errno == EAGAIN) continue; - pr_info("Read of uffd gor errno %d", errno); + pr_info("Read of uffd got errno %d\n", errno); return NULL; } diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index ea3f0db85b3e..f4484e1edcfa 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -731,8 +731,8 @@ static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree, * * Input Args: * vm - Virtual Machine - * backing_src - Storage source for this region. - * NULL to use anonymous memory. + * src_type - Storage source for this region. + * NULL to use anonymous memory. * guest_paddr - Starting guest physical address * slot - KVM region slot * npages - Number of physical pages -- cgit v1.2.3 From 2aab4b355cbbe1deacfd9349729c43509042b557 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 11 May 2021 20:21:20 +0000 Subject: KVM: selftests: Print a message if /dev/kvm is missing If a KVM selftest is run on a machine without /dev/kvm, it will exit silently. Make it easy to tell what's happening by printing an error message. Opportunistically consolidate all codepaths that open /dev/kvm into a single function so they all print the same message. This slightly changes the semantics of vm_is_unrestricted_guest() by changing a TEST_ASSERT() to exit(KSFT_SKIP). However vm_is_unrestricted_guest() is only called in one place (x86_64/mmio_warning_test.c) and that is to determine if the test should be skipped or not. Signed-off-by: David Matlack Message-Id: <20210511202120.1371800-1-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 46 +++++++++++++++------- tools/testing/selftests/kvm/lib/x86_64/processor.c | 16 ++------ .../selftests/kvm/x86_64/get_msr_index_features.c | 8 +--- 4 files changed, 39 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 2e0d253dabd6..5d9b35d09251 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -77,6 +77,7 @@ struct vm_guest_mode_params { }; extern const struct vm_guest_mode_params vm_guest_mode_params[]; +int open_kvm_dev_path_or_exit(void); int kvm_check_cap(long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f4484e1edcfa..d00e49b73d68 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -31,6 +31,34 @@ static void *align(void *x, size_t size) return (void *) (((size_t) x + mask) & ~mask); } +/* + * Open KVM_DEV_PATH if available, otherwise exit the entire program. + * + * Input Args: + * flags - The flags to pass when opening KVM_DEV_PATH. + * + * Return: + * The opened file descriptor of /dev/kvm. + */ +static int _open_kvm_dev_path_or_exit(int flags) +{ + int fd; + + fd = open(KVM_DEV_PATH, flags); + if (fd < 0) { + print_skip("%s not available, is KVM loaded? (errno: %d)", + KVM_DEV_PATH, errno); + exit(KSFT_SKIP); + } + + return fd; +} + +int open_kvm_dev_path_or_exit(void) +{ + return _open_kvm_dev_path_or_exit(O_RDONLY); +} + /* * Capability * @@ -52,10 +80,7 @@ int kvm_check_cap(long cap) int ret; int kvm_fd; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); - + kvm_fd = open_kvm_dev_path_or_exit(); ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n" " rc: %i errno: %i", ret, errno); @@ -128,9 +153,7 @@ void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) static void vm_open(struct kvm_vm *vm, int perm) { - vm->kvm_fd = open(KVM_DEV_PATH, perm); - if (vm->kvm_fd < 0) - exit(KSFT_SKIP); + vm->kvm_fd = _open_kvm_dev_path_or_exit(perm); if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) { print_skip("immediate_exit not available"); @@ -996,9 +1019,7 @@ static int vcpu_mmap_sz(void) { int dev_fd, ret; - dev_fd = open(KVM_DEV_PATH, O_RDONLY); - if (dev_fd < 0) - exit(KSFT_SKIP); + dev_fd = open_kvm_dev_path_or_exit(); ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); TEST_ASSERT(ret >= sizeof(struct kvm_run), @@ -2091,10 +2112,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm) if (vm == NULL) { /* Ensure that the KVM vendor-specific module is loaded. */ - f = fopen(KVM_DEV_PATH, "r"); - TEST_ASSERT(f != NULL, "Error in opening KVM dev file: %d", - errno); - fclose(f); + close(open_kvm_dev_path_or_exit()); } f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index a8906e60a108..efe235044421 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -657,9 +657,7 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void) return cpuid; cpuid = allocate_kvm_cpuid2(); - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n", @@ -691,9 +689,7 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index) buffer.header.nmsrs = 1; buffer.entry.index = msr_index; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); r = ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header); TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" @@ -986,9 +982,7 @@ struct kvm_msr_list *kvm_get_msr_index_list(void) struct kvm_msr_list *list; int nmsrs, r, kvm_fd; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); nmsrs = kvm_get_num_msrs_fd(kvm_fd); list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); @@ -1312,9 +1306,7 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) return cpuid; cpuid = allocate_kvm_cpuid2(); - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_HV_CPUID failed %d %d\n", diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c index cb953df4d7d0..8aed0db1331d 100644 --- a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c +++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c @@ -37,9 +37,7 @@ static void test_get_msr_index(void) int old_res, res, kvm_fd, r; struct kvm_msr_list *list; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); old_res = kvm_num_index_msrs(kvm_fd, 0); TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); @@ -101,9 +99,7 @@ static void test_get_msr_feature(void) int res, old_res, i, kvm_fd; struct kvm_msr_list *feature_list; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); old_res = kvm_num_feature_msrs(kvm_fd, 0); TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); -- cgit v1.2.3 From 25408e5a0246048e3e36d2cd513565ebcc481f51 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 19 May 2021 13:03:31 -0700 Subject: KVM: selftests: simplify setup_demand_paging error handling A small cleanup. Our caller writes: r = setup_demand_paging(...); if (r < 0) exit(-r); Since we're just going to exit anyway, instead of returning an error we can just re-use TEST_ASSERT. This makes the caller simpler, as well as the function itself - no need to write our branches, etc. Signed-off-by: Axel Rasmussen Message-Id: <20210519200339.829146-3-axelrasmussen@google.com> Reviewed-by: Ben Gardon Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 50 +++++++++--------------- 1 file changed, 18 insertions(+), 32 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 9398ba6ef023..8ce53488d6af 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -9,6 +9,7 @@ #define _GNU_SOURCE /* for pipe2 */ +#include #include #include #include @@ -198,42 +199,32 @@ static void *uffd_handler_thread_fn(void *arg) return NULL; } -static int setup_demand_paging(struct kvm_vm *vm, - pthread_t *uffd_handler_thread, int pipefd, - useconds_t uffd_delay, - struct uffd_handler_args *uffd_args, - void *hva, uint64_t len) +static void setup_demand_paging(struct kvm_vm *vm, + pthread_t *uffd_handler_thread, int pipefd, + useconds_t uffd_delay, + struct uffd_handler_args *uffd_args, + void *hva, uint64_t len) { int uffd; struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd == -1) { - pr_info("uffd creation failed\n"); - return -1; - } + TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno); uffdio_api.api = UFFD_API; uffdio_api.features = 0; - if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { - pr_info("ioctl uffdio_api failed\n"); - return -1; - } + TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1, + "ioctl UFFDIO_API failed: %" PRIu64, + (uint64_t)uffdio_api.api); uffdio_register.range.start = (uint64_t)hva; uffdio_register.range.len = len; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { - pr_info("ioctl uffdio_register failed\n"); - return -1; - } - - if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) != - UFFD_API_RANGE_IOCTLS) { - pr_info("unexpected userfaultfd ioctl set\n"); - return -1; - } + TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1, + "ioctl UFFDIO_REGISTER failed"); + TEST_ASSERT((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) == + UFFD_API_RANGE_IOCTLS, "unexpected userfaultfd ioctl set"); uffd_args->uffd = uffd; uffd_args->pipefd = pipefd; @@ -243,8 +234,6 @@ static int setup_demand_paging(struct kvm_vm *vm, PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", hva, hva + len); - - return 0; } struct test_params { @@ -321,13 +310,10 @@ static void run_test(enum vm_guest_mode mode, void *arg) O_CLOEXEC | O_NONBLOCK); TEST_ASSERT(!r, "Failed to set up pipefd"); - r = setup_demand_paging(vm, - &uffd_handler_threads[vcpu_id], - pipefds[vcpu_id * 2], - p->uffd_delay, &uffd_args[vcpu_id], - vcpu_hva, vcpu_mem_size); - if (r < 0) - exit(-r); + setup_demand_paging(vm, &uffd_handler_threads[vcpu_id], + pipefds[vcpu_id * 2], p->uffd_delay, + &uffd_args[vcpu_id], vcpu_hva, + vcpu_mem_size); } } -- cgit v1.2.3 From 32ffa4f71e10009498ae6b54da65ab316db967bd Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 19 May 2021 13:03:33 -0700 Subject: KVM: selftests: compute correct demand paging size This is a preparatory commit needed before we can use different kinds of backing pages for guest memory. Previously, we used perf_test_args.host_page_size, which is the host's native page size (commonly 4K). For VM_MEM_SRC_ANONYMOUS this turns out to be okay, but in a follow-up commit we want to allow using different kinds of backing memory. Take VM_MEM_SRC_ANONYMOUS_HUGETLB for example. Without this change, if we used that backing page type, when we issued a UFFDIO_COPY ioctl we'd only do so with 4K, rather than the full 2M of a backing hugepage. In this case, UFFDIO_COPY returns -EINVAL (__mcopy_atomic_hugetlb checks the size). Signed-off-by: Axel Rasmussen Message-Id: <20210519200339.829146-5-axelrasmussen@google.com> Reviewed-by: Ben Gardon Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 8ce53488d6af..e6582f504c0f 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -39,6 +39,7 @@ static int nr_vcpus = 1; static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; +static size_t demand_paging_size; static char *guest_data_prototype; static void *vcpu_worker(void *data) @@ -84,7 +85,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) copy.src = (uint64_t)guest_data_prototype; copy.dst = addr; - copy.len = perf_test_args.host_page_size; + copy.len = demand_paging_size; copy.mode = 0; clock_gettime(CLOCK_MONOTONIC, &start); @@ -101,7 +102,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, timespec_to_ns(ts_diff)); PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n", - perf_test_args.host_page_size, addr, tid); + demand_paging_size, addr, tid); return 0; } @@ -260,10 +261,12 @@ static void run_test(enum vm_guest_mode mode, void *arg) perf_test_args.wr_fract = 1; - guest_data_prototype = malloc(perf_test_args.host_page_size); + demand_paging_size = get_backing_src_pagesz(VM_MEM_SRC_ANONYMOUS); + + guest_data_prototype = malloc(demand_paging_size); TEST_ASSERT(guest_data_prototype, "Failed to allocate buffer for guest data pattern"); - memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size); + memset(guest_data_prototype, 0xAB, demand_paging_size); vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); -- cgit v1.2.3 From 0368c2c1b422c94968b5286f289aed7fe6af93c2 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 19 May 2021 13:03:34 -0700 Subject: KVM: selftests: allow different backing source types Add an argument which lets us specify a different backing memory type for the test. The default is just to use anonymous, matching existing behavior. This is in preparation for testing UFFD minor faults. For that, we'll need to use a new backing memory type which is setup with MAP_SHARED. Signed-off-by: Axel Rasmussen Message-Id: <20210519200339.829146-6-axelrasmussen@google.com> Reviewed-by: Ben Gardon Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index e6582f504c0f..8c03484a5784 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -240,6 +240,7 @@ static void setup_demand_paging(struct kvm_vm *vm, struct test_params { bool use_uffd; useconds_t uffd_delay; + enum vm_mem_backing_src_type src_type; bool partition_vcpu_memory_access; }; @@ -257,11 +258,11 @@ static void run_test(enum vm_guest_mode mode, void *arg) int r; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, - VM_MEM_SRC_ANONYMOUS); + p->src_type); perf_test_args.wr_fract = 1; - demand_paging_size = get_backing_src_pagesz(VM_MEM_SRC_ANONYMOUS); + demand_paging_size = get_backing_src_pagesz(p->src_type); guest_data_prototype = malloc(demand_paging_size); TEST_ASSERT(guest_data_prototype, @@ -377,7 +378,7 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n" - " [-b memory] [-v vcpus] [-o]\n", name); + " [-b memory] [-t type] [-v vcpus] [-o]\n", name); guest_modes_help(); printf(" -u: use User Fault FD to handle vCPU page\n" " faults.\n"); @@ -387,6 +388,8 @@ static void help(char *name) printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); + printf(" -t: The type of backing memory to use. Default: anonymous\n"); + backing_src_help(); printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); @@ -398,13 +401,14 @@ int main(int argc, char *argv[]) { int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); struct test_params p = { + .src_type = VM_MEM_SRC_ANONYMOUS, .partition_vcpu_memory_access = true, }; int opt; guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:ud:b:v:o")) != -1) { + while ((opt = getopt(argc, argv, "hm:ud:b:t:v:o")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -419,6 +423,9 @@ int main(int argc, char *argv[]) case 'b': guest_percpu_mem_size = parse_size(optarg); break; + case 't': + p.src_type = parse_backing_src_type(optarg); + break; case 'v': nr_vcpus = atoi(optarg); TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, -- cgit v1.2.3 From b3784bc28ccc0d9b44d265a1d947c8766295ba00 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 19 May 2021 13:03:35 -0700 Subject: KVM: selftests: refactor vm_mem_backing_src_type flags Each struct vm_mem_backing_src_alias has a flags field, which denotes the flags used to mmap() an area of that type. Previously, this field never included MAP_PRIVATE | MAP_ANONYMOUS, because vm_userspace_mem_region_add assumed that *all* types would always use those flags, and so it hardcoded them. In a follow-up commit, we'll add a new type: shmem. Areas of this type must not have MAP_PRIVATE | MAP_ANONYMOUS, and instead they must have MAP_SHARED. So, refactor things. Make it so that the flags field of struct vm_mem_backing_src_alias really is a complete set of flags, and don't add in any extras in vm_userspace_mem_region_add. This will let us easily tack on shmem. Signed-off-by: Axel Rasmussen Message-Id: <20210519200339.829146-7-axelrasmussen@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 3 +-- tools/testing/selftests/kvm/lib/test_util.c | 35 ++++++++++++++++------------- 2 files changed, 20 insertions(+), 18 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index d00e49b73d68..491be22b410c 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -849,8 +849,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->mmap_start = mmap(NULL, region->mmap_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS - | vm_mem_backing_src_alias(src_type)->flag, + vm_mem_backing_src_alias(src_type)->flag, -1, 0); TEST_ASSERT(region->mmap_start != MAP_FAILED, "test_malloc failed, mmap_start: %p errno: %i", diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 63d2bc7d757b..06ddde068736 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -168,70 +168,73 @@ size_t get_def_hugetlb_pagesz(void) const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i) { + static const int anon_flags = MAP_PRIVATE | MAP_ANONYMOUS; + static const int anon_huge_flags = anon_flags | MAP_HUGETLB; + static const struct vm_mem_backing_src_alias aliases[] = { [VM_MEM_SRC_ANONYMOUS] = { .name = "anonymous", - .flag = 0, + .flag = anon_flags, }, [VM_MEM_SRC_ANONYMOUS_THP] = { .name = "anonymous_thp", - .flag = 0, + .flag = anon_flags, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB] = { .name = "anonymous_hugetlb", - .flag = MAP_HUGETLB, + .flag = anon_huge_flags, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = { .name = "anonymous_hugetlb_16kb", - .flag = MAP_HUGETLB | MAP_HUGE_16KB, + .flag = anon_huge_flags | MAP_HUGE_16KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = { .name = "anonymous_hugetlb_64kb", - .flag = MAP_HUGETLB | MAP_HUGE_64KB, + .flag = anon_huge_flags | MAP_HUGE_64KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = { .name = "anonymous_hugetlb_512kb", - .flag = MAP_HUGETLB | MAP_HUGE_512KB, + .flag = anon_huge_flags | MAP_HUGE_512KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = { .name = "anonymous_hugetlb_1mb", - .flag = MAP_HUGETLB | MAP_HUGE_1MB, + .flag = anon_huge_flags | MAP_HUGE_1MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = { .name = "anonymous_hugetlb_2mb", - .flag = MAP_HUGETLB | MAP_HUGE_2MB, + .flag = anon_huge_flags | MAP_HUGE_2MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = { .name = "anonymous_hugetlb_8mb", - .flag = MAP_HUGETLB | MAP_HUGE_8MB, + .flag = anon_huge_flags | MAP_HUGE_8MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = { .name = "anonymous_hugetlb_16mb", - .flag = MAP_HUGETLB | MAP_HUGE_16MB, + .flag = anon_huge_flags | MAP_HUGE_16MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = { .name = "anonymous_hugetlb_32mb", - .flag = MAP_HUGETLB | MAP_HUGE_32MB, + .flag = anon_huge_flags | MAP_HUGE_32MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = { .name = "anonymous_hugetlb_256mb", - .flag = MAP_HUGETLB | MAP_HUGE_256MB, + .flag = anon_huge_flags | MAP_HUGE_256MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = { .name = "anonymous_hugetlb_512mb", - .flag = MAP_HUGETLB | MAP_HUGE_512MB, + .flag = anon_huge_flags | MAP_HUGE_512MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = { .name = "anonymous_hugetlb_1gb", - .flag = MAP_HUGETLB | MAP_HUGE_1GB, + .flag = anon_huge_flags | MAP_HUGE_1GB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = { .name = "anonymous_hugetlb_2gb", - .flag = MAP_HUGETLB | MAP_HUGE_2GB, + .flag = anon_huge_flags | MAP_HUGE_2GB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = { .name = "anonymous_hugetlb_16gb", - .flag = MAP_HUGETLB | MAP_HUGE_16GB, + .flag = anon_huge_flags | MAP_HUGE_16GB, }, }; _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES, -- cgit v1.2.3 From c9befd5958fdf8913db69049d47b6ac1d970af03 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 19 May 2021 13:03:36 -0700 Subject: KVM: selftests: add shmem backing source type This lets us run the demand paging test on top of a shmem-backed area. In follow-up commits, we'll 1) leverage this new capability to create an alias mapping, and then 2) use the alias mapping to exercise UFFD minor faults. Signed-off-by: Axel Rasmussen Message-Id: <20210519200339.829146-8-axelrasmussen@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/test_util.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 17 ++++++++++++++++- tools/testing/selftests/kvm/lib/test_util.c | 5 +++++ 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index fade3130eb01..7377f00469ef 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -84,6 +84,7 @@ enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB, VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB, VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB, + VM_MEM_SRC_SHMEM, NUM_SRC_TYPES, }; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 491be22b410c..bc50ca6390d3 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -847,10 +847,25 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, if (alignment > 1) region->mmap_size += alignment; + region->fd = -1; + if (src_type == VM_MEM_SRC_SHMEM) { + region->fd = memfd_create("kvm_selftest", MFD_CLOEXEC); + TEST_ASSERT(region->fd != -1, + "memfd_create failed, errno: %i", errno); + + ret = ftruncate(region->fd, region->mmap_size); + TEST_ASSERT(ret == 0, "ftruncate failed, errno: %i", errno); + + ret = fallocate(region->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, + region->mmap_size); + TEST_ASSERT(ret == 0, "fallocate failed, errno: %i", errno); + } + region->mmap_start = mmap(NULL, region->mmap_size, PROT_READ | PROT_WRITE, vm_mem_backing_src_alias(src_type)->flag, - -1, 0); + region->fd, 0); TEST_ASSERT(region->mmap_start != MAP_FAILED, "test_malloc failed, mmap_start: %p errno: %i", region->mmap_start, errno); diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 06ddde068736..c7a265da5090 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -236,6 +236,10 @@ const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i) .name = "anonymous_hugetlb_16gb", .flag = anon_huge_flags | MAP_HUGE_16GB, }, + [VM_MEM_SRC_SHMEM] = { + .name = "shmem", + .flag = MAP_SHARED, + }, }; _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES, "Missing new backing src types?"); @@ -253,6 +257,7 @@ size_t get_backing_src_pagesz(uint32_t i) switch (i) { case VM_MEM_SRC_ANONYMOUS: + case VM_MEM_SRC_SHMEM: return getpagesize(); case VM_MEM_SRC_ANONYMOUS_THP: return get_trans_hugepagesz(); -- cgit v1.2.3 From 94f3f2b31a8a9e8bd30bf6f4903ff84acc612e0e Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 19 May 2021 13:03:37 -0700 Subject: KVM: selftests: create alias mappings when using shared memory When a memory region is added with a src_type specifying that it should use some kind of shared memory, also create an alias mapping to the same underlying physical pages. And, add an API so tests can get access to these alias addresses. Basically, for a guest physical address, let us look up the analogous host *alias* address. In a future commit, we'll modify the demand paging test to take advantage of this to exercise UFFD minor faults. The idea is, we pre-fault the underlying pages *via the alias*. When the *guest* faults, it gets a "minor" fault (PTEs don't exist yet, but a page is already in the page cache). Then, the userfaultfd theads can handle the fault: they could potentially modify the underlying memory *via the alias* if they wanted to, and then they install the PTEs and let the guest carry on via a UFFDIO_CONTINUE ioctl. Reviewed-by: Ben Gardon Signed-off-by: Axel Rasmussen Message-Id: <20210519200339.829146-9-axelrasmussen@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 49 ++++++++++++++++++++++ .../testing/selftests/kvm/lib/kvm_util_internal.h | 2 + 3 files changed, 52 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 5d9b35d09251..fcd8e3855111 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -147,6 +147,7 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); /* * Address Guest Virtual to Guest Physical diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index bc50ca6390d3..f62780719176 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -903,6 +903,19 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region); hash_add(vm->regions.slot_hash, ®ion->slot_node, slot); + + /* If shared memory, create an alias. */ + if (region->fd >= 0) { + region->mmap_alias = mmap(NULL, region->mmap_size, + PROT_READ | PROT_WRITE, + vm_mem_backing_src_alias(src_type)->flag, + region->fd, 0); + TEST_ASSERT(region->mmap_alias != MAP_FAILED, + "mmap of alias failed, errno: %i", errno); + + /* Align host alias address */ + region->host_alias = align(region->mmap_alias, alignment); + } } /* @@ -1333,6 +1346,42 @@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) return -1; } +/* + * Address VM physical to Host Virtual *alias*. + * + * Input Args: + * vm - Virtual Machine + * gpa - VM physical address + * + * Output Args: None + * + * Return: + * Equivalent address within the host virtual *alias* area, or NULL + * (without failing the test) if the guest memory is not shared (so + * no alias exists). + * + * When vm_create() and related functions are called with a shared memory + * src_type, we also create a writable, shared alias mapping of the + * underlying guest memory. This allows the host to manipulate guest memory + * without mapping that memory in the guest's address space. And, for + * userfaultfd-based demand paging, we can do so without triggering userfaults. + */ +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) +{ + struct userspace_mem_region *region; + uintptr_t offset; + + region = userspace_mem_region_find(vm, gpa, gpa); + if (!region) + return NULL; + + if (!region->host_alias) + return NULL; + + offset = gpa - region->region.guest_phys_addr; + return (void *) ((uintptr_t) region->host_alias + offset); +} + /* * VM Create IRQ Chip * diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index b30e8c7b119b..a03febc24ba6 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -19,7 +19,9 @@ struct userspace_mem_region { int fd; off_t offset; void *host_mem; + void *host_alias; void *mmap_start; + void *mmap_alias; size_t mmap_size; struct rb_node gpa_node; struct rb_node hva_node; -- cgit v1.2.3 From a4b9722a5996017264feb19ebe86efe4380f7afb Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 19 May 2021 13:03:38 -0700 Subject: KVM: selftests: allow using UFFD minor faults for demand paging UFFD handling of MINOR faults is a new feature whose use case is to speed up demand paging (compared to MISSING faults). So, it's interesting to let this selftest exercise this new mode. Modify the demand paging test to have the option of using UFFD minor faults, as opposed to missing faults. Now, when turning on userfaultfd with '-u', the desired mode has to be specified ("MISSING" or "MINOR"). If we're in minor mode, before registering, prefault via the *alias*. This way, the guest will trigger minor faults, instead of missing faults, and we can UFFDIO_CONTINUE to resolve them. Modify the page fault handler function to use the right ioctl depending on the mode we're running in. In MINOR mode, use UFFDIO_CONTINUE. Signed-off-by: Axel Rasmussen Message-Id: <20210519200339.829146-10-axelrasmussen@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 112 ++++++++++++++++------- 1 file changed, 79 insertions(+), 33 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 8c03484a5784..fcba527c29a6 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -73,33 +73,48 @@ static void *vcpu_worker(void *data) return NULL; } -static int handle_uffd_page_request(int uffd, uint64_t addr) +static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr) { - pid_t tid; + pid_t tid = syscall(__NR_gettid); struct timespec start; struct timespec ts_diff; - struct uffdio_copy copy; int r; - tid = syscall(__NR_gettid); + clock_gettime(CLOCK_MONOTONIC, &start); - copy.src = (uint64_t)guest_data_prototype; - copy.dst = addr; - copy.len = demand_paging_size; - copy.mode = 0; + if (uffd_mode == UFFDIO_REGISTER_MODE_MISSING) { + struct uffdio_copy copy; - clock_gettime(CLOCK_MONOTONIC, &start); + copy.src = (uint64_t)guest_data_prototype; + copy.dst = addr; + copy.len = demand_paging_size; + copy.mode = 0; + + r = ioctl(uffd, UFFDIO_COPY, ©); + if (r == -1) { + pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d with errno: %d\n", + addr, tid, errno); + return r; + } + } else if (uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { + struct uffdio_continue cont = {0}; + + cont.range.start = addr; + cont.range.len = demand_paging_size; - r = ioctl(uffd, UFFDIO_COPY, ©); - if (r == -1) { - pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n", - addr, tid, errno); - return r; + r = ioctl(uffd, UFFDIO_CONTINUE, &cont); + if (r == -1) { + pr_info("Failed UFFDIO_CONTINUE in 0x%lx from thread %d with errno: %d\n", + addr, tid, errno); + return r; + } + } else { + TEST_FAIL("Invalid uffd mode %d", uffd_mode); } ts_diff = timespec_elapsed(start); - PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, + PER_PAGE_DEBUG("UFFD page-in %d \t%ld ns\n", tid, timespec_to_ns(ts_diff)); PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n", demand_paging_size, addr, tid); @@ -110,6 +125,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) bool quit_uffd_thread; struct uffd_handler_args { + int uffd_mode; int uffd; int pipefd; useconds_t delay; @@ -186,7 +202,7 @@ static void *uffd_handler_thread_fn(void *arg) if (delay) usleep(delay); addr = msg.arg.pagefault.address; - r = handle_uffd_page_request(uffd, addr); + r = handle_uffd_page_request(uffd_args->uffd_mode, uffd, addr); if (r < 0) return NULL; pages++; @@ -202,13 +218,32 @@ static void *uffd_handler_thread_fn(void *arg) static void setup_demand_paging(struct kvm_vm *vm, pthread_t *uffd_handler_thread, int pipefd, - useconds_t uffd_delay, + int uffd_mode, useconds_t uffd_delay, struct uffd_handler_args *uffd_args, - void *hva, uint64_t len) + void *hva, void *alias, uint64_t len) { + bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR); int uffd; struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; + uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; + + PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", + is_minor ? "MINOR" : "MISSING", + is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY"); + + /* In order to get minor faults, prefault via the alias. */ + if (is_minor) { + size_t p; + + expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE; + + TEST_ASSERT(alias != NULL, "Alias required for minor faults"); + for (p = 0; p < (len / demand_paging_size); ++p) { + memcpy(alias + (p * demand_paging_size), + guest_data_prototype, demand_paging_size); + } + } uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno); @@ -221,12 +256,13 @@ static void setup_demand_paging(struct kvm_vm *vm, uffdio_register.range.start = (uint64_t)hva; uffdio_register.range.len = len; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + uffdio_register.mode = uffd_mode; TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1, "ioctl UFFDIO_REGISTER failed"); - TEST_ASSERT((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) == - UFFD_API_RANGE_IOCTLS, "unexpected userfaultfd ioctl set"); + TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == + expected_ioctls, "missing userfaultfd ioctls"); + uffd_args->uffd_mode = uffd_mode; uffd_args->uffd = uffd; uffd_args->pipefd = pipefd; uffd_args->delay = uffd_delay; @@ -238,7 +274,7 @@ static void setup_demand_paging(struct kvm_vm *vm, } struct test_params { - bool use_uffd; + int uffd_mode; useconds_t uffd_delay; enum vm_mem_backing_src_type src_type; bool partition_vcpu_memory_access; @@ -275,7 +311,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size, p->partition_vcpu_memory_access); - if (p->use_uffd) { + if (p->uffd_mode) { uffd_handler_threads = malloc(nr_vcpus * sizeof(*uffd_handler_threads)); TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); @@ -289,6 +325,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vm_paddr_t vcpu_gpa; void *vcpu_hva; + void *vcpu_alias; uint64_t vcpu_mem_size; @@ -303,8 +340,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size); - /* Cache the HVA pointer of the region */ + /* Cache the host addresses of the region */ vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); + vcpu_alias = addr_gpa2alias(vm, vcpu_gpa); /* * Set up user fault fd to handle demand paging @@ -315,8 +353,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) TEST_ASSERT(!r, "Failed to set up pipefd"); setup_demand_paging(vm, &uffd_handler_threads[vcpu_id], - pipefds[vcpu_id * 2], p->uffd_delay, - &uffd_args[vcpu_id], vcpu_hva, + pipefds[vcpu_id * 2], p->uffd_mode, + p->uffd_delay, &uffd_args[vcpu_id], + vcpu_hva, vcpu_alias, vcpu_mem_size); } } @@ -345,7 +384,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("All vCPU threads joined\n"); - if (p->use_uffd) { + if (p->uffd_mode) { char c; /* Tell the user fault fd handler threads to quit */ @@ -367,7 +406,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) free(guest_data_prototype); free(vcpu_threads); - if (p->use_uffd) { + if (p->uffd_mode) { free(uffd_handler_threads); free(uffd_args); free(pipefds); @@ -377,11 +416,11 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n" + printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" " [-b memory] [-t type] [-v vcpus] [-o]\n", name); guest_modes_help(); - printf(" -u: use User Fault FD to handle vCPU page\n" - " faults.\n"); + printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" + " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); printf(" -d: add a delay in usec to the User Fault\n" " FD handler to simulate demand paging\n" " overheads. Ignored without -u.\n"); @@ -408,13 +447,17 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:ud:b:t:v:o")) != -1) { + while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); break; case 'u': - p.use_uffd = true; + if (!strcmp("MISSING", optarg)) + p.uffd_mode = UFFDIO_REGISTER_MODE_MISSING; + else if (!strcmp("MINOR", optarg)) + p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR; + TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'."); break; case 'd': p.uffd_delay = strtoul(optarg, NULL, 0); @@ -441,6 +484,9 @@ int main(int argc, char *argv[]) } } + TEST_ASSERT(p.uffd_mode != UFFDIO_REGISTER_MODE_MINOR || p.src_type == VM_MEM_SRC_SHMEM, + "userfaultfd MINOR mode requires shared memory; pick a different -t"); + for_each_guest_mode(run_test, &p); return 0; -- cgit v1.2.3 From 33090a884da5e9760f11441ac269f754375f80f5 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 19 May 2021 13:03:39 -0700 Subject: KVM: selftests: add shared hugetlbfs backing source type This lets us run the demand paging test on top of a shared hugetlbfs-backed area. The "shared" is key, as this allows us to exercise userfaultfd minor faults on hugetlbfs. Signed-off-by: Axel Rasmussen Message-Id: <20210519200339.829146-11-axelrasmussen@google.com> Reviewed-by: Ben Gardon Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/demand_paging_test.c | 6 ++++-- tools/testing/selftests/kvm/include/test_util.h | 11 +++++++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 9 +++++++-- tools/testing/selftests/kvm/lib/test_util.c | 11 +++++++++++ 4 files changed, 33 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index fcba527c29a6..b74704305835 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -484,8 +484,10 @@ int main(int argc, char *argv[]) } } - TEST_ASSERT(p.uffd_mode != UFFDIO_REGISTER_MODE_MINOR || p.src_type == VM_MEM_SRC_SHMEM, - "userfaultfd MINOR mode requires shared memory; pick a different -t"); + if (p.uffd_mode == UFFDIO_REGISTER_MODE_MINOR && + !backing_src_is_shared(p.src_type)) { + TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -t"); + } for_each_guest_mode(run_test, &p); diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 7377f00469ef..d79be15dd3d2 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -17,6 +17,7 @@ #include #include #include +#include #include "kselftest.h" static inline int _no_printf(const char *format, ...) { return 0; } @@ -85,6 +86,7 @@ enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB, VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB, VM_MEM_SRC_SHMEM, + VM_MEM_SRC_SHARED_HUGETLB, NUM_SRC_TYPES, }; @@ -101,4 +103,13 @@ size_t get_backing_src_pagesz(uint32_t i); void backing_src_help(void); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); +/* + * Whether or not the given source type is shared memory (as opposed to + * anonymous). + */ +static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t) +{ + return vm_mem_backing_src_alias(t)->flag & MAP_SHARED; +} + #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f62780719176..28e528c19d28 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -848,8 +848,13 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->mmap_size += alignment; region->fd = -1; - if (src_type == VM_MEM_SRC_SHMEM) { - region->fd = memfd_create("kvm_selftest", MFD_CLOEXEC); + if (backing_src_is_shared(src_type)) { + int memfd_flags = MFD_CLOEXEC; + + if (src_type == VM_MEM_SRC_SHARED_HUGETLB) + memfd_flags |= MFD_HUGETLB; + + region->fd = memfd_create("kvm_selftest", memfd_flags); TEST_ASSERT(region->fd != -1, "memfd_create failed, errno: %i", errno); diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index c7a265da5090..6ad6c8276b2e 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -240,6 +240,16 @@ const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i) .name = "shmem", .flag = MAP_SHARED, }, + [VM_MEM_SRC_SHARED_HUGETLB] = { + .name = "shared_hugetlb", + /* + * No MAP_HUGETLB, we use MFD_HUGETLB instead. Since + * we're using "file backed" memory, we need to specify + * this when the FD is created, not when the area is + * mapped. + */ + .flag = MAP_SHARED, + }, }; _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES, "Missing new backing src types?"); @@ -262,6 +272,7 @@ size_t get_backing_src_pagesz(uint32_t i) case VM_MEM_SRC_ANONYMOUS_THP: return get_trans_hugepagesz(); case VM_MEM_SRC_ANONYMOUS_HUGETLB: + case VM_MEM_SRC_SHARED_HUGETLB: return get_def_hugetlb_pagesz(); default: return MAP_HUGE_PAGE_SIZE(flag); -- cgit v1.2.3 From fb1070d18edb37daf3979662975bc54625a19953 Mon Sep 17 00:00:00 2001 From: Joe Richey Date: Fri, 21 May 2021 01:58:43 -0700 Subject: KVM: X86: Use _BITUL() macro in UAPI headers Replace BIT() in KVM's UPAI header with _BITUL(). BIT() is not defined in the UAPI headers and its usage may cause userspace build errors. Fixes: fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking") Signed-off-by: Joe Richey Message-Id: <20210521085849.37676-3-joerichey94@gmail.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 5 +++-- tools/include/uapi/linux/kvm.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3fd9a7e9d90c..79d9c44d1ad7 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -8,6 +8,7 @@ * Note: you must update KVM_API_VERSION if you change this interface. */ +#include #include #include #include @@ -1879,8 +1880,8 @@ struct kvm_hyperv_eventfd { * conversion after harvesting an entry. Also, it must not skip any * dirty bits, so that dirty bits are always harvested in sequence. */ -#define KVM_DIRTY_GFN_F_DIRTY BIT(0) -#define KVM_DIRTY_GFN_F_RESET BIT(1) +#define KVM_DIRTY_GFN_F_DIRTY _BITUL(0) +#define KVM_DIRTY_GFN_F_RESET _BITUL(1) #define KVM_DIRTY_GFN_F_MASK 0x3 /* diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index f6afee209620..26e6d94d64ed 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -8,6 +8,7 @@ * Note: you must update KVM_API_VERSION if you change this interface. */ +#include #include #include #include @@ -1834,8 +1835,8 @@ struct kvm_hyperv_eventfd { * conversion after harvesting an entry. Also, it must not skip any * dirty bits, so that dirty bits are always harvested in sequence. */ -#define KVM_DIRTY_GFN_F_DIRTY BIT(0) -#define KVM_DIRTY_GFN_F_RESET BIT(1) +#define KVM_DIRTY_GFN_F_DIRTY _BITUL(0) +#define KVM_DIRTY_GFN_F_RESET _BITUL(1) #define KVM_DIRTY_GFN_F_MASK 0x3 /* -- cgit v1.2.3 From fb0f94794bb7558c078ce37b1a6e30d881fd7888 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 26 May 2021 14:36:14 -0400 Subject: selftests: kvm: do only 1 memslot_perf_test run by default The test takes a long time with the current implementation of memslots, so cut the run time a bit. Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/memslot_perf_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 4ae0e5ec0f74..11239652d805 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -992,7 +992,7 @@ int main(int argc, char *argv[]) .tlast = NTESTS - 1, .nslots = -1, .seconds = 5, - .runs = 20, + .runs = 1, }; struct test_result rbestslottime; int tctr; -- cgit v1.2.3 From c59870e2110e1229a6e4b2457aece6ffe8d68d99 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 19 May 2021 09:44:47 -0700 Subject: perf debug: Move debug initialization earlier This avoids segfaults during option handlers that use pr_err. For example, "perf --debug nopager list" segfaults before this change. Fixes: 8abceacff87d (perf debug: Add debug_set_file function) Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210519164447.2672030-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/perf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 20cb91ef06ff..2f6b67189b42 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -443,6 +443,8 @@ int main(int argc, const char **argv) const char *cmd; char sbuf[STRERR_BUFSIZE]; + perf_debug_setup(); + /* libsubcmd init */ exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT); pager_init(PERF_PAGER_ENVIRONMENT); @@ -531,8 +533,6 @@ int main(int argc, const char **argv) */ pthread__block_sigwinch(); - perf_debug_setup(); - while (1) { static int done_help; -- cgit v1.2.3 From c673b7f59e940061467200f1746820a178444bd0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 27 May 2021 15:00:52 -0700 Subject: perf stat: Fix error check for bpf_program__attach It seems the bpf_program__attach() returns a negative error code instead of a NULL pointer in case of error. Fixes: 7fac83aaf2ee ("perf stat: Introduce 'bperf' to share hardware PMCs with BPF") Signed-off-by: Namhyung Kim Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Cc: Song Liu Link: http://lore.kernel.org/lkml/20210527220052.1657578-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index ddb52f748c8e..974f10e356f0 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -451,10 +451,10 @@ static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd, goto out; } - err = -1; link = bpf_program__attach(skel->progs.on_switch); - if (!link) { + if (IS_ERR(link)) { pr_err("Failed to attach leader program\n"); + err = PTR_ERR(link); goto out; } -- cgit v1.2.3 From 8fc4e4aa2bfca8d32e8bc2a01526ea2da450e6cb Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Tue, 25 May 2021 12:07:23 +0530 Subject: perf vendor events powerpc: Fix eventcode of power10 JSON events Fixed the eventcode values in the power10 JSON event files to prepend "0x" since these are hexadecimal values. The patch also changes the event description of the PM_EXEC_STALL_LOAD_FINISH and PM_EXEC_STALL_NTC_FLUSH event and move some events to correct files. Fixes: 32daa5d7899e ("perf vendor events: Initial JSON/events list for power10 platform") Signed-off-by: Kajol Jain Reviewed-by: Paul A. Clarke Tested-by: Nageswara R Sastry Cc: Athira Jajeev Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Ravi Bangoria Cc: linuxppc-dev@lists.ozlabs.org Link: http://lore.kernel.org/lkml/20210525063723.1191514-1-kjain@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/powerpc/power10/cache.json | 30 +++-- .../arch/powerpc/power10/floating_point.json | 2 +- .../pmu-events/arch/powerpc/power10/frontend.json | 124 ++++++++++++------- .../pmu-events/arch/powerpc/power10/locks.json | 4 +- .../pmu-events/arch/powerpc/power10/marked.json | 61 +++++----- .../pmu-events/arch/powerpc/power10/memory.json | 79 ++++++------ .../pmu-events/arch/powerpc/power10/others.json | 133 +++++++++----------- .../pmu-events/arch/powerpc/power10/pipeline.json | 135 ++++++++++----------- .../perf/pmu-events/arch/powerpc/power10/pmc.json | 8 +- .../arch/powerpc/power10/translation.json | 22 ++-- 10 files changed, 299 insertions(+), 299 deletions(-) (limited to 'tools') diff --git a/tools/perf/pmu-events/arch/powerpc/power10/cache.json b/tools/perf/pmu-events/arch/powerpc/power10/cache.json index 616f29098c71..605be14f441c 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/cache.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/cache.json @@ -1,46 +1,56 @@ [ { - "EventCode": "1003C", + "EventCode": "0x1003C", "EventName": "PM_EXEC_STALL_DMISS_L2L3", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from either the local L2 or local L3." }, { - "EventCode": "34056", + "EventCode": "0x1E054", + "EventName": "PM_EXEC_STALL_DMISS_L21_L31", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from another core's L2 or L3 on the same chip." + }, + { + "EventCode": "0x34054", + "EventName": "PM_EXEC_STALL_DMISS_L2L3_NOCONFLICT", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, without a dispatch conflict." + }, + { + "EventCode": "0x34056", "EventName": "PM_EXEC_STALL_LOAD_FINISH", - "BriefDescription": "Cycles in which the oldest instruction in the pipeline was finishing a load after its data was reloaded from a data source beyond the local L1; cycles in which the LSU was processing an L1-hit; cycles in which the NTF instruction merged with another load in the LMQ." + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was finishing a load after its data was reloaded from a data source beyond the local L1; cycles in which the LSU was processing an L1-hit; cycles in which the NTF instruction merged with another load in the LMQ; cycles in which the NTF instruction is waiting for a data reload for a load miss, but the data comes back with a non-NTF instruction." }, { - "EventCode": "3006C", + "EventCode": "0x3006C", "EventName": "PM_RUN_CYC_SMT2_MODE", "BriefDescription": "Cycles when this thread's run latch is set and the core is in SMT2 mode." }, { - "EventCode": "300F4", + "EventCode": "0x300F4", "EventName": "PM_RUN_INST_CMPL_CONC", "BriefDescription": "PowerPC instructions completed by this thread when all threads in the core had the run-latch set." }, { - "EventCode": "4C016", + "EventCode": "0x4C016", "EventName": "PM_EXEC_STALL_DMISS_L2L3_CONFLICT", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, with a dispatch conflict." }, { - "EventCode": "4D014", + "EventCode": "0x4D014", "EventName": "PM_EXEC_STALL_LOAD", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a load instruction executing in the Load Store Unit." }, { - "EventCode": "4D016", + "EventCode": "0x4D016", "EventName": "PM_EXEC_STALL_PTESYNC", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a PTESYNC instruction executing in the Load Store Unit." }, { - "EventCode": "401EA", + "EventCode": "0x401EA", "EventName": "PM_THRESH_EXC_128", "BriefDescription": "Threshold counter exceeded a value of 128." }, { - "EventCode": "400F6", + "EventCode": "0x400F6", "EventName": "PM_BR_MPRED_CMPL", "BriefDescription": "A mispredicted branch completed. Includes direction and target." } diff --git a/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json b/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json index 703cd431ae5b..54acb55e2c8c 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json @@ -1,6 +1,6 @@ [ { - "EventCode": "4016E", + "EventCode": "0x4016E", "EventName": "PM_THRESH_NOT_MET", "BriefDescription": "Threshold counter did not meet threshold." } diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json index eac8609dcc90..558f9530f54e 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json @@ -1,216 +1,246 @@ [ { - "EventCode": "10004", + "EventCode": "0x10004", "EventName": "PM_EXEC_STALL_TRANSLATION", "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered a TLB miss or ERAT miss and waited for it to resolve." }, { - "EventCode": "10010", + "EventCode": "0x10006", + "EventName": "PM_DISP_STALL_HELD_OTHER_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any other reason." + }, + { + "EventCode": "0x10010", "EventName": "PM_PMC4_OVERFLOW", "BriefDescription": "The event selected for PMC4 caused the event counter to overflow." }, { - "EventCode": "10020", + "EventCode": "0x10020", "EventName": "PM_PMC4_REWIND", "BriefDescription": "The speculative event selected for PMC4 rewinds and the counter for PMC4 is not charged." }, { - "EventCode": "10038", + "EventCode": "0x10038", "EventName": "PM_DISP_STALL_TRANSLATION", "BriefDescription": "Cycles when dispatch was stalled for this thread because the MMU was handling a translation miss." }, { - "EventCode": "1003A", + "EventCode": "0x1003A", "EventName": "PM_DISP_STALL_BR_MPRED_IC_L2", "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2 after suffering a branch mispredict." }, { - "EventCode": "1E050", + "EventCode": "0x1D05E", + "EventName": "PM_DISP_STALL_HELD_HALT_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of power management." + }, + { + "EventCode": "0x1E050", "EventName": "PM_DISP_STALL_HELD_STF_MAPPER_CYC", "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the STF mapper/SRB was full. Includes GPR (count, link, tar), VSR, VMR, FPR." }, { - "EventCode": "1F054", + "EventCode": "0x1F054", "EventName": "PM_DTLB_HIT", "BriefDescription": "The PTE required by the instruction was resident in the TLB (data TLB access). When MMCR1[16]=0 this event counts only demand hits. When MMCR1[16]=1 this event includes demand and prefetch. Applies to both HPT and RPT." }, { - "EventCode": "101E8", + "EventCode": "0x10064", + "EventName": "PM_DISP_STALL_IC_L2", + "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2." + }, + { + "EventCode": "0x101E8", "EventName": "PM_THRESH_EXC_256", "BriefDescription": "Threshold counter exceeded a count of 256." }, { - "EventCode": "101EC", + "EventCode": "0x101EC", "EventName": "PM_THRESH_MET", "BriefDescription": "Threshold exceeded." }, { - "EventCode": "100F2", + "EventCode": "0x100F2", "EventName": "PM_1PLUS_PPC_CMPL", "BriefDescription": "Cycles in which at least one instruction is completed by this thread." }, { - "EventCode": "100F6", + "EventCode": "0x100F6", "EventName": "PM_IERAT_MISS", "BriefDescription": "IERAT Reloaded to satisfy an IERAT miss. All page sizes are counted by this event." }, { - "EventCode": "100F8", + "EventCode": "0x100F8", "EventName": "PM_DISP_STALL_CYC", "BriefDescription": "Cycles the ICT has no itags assigned to this thread (no instructions were dispatched during these cycles)." }, { - "EventCode": "20114", + "EventCode": "0x20006", + "EventName": "PM_DISP_STALL_HELD_ISSQ_FULL_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch due to Issue queue full. Includes issue queue and branch queue." + }, + { + "EventCode": "0x20114", "EventName": "PM_MRK_L2_RC_DISP", "BriefDescription": "Marked instruction RC dispatched in L2." }, { - "EventCode": "2C010", + "EventCode": "0x2C010", "EventName": "PM_EXEC_STALL_LSU", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the Load Store Unit. This does not include simple fixed point instructions." }, { - "EventCode": "2C016", + "EventCode": "0x2C016", "EventName": "PM_DISP_STALL_IERAT_ONLY_MISS", "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction ERAT miss." }, { - "EventCode": "2C01E", + "EventCode": "0x2C01E", "EventName": "PM_DISP_STALL_BR_MPRED_IC_L3", "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3 after suffering a branch mispredict." }, { - "EventCode": "2D01A", + "EventCode": "0x2D01A", "EventName": "PM_DISP_STALL_IC_MISS", "BriefDescription": "Cycles when dispatch was stalled for this thread due to an Icache Miss." }, { - "EventCode": "2D01C", - "EventName": "PM_CMPL_STALL_STCX", - "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a stcx waiting for resolution from the nest before completing." - }, - { - "EventCode": "2E018", + "EventCode": "0x2E018", "EventName": "PM_DISP_STALL_FETCH", "BriefDescription": "Cycles when dispatch was stalled for this thread because Fetch was being held." }, { - "EventCode": "2E01A", + "EventCode": "0x2E01A", "EventName": "PM_DISP_STALL_HELD_XVFC_MAPPER_CYC", "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the XVFC mapper/SRB was full." }, { - "EventCode": "2C142", + "EventCode": "0x2C142", "EventName": "PM_MRK_XFER_FROM_SRC_PMC2", "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[15:27]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." }, { - "EventCode": "24050", + "EventCode": "0x24050", "EventName": "PM_IOPS_DISP", "BriefDescription": "Internal Operations dispatched. PM_IOPS_DISP / PM_INST_DISP will show the average number of internal operations per PowerPC instruction." }, { - "EventCode": "2405E", + "EventCode": "0x2405E", "EventName": "PM_ISSUE_CANCEL", "BriefDescription": "An instruction issued and the issue was later cancelled. Only one cancel per PowerPC instruction." }, { - "EventCode": "200FA", + "EventCode": "0x200FA", "EventName": "PM_BR_TAKEN_CMPL", "BriefDescription": "Branch Taken instruction completed." }, { - "EventCode": "30012", + "EventCode": "0x30004", + "EventName": "PM_DISP_STALL_FLUSH", + "BriefDescription": "Cycles when dispatch was stalled because of a flush that happened to an instruction(s) that was not yet NTC. PM_EXEC_STALL_NTC_FLUSH only includes instructions that were flushed after becoming NTC." + }, + { + "EventCode": "0x3000A", + "EventName": "PM_DISP_STALL_ITLB_MISS", + "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction TLB miss." + }, + { + "EventCode": "0x30012", "EventName": "PM_FLUSH_COMPLETION", "BriefDescription": "The instruction that was next to complete (oldest in the pipeline) did not complete because it suffered a flush." }, { - "EventCode": "30014", + "EventCode": "0x30014", "EventName": "PM_EXEC_STALL_STORE", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a store instruction executing in the Load Store Unit." }, { - "EventCode": "30018", + "EventCode": "0x30018", "EventName": "PM_DISP_STALL_HELD_SCOREBOARD_CYC", "BriefDescription": "Cycles in which the NTC instruction is held at dispatch while waiting on the Scoreboard. This event combines VSCR and FPSCR together." }, { - "EventCode": "30026", + "EventCode": "0x30026", "EventName": "PM_EXEC_STALL_STORE_MISS", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a store whose cache line was not resident in the L1 and was waiting for allocation of the missing line into the L1." }, { - "EventCode": "3012A", + "EventCode": "0x3012A", "EventName": "PM_MRK_L2_RC_DONE", "BriefDescription": "L2 RC machine completed the transaction for the marked instruction." }, { - "EventCode": "3F046", + "EventCode": "0x3F046", "EventName": "PM_ITLB_HIT_1G", "BriefDescription": "Instruction TLB hit (IERAT reload) page size 1G, which implies Radix Page Table translation is in use. When MMCR1[17]=0 this event counts only for demand misses. When MMCR1[17]=1 this event includes demand misses and prefetches." }, { - "EventCode": "34058", + "EventCode": "0x34058", "EventName": "PM_DISP_STALL_BR_MPRED_ICMISS", "BriefDescription": "Cycles when dispatch was stalled after a mispredicted branch resulted in an instruction cache miss." }, { - "EventCode": "3D05C", + "EventCode": "0x3D05C", "EventName": "PM_DISP_STALL_HELD_RENAME_CYC", "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the mapper/SRB was full. Includes GPR (count, link, tar), VSR, VMR, FPR and XVFC." }, { - "EventCode": "3E052", + "EventCode": "0x3E052", "EventName": "PM_DISP_STALL_IC_L3", "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3." }, { - "EventCode": "3E054", + "EventCode": "0x3E054", "EventName": "PM_LD_MISS_L1", "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load." }, { - "EventCode": "301EA", + "EventCode": "0x301EA", "EventName": "PM_THRESH_EXC_1024", "BriefDescription": "Threshold counter exceeded a value of 1024." }, { - "EventCode": "300FA", + "EventCode": "0x300FA", "EventName": "PM_INST_FROM_L3MISS", "BriefDescription": "The processor's instruction cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss." }, { - "EventCode": "40006", + "EventCode": "0x40006", "EventName": "PM_ISSUE_KILL", "BriefDescription": "Cycles in which an instruction or group of instructions were cancelled after being issued. This event increments once per occurrence, regardless of how many instructions are included in the issue group." }, { - "EventCode": "40116", + "EventCode": "0x40116", "EventName": "PM_MRK_LARX_FIN", "BriefDescription": "Marked load and reserve instruction (LARX) finished. LARX and STCX are instructions used to acquire a lock." }, { - "EventCode": "4C010", + "EventCode": "0x4C010", "EventName": "PM_DISP_STALL_BR_MPRED_IC_L3MISS", "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from sources beyond the local L3 after suffering a mispredicted branch." }, { - "EventCode": "4D01E", + "EventCode": "0x4D01E", "EventName": "PM_DISP_STALL_BR_MPRED", "BriefDescription": "Cycles when dispatch was stalled for this thread due to a mispredicted branch." }, { - "EventCode": "4E010", + "EventCode": "0x4E010", "EventName": "PM_DISP_STALL_IC_L3MISS", "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from any source beyond the local L3." }, { - "EventCode": "4E01A", + "EventCode": "0x4E01A", "EventName": "PM_DISP_STALL_HELD_CYC", "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any reason." }, { - "EventCode": "44056", + "EventCode": "0x4003C", + "EventName": "PM_DISP_STALL_HELD_SYNC_CYC", + "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of a synchronizing instruction that requires the ICT to be empty before dispatch." + }, + { + "EventCode": "0x44056", "EventName": "PM_VECTOR_ST_CMPL", "BriefDescription": "Vector store instructions completed." } diff --git a/tools/perf/pmu-events/arch/powerpc/power10/locks.json b/tools/perf/pmu-events/arch/powerpc/power10/locks.json index 016d8de0e14a..b5a0d6521963 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/locks.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/locks.json @@ -1,11 +1,11 @@ [ { - "EventCode": "1E058", + "EventCode": "0x1E058", "EventName": "PM_STCX_FAIL_FIN", "BriefDescription": "Conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock." }, { - "EventCode": "4E050", + "EventCode": "0x4E050", "EventName": "PM_STCX_PASS_FIN", "BriefDescription": "Conditional store instruction (STCX) passed. LARX and STCX are instructions used to acquire a lock." } diff --git a/tools/perf/pmu-events/arch/powerpc/power10/marked.json b/tools/perf/pmu-events/arch/powerpc/power10/marked.json index 93a5a5910648..58b5dfe3a273 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/marked.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/marked.json @@ -1,146 +1,141 @@ [ { - "EventCode": "1002C", + "EventCode": "0x1002C", "EventName": "PM_LD_PREFETCH_CACHE_LINE_MISS", "BriefDescription": "The L1 cache was reloaded with a line that fulfills a prefetch request." }, { - "EventCode": "10132", + "EventCode": "0x10132", "EventName": "PM_MRK_INST_ISSUED", "BriefDescription": "Marked instruction issued. Note that stores always get issued twice, the address gets issued to the LSU and the data gets issued to the VSU. Also, issues can sometimes get killed/cancelled and cause multiple sequential issues for the same instruction." }, { - "EventCode": "101E0", + "EventCode": "0x101E0", "EventName": "PM_MRK_INST_DISP", "BriefDescription": "The thread has dispatched a randomly sampled marked instruction." }, { - "EventCode": "101E2", + "EventCode": "0x101E2", "EventName": "PM_MRK_BR_TAKEN_CMPL", "BriefDescription": "Marked Branch Taken instruction completed." }, { - "EventCode": "20112", + "EventCode": "0x20112", "EventName": "PM_MRK_NTF_FIN", "BriefDescription": "The marked instruction became the oldest in the pipeline before it finished. It excludes instructions that finish at dispatch." }, { - "EventCode": "2C01C", + "EventCode": "0x2C01C", "EventName": "PM_EXEC_STALL_DMISS_OFF_CHIP", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a remote chip." }, { - "EventCode": "20138", + "EventCode": "0x20138", "EventName": "PM_MRK_ST_NEST", "BriefDescription": "A store has been sampled/marked and is at the point of execution where it has completed in the core and can no longer be flushed. At this point the store is sent to the L2." }, { - "EventCode": "2013A", + "EventCode": "0x2013A", "EventName": "PM_MRK_BRU_FIN", "BriefDescription": "Marked Branch instruction finished." }, { - "EventCode": "2C144", + "EventCode": "0x2C144", "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC2", "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[15:27]." }, { - "EventCode": "24156", + "EventCode": "0x24156", "EventName": "PM_MRK_STCX_FIN", "BriefDescription": "Marked conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock." }, { - "EventCode": "24158", + "EventCode": "0x24158", "EventName": "PM_MRK_INST", "BriefDescription": "An instruction was marked. Includes both Random Instruction Sampling (RIS) at decode time and Random Event Sampling (RES) at the time the configured event happens." }, { - "EventCode": "2415C", + "EventCode": "0x2415C", "EventName": "PM_MRK_BR_CMPL", "BriefDescription": "A marked branch completed. All branches are included." }, { - "EventCode": "200FD", + "EventCode": "0x200FD", "EventName": "PM_L1_ICACHE_MISS", "BriefDescription": "Demand iCache Miss." }, { - "EventCode": "30130", + "EventCode": "0x30130", "EventName": "PM_MRK_INST_FIN", "BriefDescription": "marked instruction finished. Excludes instructions that finish at dispatch. Note that stores always finish twice since the address gets issued to the LSU and the data gets issued to the VSU." }, { - "EventCode": "34146", + "EventCode": "0x34146", "EventName": "PM_MRK_LD_CMPL", "BriefDescription": "Marked loads completed." }, { - "EventCode": "3E158", + "EventCode": "0x3E158", "EventName": "PM_MRK_STCX_FAIL", "BriefDescription": "Marked conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock." }, { - "EventCode": "3E15A", + "EventCode": "0x3E15A", "EventName": "PM_MRK_ST_FIN", "BriefDescription": "The marked instruction was a store of any kind." }, { - "EventCode": "30068", + "EventCode": "0x30068", "EventName": "PM_L1_ICACHE_RELOADED_PREF", "BriefDescription": "Counts all Icache prefetch reloads ( includes demand turned into prefetch)." }, { - "EventCode": "301E4", + "EventCode": "0x301E4", "EventName": "PM_MRK_BR_MPRED_CMPL", "BriefDescription": "Marked Branch Mispredicted. Includes direction and target." }, { - "EventCode": "300F6", + "EventCode": "0x300F6", "EventName": "PM_LD_DEMAND_MISS_L1", "BriefDescription": "The L1 cache was reloaded with a line that fulfills a demand miss request. Counted at reload time, before finish." }, { - "EventCode": "300FE", + "EventCode": "0x300FE", "EventName": "PM_DATA_FROM_L3MISS", "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss." }, { - "EventCode": "40012", + "EventCode": "0x40012", "EventName": "PM_L1_ICACHE_RELOADED_ALL", "BriefDescription": "Counts all Icache reloads includes demand, prefetch, prefetch turned into demand and demand turned into prefetch." }, { - "EventCode": "40134", + "EventCode": "0x40134", "EventName": "PM_MRK_INST_TIMEO", "BriefDescription": "Marked instruction finish timeout (instruction was lost)." }, { - "EventCode": "4003C", - "EventName": "PM_DISP_STALL_HELD_SYNC_CYC", - "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of a synchronizing instruction that requires the ICT to be empty before dispatch." - }, - { - "EventCode": "4505A", + "EventCode": "0x4505A", "EventName": "PM_SP_FLOP_CMPL", "BriefDescription": "Single Precision floating point instructions completed." }, { - "EventCode": "4D058", + "EventCode": "0x4D058", "EventName": "PM_VECTOR_FLOP_CMPL", "BriefDescription": "Vector floating point instructions completed." }, { - "EventCode": "4D05A", + "EventCode": "0x4D05A", "EventName": "PM_NON_MATH_FLOP_CMPL", "BriefDescription": "Non Math instructions completed." }, { - "EventCode": "401E0", + "EventCode": "0x401E0", "EventName": "PM_MRK_INST_CMPL", "BriefDescription": "marked instruction completed." }, { - "EventCode": "400FE", + "EventCode": "0x400FE", "EventName": "PM_DATA_FROM_MEMORY", "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss." } diff --git a/tools/perf/pmu-events/arch/powerpc/power10/memory.json b/tools/perf/pmu-events/arch/powerpc/power10/memory.json index b01141eeebee..843b51f531e9 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/memory.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/memory.json @@ -1,191 +1,186 @@ [ { - "EventCode": "1000A", + "EventCode": "0x1000A", "EventName": "PM_PMC3_REWIND", "BriefDescription": "The speculative event selected for PMC3 rewinds and the counter for PMC3 is not charged." }, { - "EventCode": "1C040", + "EventCode": "0x1C040", "EventName": "PM_XFER_FROM_SRC_PMC1", "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[0:12]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." }, { - "EventCode": "1C142", + "EventCode": "0x1C142", "EventName": "PM_MRK_XFER_FROM_SRC_PMC1", "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[0:12]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." }, { - "EventCode": "1C144", + "EventCode": "0x1C144", "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC1", "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[0:12]." }, { - "EventCode": "1C056", + "EventCode": "0x1C056", "EventName": "PM_DERAT_MISS_4K", "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 4K. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "1C058", + "EventCode": "0x1C058", "EventName": "PM_DTLB_MISS_16G", "BriefDescription": "Data TLB reload (after a miss) page size 16G. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "1C05C", + "EventCode": "0x1C05C", "EventName": "PM_DTLB_MISS_2M", "BriefDescription": "Data TLB reload (after a miss) page size 2M. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "1E056", + "EventCode": "0x1E056", "EventName": "PM_EXEC_STALL_STORE_PIPE", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the store unit. This does not include cycles spent handling store misses, PTESYNC instructions or TLBIE instructions." }, { - "EventCode": "1F150", + "EventCode": "0x1F150", "EventName": "PM_MRK_ST_L2_CYC", "BriefDescription": "Cycles from L2 RC dispatch to L2 RC completion." }, { - "EventCode": "10062", + "EventCode": "0x10062", "EventName": "PM_LD_L3MISS_PEND_CYC", "BriefDescription": "Cycles L3 miss was pending for this thread." }, { - "EventCode": "20010", + "EventCode": "0x20010", "EventName": "PM_PMC1_OVERFLOW", "BriefDescription": "The event selected for PMC1 caused the event counter to overflow." }, { - "EventCode": "2001A", + "EventCode": "0x2001A", "EventName": "PM_ITLB_HIT", "BriefDescription": "The PTE required to translate the instruction address was resident in the TLB (instruction TLB access/IERAT reload). Applies to both HPT and RPT. When MMCR1[17]=0 this event counts only for demand misses. When MMCR1[17]=1 this event includes demand misses and prefetches." }, { - "EventCode": "2003E", + "EventCode": "0x2003E", "EventName": "PM_PTESYNC_FIN", "BriefDescription": "Ptesync instruction finished in the store unit. Only one ptesync can finish at a time." }, { - "EventCode": "2C040", + "EventCode": "0x2C040", "EventName": "PM_XFER_FROM_SRC_PMC2", "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[15:27]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." }, { - "EventCode": "2C054", + "EventCode": "0x2C054", "EventName": "PM_DERAT_MISS_64K", "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 64K. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "2C056", + "EventCode": "0x2C056", "EventName": "PM_DTLB_MISS_4K", "BriefDescription": "Data TLB reload (after a miss) page size 4K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "2D154", + "EventCode": "0x2D154", "EventName": "PM_MRK_DERAT_MISS_64K", "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 64K for a marked instruction. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "200F6", + "EventCode": "0x200F6", "EventName": "PM_DERAT_MISS", "BriefDescription": "DERAT Reloaded to satisfy a DERAT miss. All page sizes are counted by this event. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "3000A", - "EventName": "PM_DISP_STALL_ITLB_MISS", - "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction TLB miss." - }, - { - "EventCode": "30016", + "EventCode": "0x30016", "EventName": "PM_EXEC_STALL_DERAT_DTLB_MISS", "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered a TLB miss and waited for it resolve." }, { - "EventCode": "3C040", + "EventCode": "0x3C040", "EventName": "PM_XFER_FROM_SRC_PMC3", "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." }, { - "EventCode": "3C142", + "EventCode": "0x3C142", "EventName": "PM_MRK_XFER_FROM_SRC_PMC3", "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." }, { - "EventCode": "3C144", + "EventCode": "0x3C144", "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC3", "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[30:42]." }, { - "EventCode": "3C054", + "EventCode": "0x3C054", "EventName": "PM_DERAT_MISS_16M", "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 16M. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "3C056", + "EventCode": "0x3C056", "EventName": "PM_DTLB_MISS_64K", "BriefDescription": "Data TLB reload (after a miss) page size 64K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "3C058", + "EventCode": "0x3C058", "EventName": "PM_LARX_FIN", "BriefDescription": "Load and reserve instruction (LARX) finished. LARX and STCX are instructions used to acquire a lock." }, { - "EventCode": "301E2", + "EventCode": "0x301E2", "EventName": "PM_MRK_ST_CMPL", "BriefDescription": "Marked store completed and sent to nest. Note that this count excludes cache-inhibited stores." }, { - "EventCode": "300FC", + "EventCode": "0x300FC", "EventName": "PM_DTLB_MISS", "BriefDescription": "The DPTEG required for the load/store instruction in execution was missing from the TLB. It includes pages of all sizes for demand and prefetch activity." }, { - "EventCode": "4D02C", + "EventCode": "0x4D02C", "EventName": "PM_PMC1_REWIND", "BriefDescription": "The speculative event selected for PMC1 rewinds and the counter for PMC1 is not charged." }, { - "EventCode": "4003E", + "EventCode": "0x4003E", "EventName": "PM_LD_CMPL", "BriefDescription": "Loads completed." }, { - "EventCode": "4C040", + "EventCode": "0x4C040", "EventName": "PM_XFER_FROM_SRC_PMC4", "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[45:57]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." }, { - "EventCode": "4C142", + "EventCode": "0x4C142", "EventName": "PM_MRK_XFER_FROM_SRC_PMC4", "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[45:57]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads." }, { - "EventCode": "4C144", + "EventCode": "0x4C144", "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC4", "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[45:57]." }, { - "EventCode": "4C056", + "EventCode": "0x4C056", "EventName": "PM_DTLB_MISS_16M", "BriefDescription": "Data TLB reload (after a miss) page size 16M. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "4C05A", + "EventCode": "0x4C05A", "EventName": "PM_DTLB_MISS_1G", "BriefDescription": "Data TLB reload (after a miss) page size 1G. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "4C15E", + "EventCode": "0x4C15E", "EventName": "PM_MRK_DTLB_MISS_64K", "BriefDescription": "Marked Data TLB reload (after a miss) page size 64K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "4D056", + "EventCode": "0x4D056", "EventName": "PM_NON_FMA_FLOP_CMPL", "BriefDescription": "Non FMA instruction completed." }, { - "EventCode": "40164", + "EventCode": "0x40164", "EventName": "PM_MRK_DERAT_MISS_2M", "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 2M for a marked instruction. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." } diff --git a/tools/perf/pmu-events/arch/powerpc/power10/others.json b/tools/perf/pmu-events/arch/powerpc/power10/others.json index a119e56cbf1c..7d0de1a2860b 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/others.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/others.json @@ -1,296 +1,271 @@ [ { - "EventCode": "10016", + "EventCode": "0x10016", "EventName": "PM_VSU0_ISSUE", "BriefDescription": "VSU instructions issued to VSU pipe 0." }, { - "EventCode": "1001C", + "EventCode": "0x1001C", "EventName": "PM_ULTRAVISOR_INST_CMPL", "BriefDescription": "PowerPC instructions that completed while the thread was in ultravisor state." }, { - "EventCode": "100F0", + "EventCode": "0x100F0", "EventName": "PM_CYC", "BriefDescription": "Processor cycles." }, { - "EventCode": "10134", + "EventCode": "0x10134", "EventName": "PM_MRK_ST_DONE_L2", "BriefDescription": "Marked stores completed in L2 (RC machine done)." }, { - "EventCode": "1505E", + "EventCode": "0x1505E", "EventName": "PM_LD_HIT_L1", "BriefDescription": "Loads that finished without experiencing an L1 miss." }, { - "EventCode": "1D05E", - "EventName": "PM_DISP_STALL_HELD_HALT_CYC", - "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of power management." - }, - { - "EventCode": "1E054", - "EventName": "PM_EXEC_STALL_DMISS_L21_L31", - "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from another core's L2 or L3 on the same chip." - }, - { - "EventCode": "1E05A", - "EventName": "PM_CMPL_STALL_LWSYNC", - "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a lwsync waiting to complete." - }, - { - "EventCode": "1F056", + "EventCode": "0x1F056", "EventName": "PM_DISP_SS0_2_INSTR_CYC", "BriefDescription": "Cycles in which Superslice 0 dispatches either 1 or 2 instructions." }, { - "EventCode": "1F15C", + "EventCode": "0x1F15C", "EventName": "PM_MRK_STCX_L2_CYC", "BriefDescription": "Cycles spent in the nest portion of a marked Stcx instruction. It starts counting when the operation starts to drain to the L2 and it stops counting when the instruction retires from the Instruction Completion Table (ICT) in the Instruction Sequencing Unit (ISU)." }, { - "EventCode": "10066", + "EventCode": "0x10066", "EventName": "PM_ADJUNCT_CYC", "BriefDescription": "Cycles in which the thread is in Adjunct state. MSR[S HV PR] bits = 011." }, { - "EventCode": "101E4", + "EventCode": "0x101E4", "EventName": "PM_MRK_L1_ICACHE_MISS", "BriefDescription": "Marked Instruction suffered an icache Miss." }, { - "EventCode": "101EA", + "EventCode": "0x101EA", "EventName": "PM_MRK_L1_RELOAD_VALID", "BriefDescription": "Marked demand reload." }, { - "EventCode": "100F4", + "EventCode": "0x100F4", "EventName": "PM_FLOP_CMPL", "BriefDescription": "Floating Point Operations Completed. Includes any type. It counts once for each 1, 2, 4 or 8 flop instruction. Use PM_1|2|4|8_FLOP_CMPL events to count flops." }, { - "EventCode": "100FA", + "EventCode": "0x100FA", "EventName": "PM_RUN_LATCH_ANY_THREAD_CYC", "BriefDescription": "Cycles when at least one thread has the run latch set." }, { - "EventCode": "100FC", + "EventCode": "0x100FC", "EventName": "PM_LD_REF_L1", "BriefDescription": "All L1 D cache load references counted at finish, gated by reject. In P9 and earlier this event counted only cacheable loads but in P10 both cacheable and non-cacheable loads are included." }, { - "EventCode": "20006", - "EventName": "PM_DISP_STALL_HELD_ISSQ_FULL_CYC", - "BriefDescription": "Cycles in which the NTC instruction is held at dispatch due to Issue queue full. Includes issue queue and branch queue." - }, - { - "EventCode": "2000C", + "EventCode": "0x2000C", "EventName": "PM_RUN_LATCH_ALL_THREADS_CYC", "BriefDescription": "Cycles when the run latch is set for all threads." }, { - "EventCode": "2E010", + "EventCode": "0x2E010", "EventName": "PM_ADJUNCT_INST_CMPL", "BriefDescription": "PowerPC instructions that completed while the thread is in Adjunct state." }, { - "EventCode": "2E014", + "EventCode": "0x2E014", "EventName": "PM_STCX_FIN", "BriefDescription": "Conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock." }, { - "EventCode": "20130", + "EventCode": "0x20130", "EventName": "PM_MRK_INST_DECODED", "BriefDescription": "An instruction was marked at decode time. Random Instruction Sampling (RIS) only." }, { - "EventCode": "20132", + "EventCode": "0x20132", "EventName": "PM_MRK_DFU_ISSUE", "BriefDescription": "The marked instruction was a decimal floating point operation issued to the VSU. Measured at issue time." }, { - "EventCode": "20134", + "EventCode": "0x20134", "EventName": "PM_MRK_FXU_ISSUE", "BriefDescription": "The marked instruction was a fixed point operation issued to the VSU. Measured at issue time." }, { - "EventCode": "2505C", + "EventCode": "0x2505C", "EventName": "PM_VSU_ISSUE", "BriefDescription": "At least one VSU instruction was issued to one of the VSU pipes. Up to 4 per cycle. Includes fixed point operations." }, { - "EventCode": "2F054", + "EventCode": "0x2F054", "EventName": "PM_DISP_SS1_2_INSTR_CYC", "BriefDescription": "Cycles in which Superslice 1 dispatches either 1 or 2 instructions." }, { - "EventCode": "2F056", + "EventCode": "0x2F056", "EventName": "PM_DISP_SS1_4_INSTR_CYC", "BriefDescription": "Cycles in which Superslice 1 dispatches either 3 or 4 instructions." }, { - "EventCode": "2006C", + "EventCode": "0x2006C", "EventName": "PM_RUN_CYC_SMT4_MODE", "BriefDescription": "Cycles when this thread's run latch is set and the core is in SMT4 mode." }, { - "EventCode": "201E0", + "EventCode": "0x201E0", "EventName": "PM_MRK_DATA_FROM_MEMORY", "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss for a marked load." }, { - "EventCode": "201E4", + "EventCode": "0x201E4", "EventName": "PM_MRK_DATA_FROM_L3MISS", "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss for a marked load." }, { - "EventCode": "201E8", + "EventCode": "0x201E8", "EventName": "PM_THRESH_EXC_512", "BriefDescription": "Threshold counter exceeded a value of 512." }, { - "EventCode": "200F2", + "EventCode": "0x200F2", "EventName": "PM_INST_DISP", "BriefDescription": "PowerPC instructions dispatched." }, { - "EventCode": "30132", + "EventCode": "0x30132", "EventName": "PM_MRK_VSU_FIN", "BriefDescription": "VSU marked instructions finished. Excludes simple FX instructions issued to the Store Unit." }, { - "EventCode": "30038", + "EventCode": "0x30038", "EventName": "PM_EXEC_STALL_DMISS_LMEM", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local memory, local OpenCapp cache, or local OpenCapp memory." }, { - "EventCode": "3F04A", + "EventCode": "0x3F04A", "EventName": "PM_LSU_ST5_FIN", "BriefDescription": "LSU Finished an internal operation in ST2 port." }, { - "EventCode": "34054", - "EventName": "PM_EXEC_STALL_DMISS_L2L3_NOCONFLICT", - "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, without a dispatch conflict." - }, - { - "EventCode": "3405A", + "EventCode": "0x3405A", "EventName": "PM_PRIVILEGED_INST_CMPL", "BriefDescription": "PowerPC Instructions that completed while the thread is in Privileged state." }, { - "EventCode": "3F150", + "EventCode": "0x3F150", "EventName": "PM_MRK_ST_DRAIN_CYC", "BriefDescription": "cycles to drain st from core to L2." }, { - "EventCode": "3F054", + "EventCode": "0x3F054", "EventName": "PM_DISP_SS0_4_INSTR_CYC", "BriefDescription": "Cycles in which Superslice 0 dispatches either 3 or 4 instructions." }, { - "EventCode": "3F056", + "EventCode": "0x3F056", "EventName": "PM_DISP_SS0_8_INSTR_CYC", "BriefDescription": "Cycles in which Superslice 0 dispatches either 5, 6, 7 or 8 instructions." }, { - "EventCode": "30162", + "EventCode": "0x30162", "EventName": "PM_MRK_ISSUE_DEPENDENT_LOAD", "BriefDescription": "The marked instruction was dependent on a load. It is eligible for issue kill." }, { - "EventCode": "40114", + "EventCode": "0x40114", "EventName": "PM_MRK_START_PROBE_NOP_DISP", "BriefDescription": "Marked Start probe nop dispatched. Instruction AND R0,R0,R0." }, { - "EventCode": "4001C", + "EventCode": "0x4001C", "EventName": "PM_VSU_FIN", "BriefDescription": "VSU instructions finished." }, { - "EventCode": "4C01A", + "EventCode": "0x4C01A", "EventName": "PM_EXEC_STALL_DMISS_OFF_NODE", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a distant chip." }, { - "EventCode": "4D012", + "EventCode": "0x4D012", "EventName": "PM_PMC3_SAVED", "BriefDescription": "The conditions for the speculative event selected for PMC3 are met and PMC3 is charged." }, { - "EventCode": "4D022", + "EventCode": "0x4D022", "EventName": "PM_HYPERVISOR_INST_CMPL", "BriefDescription": "PowerPC instructions that completed while the thread is in hypervisor state." }, { - "EventCode": "4D026", + "EventCode": "0x4D026", "EventName": "PM_ULTRAVISOR_CYC", "BriefDescription": "Cycles when the thread is in Ultravisor state. MSR[S HV PR]=110." }, { - "EventCode": "4D028", + "EventCode": "0x4D028", "EventName": "PM_PRIVILEGED_CYC", "BriefDescription": "Cycles when the thread is in Privileged state. MSR[S HV PR]=x00." }, { - "EventCode": "40030", + "EventCode": "0x40030", "EventName": "PM_INST_FIN", "BriefDescription": "Instructions finished." }, { - "EventCode": "44146", + "EventCode": "0x44146", "EventName": "PM_MRK_STCX_CORE_CYC", "BriefDescription": "Cycles spent in the core portion of a marked Stcx instruction. It starts counting when the instruction is decoded and stops counting when it drains into the L2." }, { - "EventCode": "44054", + "EventCode": "0x44054", "EventName": "PM_VECTOR_LD_CMPL", "BriefDescription": "Vector load instructions completed." }, { - "EventCode": "45054", + "EventCode": "0x45054", "EventName": "PM_FMA_CMPL", "BriefDescription": "Two floating point instructions completed (FMA class of instructions: fmadd, fnmadd, fmsub, fnmsub). Scalar instructions only." }, { - "EventCode": "45056", + "EventCode": "0x45056", "EventName": "PM_SCALAR_FLOP_CMPL", "BriefDescription": "Scalar floating point instructions completed." }, { - "EventCode": "4505C", + "EventCode": "0x4505C", "EventName": "PM_MATH_FLOP_CMPL", "BriefDescription": "Math floating point instructions completed." }, { - "EventCode": "4D05E", + "EventCode": "0x4D05E", "EventName": "PM_BR_CMPL", "BriefDescription": "A branch completed. All branches are included." }, { - "EventCode": "4E15E", + "EventCode": "0x4E15E", "EventName": "PM_MRK_INST_FLUSHED", "BriefDescription": "The marked instruction was flushed." }, { - "EventCode": "401E6", + "EventCode": "0x401E6", "EventName": "PM_MRK_INST_FROM_L3MISS", "BriefDescription": "The processor's instruction cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss for a marked instruction." }, { - "EventCode": "401E8", + "EventCode": "0x401E8", "EventName": "PM_MRK_DATA_FROM_L2MISS", "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1 or L2 due to a demand miss for a marked load." }, { - "EventCode": "400F0", + "EventCode": "0x400F0", "EventName": "PM_LD_DEMAND_MISS_L1_FIN", "BriefDescription": "Load Missed L1, counted at finish time." }, { - "EventCode": "400FA", + "EventCode": "0x400FA", "EventName": "PM_RUN_INST_CMPL", "BriefDescription": "Completed PowerPC instructions gated by the run latch." } diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json index b61b5cc157ee..b8aded6045fa 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json @@ -1,296 +1,291 @@ [ { - "EventCode": "100FE", + "EventCode": "0x100FE", "EventName": "PM_INST_CMPL", "BriefDescription": "PowerPC instructions completed." }, { - "EventCode": "10006", - "EventName": "PM_DISP_STALL_HELD_OTHER_CYC", - "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any other reason." - }, - { - "EventCode": "1000C", + "EventCode": "0x1000C", "EventName": "PM_LSU_LD0_FIN", "BriefDescription": "LSU Finished an internal operation in LD0 port." }, { - "EventCode": "1000E", + "EventCode": "0x1000E", "EventName": "PM_MMA_ISSUED", "BriefDescription": "MMA instructions issued." }, { - "EventCode": "10012", + "EventCode": "0x10012", "EventName": "PM_LSU_ST0_FIN", "BriefDescription": "LSU Finished an internal operation in ST0 port." }, { - "EventCode": "10014", + "EventCode": "0x10014", "EventName": "PM_LSU_ST4_FIN", "BriefDescription": "LSU Finished an internal operation in ST4 port." }, { - "EventCode": "10018", + "EventCode": "0x10018", "EventName": "PM_IC_DEMAND_CYC", "BriefDescription": "Cycles in which an instruction reload is pending to satisfy a demand miss." }, { - "EventCode": "10022", + "EventCode": "0x10022", "EventName": "PM_PMC2_SAVED", "BriefDescription": "The conditions for the speculative event selected for PMC2 are met and PMC2 is charged." }, { - "EventCode": "10024", + "EventCode": "0x10024", "EventName": "PM_PMC5_OVERFLOW", "BriefDescription": "The event selected for PMC5 caused the event counter to overflow." }, { - "EventCode": "10058", + "EventCode": "0x10058", "EventName": "PM_EXEC_STALL_FIN_AT_DISP", "BriefDescription": "Cycles in which the oldest instruction in the pipeline finished at dispatch and did not require execution in the LSU, BRU or VSU." }, { - "EventCode": "1005A", + "EventCode": "0x1005A", "EventName": "PM_FLUSH_MPRED", "BriefDescription": "A flush occurred due to a mispredicted branch. Includes target and direction." }, { - "EventCode": "1C05A", + "EventCode": "0x1C05A", "EventName": "PM_DERAT_MISS_2M", "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 2M. Implies radix translation. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches." }, { - "EventCode": "10064", - "EventName": "PM_DISP_STALL_IC_L2", - "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2." + "EventCode": "0x1E05A", + "EventName": "PM_CMPL_STALL_LWSYNC", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a lwsync waiting to complete." }, { - "EventCode": "10068", + "EventCode": "0x10068", "EventName": "PM_BR_FIN", "BriefDescription": "A branch instruction finished. Includes predicted/mispredicted/unconditional." }, { - "EventCode": "1006A", + "EventCode": "0x1006A", "EventName": "PM_FX_LSU_FIN", "BriefDescription": "Simple fixed point instruction issued to the store unit. Measured at finish time." }, { - "EventCode": "1006C", + "EventCode": "0x1006C", "EventName": "PM_RUN_CYC_ST_MODE", "BriefDescription": "Cycles when the run latch is set and the core is in ST mode." }, { - "EventCode": "20004", + "EventCode": "0x20004", "EventName": "PM_ISSUE_STALL", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was dispatched but not issued yet." }, { - "EventCode": "2000A", + "EventCode": "0x2000A", "EventName": "PM_HYPERVISOR_CYC", "BriefDescription": "Cycles when the thread is in Hypervisor state. MSR[S HV PR]=010." }, { - "EventCode": "2000E", + "EventCode": "0x2000E", "EventName": "PM_LSU_LD1_FIN", "BriefDescription": "LSU Finished an internal operation in LD1 port." }, { - "EventCode": "2C014", + "EventCode": "0x2C014", "EventName": "PM_CMPL_STALL_SPECIAL", "BriefDescription": "Cycles in which the oldest instruction in the pipeline required special handling before completing." }, { - "EventCode": "2C018", + "EventCode": "0x2C018", "EventName": "PM_EXEC_STALL_DMISS_L3MISS", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a source beyond the local L2 or local L3." }, { - "EventCode": "2D010", + "EventCode": "0x2D010", "EventName": "PM_LSU_ST1_FIN", "BriefDescription": "LSU Finished an internal operation in ST1 port." }, { - "EventCode": "2D012", + "EventCode": "0x2D012", "EventName": "PM_VSU1_ISSUE", "BriefDescription": "VSU instructions issued to VSU pipe 1." }, { - "EventCode": "2D018", + "EventCode": "0x2D018", "EventName": "PM_EXEC_STALL_VSU", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the VSU (includes FXU, VSU, CRU)." }, { - "EventCode": "2E01E", + "EventCode": "0x2D01C", + "EventName": "PM_CMPL_STALL_STCX", + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a stcx waiting for resolution from the nest before completing." + }, + { + "EventCode": "0x2E01E", "EventName": "PM_EXEC_STALL_NTC_FLUSH", - "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in any unit before it was flushed. Note that if the flush of the oldest instruction happens after finish, the cycles from dispatch to issue will be included in PM_DISP_STALL and the cycles from issue to finish will be included in PM_EXEC_STALL and its corresponding children." + "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in any unit before it was flushed. Note that if the flush of the oldest instruction happens after finish, the cycles from dispatch to issue will be included in PM_DISP_STALL and the cycles from issue to finish will be included in PM_EXEC_STALL and its corresponding children. This event will also count cycles when the previous NTF instruction is still completing and the new NTF instruction is stalled at dispatch." }, { - "EventCode": "2013C", + "EventCode": "0x2013C", "EventName": "PM_MRK_FX_LSU_FIN", "BriefDescription": "The marked instruction was simple fixed point that was issued to the store unit. Measured at finish time." }, { - "EventCode": "2405A", + "EventCode": "0x2405A", "EventName": "PM_NTC_FIN", "BriefDescription": "Cycles in which the oldest instruction in the pipeline (NTC) finishes. Note that instructions can finish out of order, therefore not all the instructions that finish have a Next-to-complete status." }, { - "EventCode": "201E2", + "EventCode": "0x201E2", "EventName": "PM_MRK_LD_MISS_L1", "BriefDescription": "Marked DL1 Demand Miss counted at finish time." }, { - "EventCode": "200F4", + "EventCode": "0x200F4", "EventName": "PM_RUN_CYC", "BriefDescription": "Processor cycles gated by the run latch." }, { - "EventCode": "30004", - "EventName": "PM_DISP_STALL_FLUSH", - "BriefDescription": "Cycles when dispatch was stalled because of a flush that happened to an instruction(s) that was not yet NTC. PM_EXEC_STALL_NTC_FLUSH only includes instructions that were flushed after becoming NTC." - }, - { - "EventCode": "30008", + "EventCode": "0x30008", "EventName": "PM_EXEC_STALL", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting to finish in one of the execution units (BRU, LSU, VSU). Only cycles between issue and finish are counted in this category." }, { - "EventCode": "3001A", + "EventCode": "0x3001A", "EventName": "PM_LSU_ST2_FIN", "BriefDescription": "LSU Finished an internal operation in ST2 port." }, { - "EventCode": "30020", + "EventCode": "0x30020", "EventName": "PM_PMC2_REWIND", "BriefDescription": "The speculative event selected for PMC2 rewinds and the counter for PMC2 is not charged." }, { - "EventCode": "30022", + "EventCode": "0x30022", "EventName": "PM_PMC4_SAVED", "BriefDescription": "The conditions for the speculative event selected for PMC4 are met and PMC4 is charged." }, { - "EventCode": "30024", + "EventCode": "0x30024", "EventName": "PM_PMC6_OVERFLOW", "BriefDescription": "The event selected for PMC6 caused the event counter to overflow." }, { - "EventCode": "30028", + "EventCode": "0x30028", "EventName": "PM_CMPL_STALL_MEM_ECC", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for the non-speculative finish of either a stcx waiting for its result or a load waiting for non-critical sectors of data and ECC." }, { - "EventCode": "30036", + "EventCode": "0x30036", "EventName": "PM_EXEC_STALL_SIMPLE_FX", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a simple fixed point instruction executing in the Load Store Unit." }, { - "EventCode": "3003A", + "EventCode": "0x3003A", "EventName": "PM_CMPL_STALL_EXCEPTION", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was not allowed to complete because it was interrupted by ANY exception, which has to be serviced before the instruction can complete." }, { - "EventCode": "3F044", + "EventCode": "0x3F044", "EventName": "PM_VSU2_ISSUE", "BriefDescription": "VSU instructions issued to VSU pipe 2." }, { - "EventCode": "30058", + "EventCode": "0x30058", "EventName": "PM_TLBIE_FIN", "BriefDescription": "TLBIE instructions finished in the LSU. Two TLBIEs can finish each cycle. All will be counted." }, { - "EventCode": "3D058", + "EventCode": "0x3D058", "EventName": "PM_SCALAR_FSQRT_FDIV_ISSUE", "BriefDescription": "Scalar versions of four floating point operations: fdiv,fsqrt (xvdivdp, xvdivsp, xvsqrtdp, xvsqrtsp)." }, { - "EventCode": "30066", + "EventCode": "0x30066", "EventName": "PM_LSU_FIN", "BriefDescription": "LSU Finished an internal operation (up to 4 per cycle)." }, { - "EventCode": "40004", + "EventCode": "0x40004", "EventName": "PM_FXU_ISSUE", "BriefDescription": "A fixed point instruction was issued to the VSU." }, { - "EventCode": "40008", + "EventCode": "0x40008", "EventName": "PM_NTC_ALL_FIN", "BriefDescription": "Cycles in which both instructions in the ICT entry pair show as finished. These are the cycles between finish and completion for the oldest pair of instructions in the pipeline." }, { - "EventCode": "40010", + "EventCode": "0x40010", "EventName": "PM_PMC3_OVERFLOW", "BriefDescription": "The event selected for PMC3 caused the event counter to overflow." }, { - "EventCode": "4C012", + "EventCode": "0x4C012", "EventName": "PM_EXEC_STALL_DERAT_ONLY_MISS", "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered an ERAT miss and waited for it resolve." }, { - "EventCode": "4C018", + "EventCode": "0x4C018", "EventName": "PM_CMPL_STALL", "BriefDescription": "Cycles in which the oldest instruction in the pipeline cannot complete because the thread was blocked for any reason." }, { - "EventCode": "4C01E", + "EventCode": "0x4C01E", "EventName": "PM_LSU_ST3_FIN", "BriefDescription": "LSU Finished an internal operation in ST3 port." }, { - "EventCode": "4D018", + "EventCode": "0x4D018", "EventName": "PM_EXEC_STALL_BRU", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the Branch unit." }, { - "EventCode": "4D01A", + "EventCode": "0x4D01A", "EventName": "PM_CMPL_STALL_HWSYNC", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a hwsync waiting for response from L2 before completing." }, { - "EventCode": "4D01C", + "EventCode": "0x4D01C", "EventName": "PM_EXEC_STALL_TLBIEL", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a TLBIEL instruction executing in the Load Store Unit. TLBIEL instructions have lower overhead than TLBIE instructions because they don't get set to the nest." }, { - "EventCode": "4E012", + "EventCode": "0x4E012", "EventName": "PM_EXEC_STALL_UNKNOWN", "BriefDescription": "Cycles in which the oldest instruction in the pipeline completed without an ntf_type pulse. The ntf_pulse was missed by the ISU because the NTF finishes and completions came too close together." }, { - "EventCode": "4D020", + "EventCode": "0x4D020", "EventName": "PM_VSU3_ISSUE", "BriefDescription": "VSU instruction was issued to VSU pipe 3." }, { - "EventCode": "40132", + "EventCode": "0x40132", "EventName": "PM_MRK_LSU_FIN", "BriefDescription": "LSU marked instruction finish." }, { - "EventCode": "45058", + "EventCode": "0x45058", "EventName": "PM_IC_MISS_CMPL", "BriefDescription": "Non-speculative icache miss, counted at completion." }, { - "EventCode": "4D050", + "EventCode": "0x4D050", "EventName": "PM_VSU_NON_FLOP_CMPL", "BriefDescription": "Non-floating point VSU instructions completed." }, { - "EventCode": "4D052", + "EventCode": "0x4D052", "EventName": "PM_2FLOP_CMPL", "BriefDescription": "Double Precision vector version of fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg completed." }, { - "EventCode": "400F2", + "EventCode": "0x400F2", "EventName": "PM_1PLUS_PPC_DISP", "BriefDescription": "Cycles at least one Instr Dispatched." }, { - "EventCode": "400F8", + "EventCode": "0x400F8", "EventName": "PM_FLUSH", "BriefDescription": "Flush (any type)." } diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json index ea122a91ceb0..b5d1bd39cfb2 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json @@ -1,21 +1,21 @@ [ { - "EventCode": "301E8", + "EventCode": "0x301E8", "EventName": "PM_THRESH_EXC_64", "BriefDescription": "Threshold counter exceeded a value of 64." }, { - "EventCode": "45050", + "EventCode": "0x45050", "EventName": "PM_1FLOP_CMPL", "BriefDescription": "One floating point instruction completed (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg)." }, { - "EventCode": "45052", + "EventCode": "0x45052", "EventName": "PM_4FLOP_CMPL", "BriefDescription": "Four floating point instructions completed (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg)." }, { - "EventCode": "4D054", + "EventCode": "0x4D054", "EventName": "PM_8FLOP_CMPL", "BriefDescription": "Four Double Precision vector instructions completed." } diff --git a/tools/perf/pmu-events/arch/powerpc/power10/translation.json b/tools/perf/pmu-events/arch/powerpc/power10/translation.json index 5a714e3dd71a..db3766dca07c 100644 --- a/tools/perf/pmu-events/arch/powerpc/power10/translation.json +++ b/tools/perf/pmu-events/arch/powerpc/power10/translation.json @@ -1,56 +1,56 @@ [ { - "EventCode": "1F15E", + "EventCode": "0x1F15E", "EventName": "PM_MRK_START_PROBE_NOP_CMPL", "BriefDescription": "Marked Start probe nop (AND R0,R0,R0) completed." }, { - "EventCode": "20016", + "EventCode": "0x20016", "EventName": "PM_ST_FIN", "BriefDescription": "Store finish count. Includes speculative activity." }, { - "EventCode": "20018", + "EventCode": "0x20018", "EventName": "PM_ST_FWD", "BriefDescription": "Store forwards that finished." }, { - "EventCode": "2011C", + "EventCode": "0x2011C", "EventName": "PM_MRK_NTF_CYC", "BriefDescription": "Cycles during which the marked instruction is the oldest in the pipeline (NTF or NTC)." }, { - "EventCode": "2E01C", + "EventCode": "0x2E01C", "EventName": "PM_EXEC_STALL_TLBIE", "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a TLBIE instruction executing in the Load Store Unit." }, { - "EventCode": "201E6", + "EventCode": "0x201E6", "EventName": "PM_THRESH_EXC_32", "BriefDescription": "Threshold counter exceeded a value of 32." }, { - "EventCode": "200F0", + "EventCode": "0x200F0", "EventName": "PM_ST_CMPL", "BriefDescription": "Stores completed from S2Q (2nd-level store queue). This event includes regular stores, stcx and cache inhibited stores. The following operations are excluded (pteupdate, snoop tlbie complete, store atomics, miso, load atomic payloads, tlbie, tlbsync, slbieg, isync, msgsnd, slbiag, cpabort, copy, tcheck, tend, stsync, dcbst, icbi, dcbf, hwsync, lwsync, ptesync, eieio, msgsync)." }, { - "EventCode": "200FE", + "EventCode": "0x200FE", "EventName": "PM_DATA_FROM_L2MISS", "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1 or L2 due to a demand miss." }, { - "EventCode": "30010", + "EventCode": "0x30010", "EventName": "PM_PMC2_OVERFLOW", "BriefDescription": "The event selected for PMC2 caused the event counter to overflow." }, { - "EventCode": "4D010", + "EventCode": "0x4D010", "EventName": "PM_PMC1_SAVED", "BriefDescription": "The conditions for the speculative event selected for PMC1 are met and PMC1 is charged." }, { - "EventCode": "4D05C", + "EventCode": "0x4D05C", "EventName": "PM_DPP_FLOP_CMPL", "BriefDescription": "Double-Precision or Quad-Precision instructions completed." } -- cgit v1.2.3 From 69ca3d29a75554122b998e8dfa20117766f52f48 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 27 May 2021 16:31:40 -0700 Subject: mptcp: update selftest for fallback due to OoO The previous commit noted that we can have fallback scenario due to OoO (or packet drop). Update the self-tests accordingly Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 3c4cb72ed8a4..9ca5f1ba461e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -501,6 +501,7 @@ do_transfer() local stat_ackrx_now_l=$(get_mib_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") local stat_cookietx_now=$(get_mib_counter "${listener_ns}" "TcpExtSyncookiesSent") local stat_cookierx_now=$(get_mib_counter "${listener_ns}" "TcpExtSyncookiesRecv") + local stat_ooo_now=$(get_mib_counter "${listener_ns}" "TcpExtTCPOFOQueue") expect_synrx=$((stat_synrx_last_l)) expect_ackrx=$((stat_ackrx_last_l)) @@ -518,10 +519,14 @@ do_transfer() "${stat_synrx_now_l}" "${expect_synrx}" 1>&2 retc=1 fi - if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ]; then - printf "[ FAIL ] lower MPC ACK rx (%d) than expected (%d)\n" \ - "${stat_ackrx_now_l}" "${expect_ackrx}" 1>&2 - rets=1 + if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} -a ${stat_ooo_now} -eq 0 ]; then + if [ ${stat_ooo_now} -eq 0 ]; then + printf "[ FAIL ] lower MPC ACK rx (%d) than expected (%d)\n" \ + "${stat_ackrx_now_l}" "${expect_ackrx}" 1>&2 + rets=1 + else + printf "[ Note ] fallback due to TCP OoO" + fi fi if [ $retc -eq 0 ] && [ $rets -eq 0 ]; then -- cgit v1.2.3 From 000ac42953395a4f0a63d5db640c5e4c88a548c5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 28 May 2021 15:10:58 -0400 Subject: selftests: kvm: fix overlapping addresses in memslot_perf_test vm_create allocates memory and maps it close to GPA. This memory is separate from what is allocated in subsequent calls to vm_userspace_mem_region_add, so it is incorrect to pass the test memory size to vm_create_default. Just pass a small fixed amount of memory which can be used later for page table, otherwise GPAs are already allocated at MEM_GPA and the test aborts. Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/memslot_perf_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 11239652d805..9307f25d8130 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -267,7 +267,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); TEST_ASSERT(data->hva_slots, "malloc() fail"); - data->vm = vm_create_default(VCPU_ID, mempages, guest_code); + data->vm = vm_create_default(VCPU_ID, 1024, guest_code); pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", max_mem_slots - 1, data->pages_per_slot, rempages); -- cgit v1.2.3 From 6c1ced2f701618e912be6c549139d58c180419ea Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 1 Jun 2021 19:53:56 +0800 Subject: perf tools: Copy uapi/asm/perf_regs.h from the kernel for MIPS To allow the build to complete on older systems, where those files are either not uptodate, lacking some recent additions or not present at all. And check if the copy drifts from the kernel. This commit is similar with commit 12f020338a2c ("tools: Copy uapi/asm/perf_regs.h from the kernel") With this commit, we can avoid the following build error in any case: tools/perf/arch/mips/include/perf_regs.h:7:10: fatal error: asm/perf_regs.h: No such file or directory #include ^~~~~~~~~~~~~~~~~ compilation terminated. Signed-off-by: Tiezhu Yang Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Xuefeng Li Link: http://lore.kernel.org/lkml/1622548436-12472-1-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/mips/include/uapi/asm/perf_regs.h | 40 ++++++++++++++++++++++++++++ tools/perf/Makefile.config | 1 - tools/perf/check-headers.sh | 1 + 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 tools/arch/mips/include/uapi/asm/perf_regs.h (limited to 'tools') diff --git a/tools/arch/mips/include/uapi/asm/perf_regs.h b/tools/arch/mips/include/uapi/asm/perf_regs.h new file mode 100644 index 000000000000..d0f4ecd616cf --- /dev/null +++ b/tools/arch/mips/include/uapi/asm/perf_regs.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_MIPS_PERF_REGS_H +#define _ASM_MIPS_PERF_REGS_H + +enum perf_event_mips_regs { + PERF_REG_MIPS_PC, + PERF_REG_MIPS_R1, + PERF_REG_MIPS_R2, + PERF_REG_MIPS_R3, + PERF_REG_MIPS_R4, + PERF_REG_MIPS_R5, + PERF_REG_MIPS_R6, + PERF_REG_MIPS_R7, + PERF_REG_MIPS_R8, + PERF_REG_MIPS_R9, + PERF_REG_MIPS_R10, + PERF_REG_MIPS_R11, + PERF_REG_MIPS_R12, + PERF_REG_MIPS_R13, + PERF_REG_MIPS_R14, + PERF_REG_MIPS_R15, + PERF_REG_MIPS_R16, + PERF_REG_MIPS_R17, + PERF_REG_MIPS_R18, + PERF_REG_MIPS_R19, + PERF_REG_MIPS_R20, + PERF_REG_MIPS_R21, + PERF_REG_MIPS_R22, + PERF_REG_MIPS_R23, + PERF_REG_MIPS_R24, + PERF_REG_MIPS_R25, + PERF_REG_MIPS_R26, + PERF_REG_MIPS_R27, + PERF_REG_MIPS_R28, + PERF_REG_MIPS_R29, + PERF_REG_MIPS_R30, + PERF_REG_MIPS_R31, + PERF_REG_MIPS_MAX = PERF_REG_MIPS_R31 + 1, +}; +#endif /* _ASM_MIPS_PERF_REGS_H */ diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 406a9519145e..73df23dd664c 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -90,7 +90,6 @@ endif ifeq ($(ARCH),mips) NO_PERF_REGS := 0 CFLAGS += -I$(OUTPUT)arch/mips/include/generated - CFLAGS += -I../../arch/mips/include/uapi -I../../arch/mips/include/generated/uapi LIBUNWIND_LIBS = -lunwind -lunwind-mips endif diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index dd8ff287e930..c783558332b8 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -39,6 +39,7 @@ arch/x86/lib/x86-opcode-map.txt arch/x86/tools/gen-insn-attr-x86.awk arch/arm/include/uapi/asm/perf_regs.h arch/arm64/include/uapi/asm/perf_regs.h +arch/mips/include/uapi/asm/perf_regs.h arch/powerpc/include/uapi/asm/perf_regs.h arch/s390/include/uapi/asm/perf_regs.h arch/x86/include/uapi/asm/perf_regs.h -- cgit v1.2.3 From 3cb17cce1e76ccc5499915a4d7e095a1ad6bf7ff Mon Sep 17 00:00:00 2001 From: Li Huafei Date: Tue, 1 Jun 2021 17:27:50 +0800 Subject: perf probe: Fix NULL pointer dereference in convert_variable_location() If we just check whether the variable can be converted, 'tvar' should be a null pointer. However, the null pointer check is missing in the 'Constant value' execution path. The following cases can trigger this problem: $ cat test.c #include void main(void) { int a; const int b = 1; asm volatile("mov %1, %0" : "=r"(a): "i"(b)); printf("a: %d\n", a); } $ gcc test.c -o test -O -g $ sudo ./perf probe -x ./test -L "main" 0 void main(void) { 2 int a; const int b = 1; asm volatile("mov %1, %0" : "=r"(a): "i"(b)); 6 printf("a: %d\n", a); } $ sudo ./perf probe -x ./test -V "main:6" Segmentation fault The check on 'tvar' is added. If 'tavr' is a null pointer, we return 0 to indicate that the variable can be converted. Now, we can successfully show the variables that can be accessed. $ sudo ./perf probe -x ./test -V "main:6" Available variables at main:6 @ char* __fmt int a int b However, the variable 'b' cannot be tracked. $ sudo ./perf probe -x ./test -D "main:6 b" Failed to find the location of the 'b' variable at this address. Perhaps it has been optimized out. Use -V with the --range option to show 'b' location range. Error: Failed to add events. This is because __die_find_variable_cb() did not successfully match variable 'b', which has the DW_AT_const_value attribute instead of DW_AT_location. We added support for DW_AT_const_value in __die_find_variable_cb(). With this modification, we can successfully track the variable 'b'. $ sudo ./perf probe -x ./test -D "main:6 b" p:probe_test/main_L6 /home/lhf/test:0x1156 b=\1:s32 Fixes: 66f69b219716 ("perf probe: Support DW_AT_const_value constant value") Signed-off-by: Li Huafei Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Frank Ch. Eigler Cc: Jianlin Lv Cc: Jiri Olsa Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Yang Jihong Cc: Zhang Jinhao http://lore.kernel.org/lkml/20210601092750.169601-1-lihuafei1@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 8 ++++++-- tools/perf/util/probe-finder.c | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index b2f4920e19a6..7d2ba8419b0c 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -975,9 +975,13 @@ static int __die_find_variable_cb(Dwarf_Die *die_mem, void *data) if ((tag == DW_TAG_formal_parameter || tag == DW_TAG_variable) && die_compare_name(die_mem, fvp->name) && - /* Does the DIE have location information or external instance? */ + /* + * Does the DIE have location information or const value + * or external instance? + */ (dwarf_attr(die_mem, DW_AT_external, &attr) || - dwarf_attr(die_mem, DW_AT_location, &attr))) + dwarf_attr(die_mem, DW_AT_location, &attr) || + dwarf_attr(die_mem, DW_AT_const_value, &attr))) return DIE_FIND_CB_END; if (dwarf_haspc(die_mem, fvp->addr)) return DIE_FIND_CB_CONTINUE; diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 866f2d514d72..b029c29ce227 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -190,6 +190,9 @@ static int convert_variable_location(Dwarf_Die *vr_die, Dwarf_Addr addr, immediate_value_is_supported()) { Dwarf_Sword snum; + if (!tvar) + return 0; + dwarf_formsdata(&attr, &snum); ret = asprintf(&tvar->value, "\\%ld", (long)snum); -- cgit v1.2.3 From 4f2abe91922ba02bb419d91d92a518e4c805220b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 27 May 2021 11:28:35 -0700 Subject: perf record: Move probing cgroup sampling support I found that checking cgroup sampling support using the missing features doesn't work on old kernels. Because it added both attr.cgroup bit and PERF_SAMPLE_CGROUP bit, it needs to check whichever comes first (usually the actual event, not dummy). But it only checks the attr.cgroup bit which is set only in the dummy event so cannot detect failtures due the sample bits. Also we don't ignore the missing feature and retry, it'd be better checking it with the API probing logic. Committer notes: Extracted the minimal part to check using the new cgroup API probe routine, the part that removes the cgroup member can be left for further discussion. Signed-off-by: Namhyung Kim Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210527182835.1634339-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 6 ++++++ tools/perf/util/perf_api_probe.c | 10 ++++++++++ tools/perf/util/perf_api_probe.h | 1 + 3 files changed, 17 insertions(+) (limited to 'tools') diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 3337b5f93336..84803abeb942 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -2714,6 +2714,12 @@ int cmd_record(int argc, const char **argv) rec->no_buildid = true; } + if (rec->opts.record_cgroup && !perf_can_record_cgroup()) { + pr_err("Kernel has no cgroup sampling support.\n"); + err = -EINVAL; + goto out_opts; + } + if (rec->opts.kcore) rec->data.is_dir = true; diff --git a/tools/perf/util/perf_api_probe.c b/tools/perf/util/perf_api_probe.c index 829af17a0867..020411682a3c 100644 --- a/tools/perf/util/perf_api_probe.c +++ b/tools/perf/util/perf_api_probe.c @@ -103,6 +103,11 @@ static void perf_probe_build_id(struct evsel *evsel) evsel->core.attr.build_id = 1; } +static void perf_probe_cgroup(struct evsel *evsel) +{ + evsel->core.attr.cgroup = 1; +} + bool perf_can_sample_identifier(void) { return perf_probe_api(perf_probe_sample_identifier); @@ -182,3 +187,8 @@ bool perf_can_record_build_id(void) { return perf_probe_api(perf_probe_build_id); } + +bool perf_can_record_cgroup(void) +{ + return perf_probe_api(perf_probe_cgroup); +} diff --git a/tools/perf/util/perf_api_probe.h b/tools/perf/util/perf_api_probe.h index f12ca55f509a..b104168efb15 100644 --- a/tools/perf/util/perf_api_probe.h +++ b/tools/perf/util/perf_api_probe.h @@ -12,5 +12,6 @@ bool perf_can_record_switch_events(void); bool perf_can_record_text_poke_events(void); bool perf_can_sample_identifier(void); bool perf_can_record_build_id(void); +bool perf_can_record_cgroup(void); #endif // __PERF_API_PROBE_H -- cgit v1.2.3 From d3fddc355a4a4415e8d43d1faae1be713d65cf5e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 17 May 2021 16:12:54 +0800 Subject: perf stat: Fix error return code in bperf__load() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Committer notes: Added the missing {} for the now multiline 'if' block, fixing this error: CC /tmp/build/perf/util/bpf_counter.o util/bpf_counter.c: In function ‘bperf__load’: util/bpf_counter.c:523:9: error: this ‘if’ clause does not guard... [-Werror=misleading-indentation] 523 | if (evsel->bperf_leader_link_fd < 0 && | ^~ util/bpf_counter.c:526:17: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘if’ 526 | goto out; | ^~~~ cc1: all warnings being treated as errors Fixes: 7fac83aaf2eecc9e ("perf stat: Introduce 'bperf' to share hardware PMCs with BPF") Reported-by: Hulk Robot Signed-off-by: Yu Kuai Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: Song Liu Cc: Yu Kuai Cc: Zhang Yi Link: http://lore.kernel.org/lkml/20210517081254.1561564-1-yukuai3@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_counter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index 974f10e356f0..5ed674a2f55e 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -521,9 +521,10 @@ static int bperf__load(struct evsel *evsel, struct target *target) evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id); if (evsel->bperf_leader_link_fd < 0 && - bperf_reload_leader_program(evsel, attr_map_fd, &entry)) + bperf_reload_leader_program(evsel, attr_map_fd, &entry)) { + err = -1; goto out; - + } /* * The bpf_link holds reference to the leader program, and the * leader program holds reference to the maps. Therefore, if @@ -550,6 +551,7 @@ static int bperf__load(struct evsel *evsel, struct target *target) /* Step 2: load the follower skeleton */ evsel->follower_skel = bperf_follower_bpf__open(); if (!evsel->follower_skel) { + err = -1; pr_err("Failed to open follower skeleton\n"); goto out; } -- cgit v1.2.3 From f677ec94f6fb9d895f40403bd54236f7763c29db Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 28 May 2021 11:10:50 +0200 Subject: perf test: Test 17 fails with make LIBPFM4=1 on s390 z/VM This test case fails on s390 virtual machine z/VM which has no PMU support when the perf tool is built with LIBPFM4=1. Using make LIBPFM4=1 builds the perf tool with support for libpfm event notation. The command line flag --pfm-events is valid: # ./perf record --pfm-events cycles -- true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.001 MB perf.data (2 samples) ] # However the command 'perf test -Fv 17' fails on s390 z/VM virtual machine with LIBPFM4=1: # perf test -Fv 17 17: Setup struct perf_event_attr : --- start --- ..... running './tests/attr/test-record-group2' unsupp './tests/attr/test-record-group2' running './tests/attr/test-record-pfm-period' expected exclude_hv=0, got 1 FAILED './tests/attr/test-record-pfm-period' - match failure ---- end ---- Setup struct perf_event_attr: FAILED! When --pfm-event system is not supported, the test returns unsupported and continues. Here is an example using a virtual machine on x86 and Fedora 34: [root@f33 perf]# perf test -Fv 17 17: Setup struct perf_event_attr : --- start --- ..... running './tests/attr/test-record-group2' unsupp './tests/attr/test-record-group2' running './tests/attr/test-record-pfm-period' unsupp './tests/attr/test-record-pfm-period' .... The issue is file ./tests/attr/test-record-pfm-period which requires perf event attribute member exclude_hv to be zero. This is not the case on s390 where the value of exclude_hv is one when executing on a z/VM virtual machine without PMU hardware support. Fix this by allowing value exlucde_hv to be zero or one. Output before: # /usr/bin/python ./tests/attr.py -d ./tests/attr/ -t \ test-record-pfm-period -p ./perf -vvv 2>&1| fgrep match matching [event:base-record] match: [event:base-record] matches [] FAILED './tests/attr//test-record-pfm-period' - match failure # Output after: # /usr/bin/python ./tests/attr.py -d ./tests/attr/ -t \ test-record-pfm-period -p ./perf -vvv 2>&1| fgrep match matching [event:base-record] match: [event:base-record] matches ['event-1-0-6', 'event-1-0-5'] matched Background: Using libpfm library ends up in this function call sequence pfm_get_perf_event_encoding() +-- pfm_get_os_event_encoding() +-- pfmlib_perf_event_encode() is called when no hardware specific PMU unit can be detected as in the s390 z/VM virtual machine case. This uses the "perf_events generic PMU" data structure which sets exclude_hv to 1 per default. Using this PMU that test case always fails. That is the reason why exclude_hv attribute setting varies. Version 2: As suggested by Ian Rogers make perf_event_attribute member exclude_hv more robust and accept value 0 or 1 to handle more test cases which might fail on s390 virtual machine z/VM. Suggested-by: Ian Rogers Signed-off-by: Thomas Richter Reviewed-by: Ian Rogers Cc: Heiko Carstens Cc: Ian Rogers Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/20210528091050.245838-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr/base-record | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record index 4a7b8deef3fd..8c10955eff93 100644 --- a/tools/perf/tests/attr/base-record +++ b/tools/perf/tests/attr/base-record @@ -16,7 +16,7 @@ pinned=0 exclusive=0 exclude_user=0 exclude_kernel=0|1 -exclude_hv=0 +exclude_hv=0|1 exclude_idle=0 mmap=1 comm=1 -- cgit v1.2.3 From 2dc065eae56df804e4da5f8a9e4139033f7ea605 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 2 Jun 2021 14:22:40 -0700 Subject: perf evsel: Add missing cloning of evsel->use_config_name The evsel__clone() should copy all fields in the evsel which are set during the event parsing. But it missed the use_config_name field. Fixes: 12279429d862 ("perf stat: Uniquify hybrid event name") Signed-off-by: Namhyung Kim Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210602212241.2175005-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 1 + tools/perf/util/evsel.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 4a3cd1b5bb33..a8d8463f8ee5 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -428,6 +428,7 @@ struct evsel *evsel__clone(struct evsel *orig) evsel->auto_merge_stats = orig->auto_merge_stats; evsel->collect_stat = orig->collect_stat; evsel->weak_group = orig->weak_group; + evsel->use_config_name = orig->use_config_name; if (evsel__copy_config_terms(evsel, orig) < 0) goto out_err; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 75cf5dbfe208..bdad52a06438 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -83,8 +83,10 @@ struct evsel { bool collect_stat; bool weak_group; bool bpf_counter; + bool use_config_name; int bpf_fd; struct bpf_object *bpf_obj; + struct list_head config_terms; }; /* @@ -116,10 +118,8 @@ struct evsel { bool merged_stat; bool reset_group; bool errored; - bool use_config_name; struct hashmap *per_pkg_mask; struct evsel *leader; - struct list_head config_terms; int err; int cpu_iter; struct { -- cgit v1.2.3 From 3cc84399e9b60463bc39cf352ffd8bccb92e02bd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 2 Jun 2021 14:22:41 -0700 Subject: perf stat: Honor event config name on --no-merge If user gave an event name explicitly, it should be displayed in the output as is. But with --no-merge option it adds a pmu name at the end so might confuse users. Actually this is true for hybrid pmus, I think we should do the same for others. Signed-off-by: Namhyung Kim Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210602212241.2175005-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-display.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index a76fff5e7d83..ca326f98c7a2 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -541,7 +541,7 @@ static void uniquify_event_name(struct evsel *counter) char *config; int ret = 0; - if (counter->uniquified_name || + if (counter->uniquified_name || counter->use_config_name || !counter->pmu_name || !strncmp(counter->name, counter->pmu_name, strlen(counter->pmu_name))) return; @@ -555,10 +555,8 @@ static void uniquify_event_name(struct evsel *counter) } } else { if (perf_pmu__has_hybrid()) { - if (!counter->use_config_name) { - ret = asprintf(&new_name, "%s/%s/", - counter->pmu_name, counter->name); - } + ret = asprintf(&new_name, "%s/%s/", + counter->pmu_name, counter->name); } else { ret = asprintf(&new_name, "%s [%s]", counter->name, counter->pmu_name); -- cgit v1.2.3 From 69c9ffed6cede9c11697861f654946e3ae95a930 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Thu, 3 Jun 2021 00:08:33 +0200 Subject: perf symbol-elf: Fix memory leak by freeing sdt_note.args Reported by ASan. Signed-off-by: Riccardo Mancini Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Fabian Hemmer Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Remi Bernon Cc: Jiri Slaby Link: http://lore.kernel.org/lkml/20210602220833.285226-1-rickyman7@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol-elf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 4c56aa837434..a73345730ba9 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -2412,6 +2412,7 @@ int cleanup_sdt_note_list(struct list_head *sdt_notes) list_for_each_entry_safe(pos, tmp, sdt_notes, note_list) { list_del_init(&pos->note_list); + zfree(&pos->args); zfree(&pos->name); zfree(&pos->provider); free(pos); -- cgit v1.2.3 From 67069a1f0fe5f9eeca86d954fff2087f5542a008 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Thu, 3 Jun 2021 00:40:23 +0200 Subject: perf env: Fix memory leak of bpf_prog_info_linear member ASan reported a memory leak caused by info_linear not being deallocated. The info_linear was allocated during in perf_event__synthesize_one_bpf_prog(). This patch adds the corresponding free() when bpf_prog_info_node is freed in perf_env__purge_bpf(). $ sudo ./perf record -- sleep 5 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.025 MB perf.data (8 samples) ] ================================================================= ==297735==ERROR: LeakSanitizer: detected memory leaks Direct leak of 7688 byte(s) in 19 object(s) allocated from: #0 0x4f420f in malloc (/home/user/linux/tools/perf/perf+0x4f420f) #1 0xc06a74 in bpf_program__get_prog_info_linear /home/user/linux/tools/lib/bpf/libbpf.c:11113:16 #2 0xb426fe in perf_event__synthesize_one_bpf_prog /home/user/linux/tools/perf/util/bpf-event.c:191:16 #3 0xb42008 in perf_event__synthesize_bpf_events /home/user/linux/tools/perf/util/bpf-event.c:410:9 #4 0x594596 in record__synthesize /home/user/linux/tools/perf/builtin-record.c:1490:8 #5 0x58c9ac in __cmd_record /home/user/linux/tools/perf/builtin-record.c:1798:8 #6 0x58990b in cmd_record /home/user/linux/tools/perf/builtin-record.c:2901:8 #7 0x7b2a20 in run_builtin /home/user/linux/tools/perf/perf.c:313:11 #8 0x7b12ff in handle_internal_command /home/user/linux/tools/perf/perf.c:365:8 #9 0x7b2583 in run_argv /home/user/linux/tools/perf/perf.c:409:2 #10 0x7b0d79 in main /home/user/linux/tools/perf/perf.c:539:3 #11 0x7fa357ef6b74 in __libc_start_main /usr/src/debug/glibc-2.33-8.fc34.x86_64/csu/../csu/libc-start.c:332:16 Signed-off-by: Riccardo Mancini Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Yonghong Song Link: http://lore.kernel.org/lkml/20210602224024.300485-1-rickyman7@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/env.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 9130f6fad8d5..bc5e4f294e9e 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -144,6 +144,7 @@ static void perf_env__purge_bpf(struct perf_env *env) node = rb_entry(next, struct bpf_prog_info_node, rb_node); next = rb_next(&node->rb_node); rb_erase(&node->rb_node, root); + free(node->info_linear); free(node); } -- cgit v1.2.3 From acf2492b51c9a3c4dfb947f4d3477a86d315150f Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 4 Jun 2021 17:17:30 +0200 Subject: wireguard: selftests: remove old conntrack kconfig value On recent kernels, this config symbol is no longer used. Reported-by: Rui Salvaterra Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Cc: stable@vger.kernel.org Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- tools/testing/selftests/wireguard/qemu/kernel.config | 1 - 1 file changed, 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 4eecb432a66c..74db83a0aedd 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -19,7 +19,6 @@ CONFIG_NETFILTER_XTABLES=y CONFIG_NETFILTER_XT_NAT=y CONFIG_NETFILTER_XT_MATCH_LENGTH=y CONFIG_NETFILTER_XT_MARK=y -CONFIG_NF_CONNTRACK_IPV4=y CONFIG_NF_NAT_IPV4=y CONFIG_IP_NF_IPTABLES=y CONFIG_IP_NF_FILTER=y -- cgit v1.2.3 From f8873d11d4121aad35024f9379e431e0c83abead Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 4 Jun 2021 17:17:31 +0200 Subject: wireguard: selftests: make sure rp_filter is disabled on vethc Some distros may enable strict rp_filter by default, which will prevent vethc from receiving the packets with an unrouteable reverse path address. Reported-by: Hangbin Liu Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Cc: stable@vger.kernel.org Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- tools/testing/selftests/wireguard/netns.sh | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 7ed7cd95e58f..ebc4ee0fe179 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -363,6 +363,7 @@ ip1 -6 rule add table main suppress_prefixlength 0 ip1 -4 route add default dev wg0 table 51820 ip1 -4 rule add not fwmark 51820 table 51820 ip1 -4 rule add table main suppress_prefixlength 0 +n1 bash -c 'printf 0 > /proc/sys/net/ipv4/conf/vethc/rp_filter' # Flood the pings instead of sending just one, to trigger routing table reference counting bugs. n1 ping -W 1 -c 100 -f 192.168.99.7 n1 ping -W 1 -c 100 -f abab::1111 -- cgit v1.2.3 From 263e88d678baa1a2e3f2d5afbdcd9fd3feb80a4d Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 4 Jun 2021 20:01:30 -0700 Subject: proc: add .gitignore for proc-subset-pid selftest This new selftest needs an entry in the .gitignore file otherwise git will try to track the binary. Link: https://lkml.kernel.org/r/20210601164305.11776-1-dmatlack@google.com Fixes: 268af17ada5855 ("selftests: proc: test subset=pid") Signed-off-by: David Matlack Acked-by: Christian Brauner Cc: Shuah Khan Cc: Alexey Dobriyan Cc: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index bed4b5318a86..8f3e72e626fa 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -10,6 +10,7 @@ /proc-self-map-files-002 /proc-self-syscall /proc-self-wchan +/proc-subset-pid /proc-uptime-001 /proc-uptime-002 /read -- cgit v1.2.3 From 11fc79fc9f2e395aa39fa5baccae62767c5d8280 Mon Sep 17 00:00:00 2001 From: Kev Jackson Date: Mon, 7 Jun 2021 14:08:35 +0100 Subject: libbpf: Fixes incorrect rx_ring_setup_done When calling xsk_socket__create_shared(), the logic at line 1097 marks a boolean flag true within the xsk_umem structure to track setup progress in order to support multiple calls to the function. However, instead of marking umem->tx_ring_setup_done, the code incorrectly sets umem->rx_ring_setup_done. This leads to improper behaviour when creating and destroying xsk and umem structures. Multiple calls to this function is documented as supported. Fixes: ca7a83e2487a ("libbpf: Only create rx and tx XDP rings when necessary") Signed-off-by: Kev Jackson Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/YL4aU4f3Aaik7CN0@linux-dev --- tools/lib/bpf/xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 6061431ee04c..e9b619aa0cdf 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -1094,7 +1094,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, goto out_put_ctx; } if (xsk->fd == umem->fd) - umem->rx_ring_setup_done = true; + umem->tx_ring_setup_done = true; } err = xsk_get_mmap_offsets(xsk->fd, &off); -- cgit v1.2.3 From 1bc603af73dd8fb2934306e861009c54f973dcc2 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 8 Jun 2021 14:39:54 +0200 Subject: KVM: selftests: introduce P47V64 for s390x s390x can have up to 47bits of physical guest and 64bits of virtual address bits. Add a new address mode to avoid errors of testcases going beyond 47bits. Signed-off-by: Christian Borntraeger Message-Id: <20210608123954.10991-1-borntraeger@de.ibm.com> Fixes: ef4c9f4f6546 ("KVM: selftests: Fix 32-bit truncation of vm_get_max_gfn()") Cc: stable@vger.kernel.org Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util.h | 3 ++- tools/testing/selftests/kvm/lib/kvm_util.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index fcd8e3855111..b602552b1ed0 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -43,6 +43,7 @@ enum vm_guest_mode { VM_MODE_P40V48_4K, VM_MODE_P40V48_64K, VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_P47V64_4K, NUM_VM_MODES, }; @@ -60,7 +61,7 @@ enum vm_guest_mode { #elif defined(__s390x__) -#define VM_MODE_DEFAULT VM_MODE_P52V48_4K +#define VM_MODE_DEFAULT VM_MODE_P47V64_4K #define MIN_PAGE_SHIFT 12U #define ptes_per_page(page_size) ((page_size) / 16) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 28e528c19d28..b126fab6c4e1 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -175,6 +175,7 @@ const char *vm_guest_mode_string(uint32_t i) [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", + [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", }; _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, "Missing new mode strings?"); @@ -192,6 +193,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { { 40, 48, 0x1000, 12 }, { 40, 48, 0x10000, 16 }, { 0, 0, 0x1000, 12 }, + { 47, 64, 0x1000, 12 }, }; _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, "Missing new mode params?"); @@ -277,6 +279,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); #endif break; + case VM_MODE_P47V64_4K: + vm->pgtable_levels = 5; + break; default: TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); } -- cgit v1.2.3 From f53b16ad64408b5376836708f8cf42dbf1cf6098 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 9 Jun 2021 07:38:16 +0800 Subject: selftests: kvm: Add support for customized slot0 memory size Until commit 39fe2fc96694 ("selftests: kvm: make allocation of extra memory take effect", 2021-05-27), parameter extra_mem_pages was used only to calculate the page table size for all the memory chunks, because real memory allocation happened with calls of vm_userspace_mem_region_add() after vm_create_default(). Commit 39fe2fc96694 however changed the meaning of extra_mem_pages to the size of memory slot 0. This makes the memory allocation more flexible, but makes it harder to account for the number of pages needed for the page tables. For example, memslot_perf_test has a small amount of memory in slot 0 but a lot in other slots, and adding that memory twice (both in slot 0 and with later calls to vm_userspace_mem_region_add()) causes an error that was fixed in commit 000ac4295339 ("selftests: kvm: fix overlapping addresses in memslot_perf_test", 2021-05-29) Since both uses are sensible, add a new parameter slot0_mem_pages to vm_create_with_vcpus() and some comments to clarify the meaning of slot0_mem_pages and extra_mem_pages. With this change, memslot_perf_test can go back to passing the number of memory pages as extra_mem_pages. Signed-off-by: Zhenzhong Duan Message-Id: <20210608233816.423958-4-zhenzhong.duan@intel.com> [Squashed in a single patch and rewrote the commit message. - Paolo] Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/kvm_util.h | 7 ++-- tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 47 ++++++++++++++++++----- tools/testing/selftests/kvm/lib/perf_test_util.c | 2 +- tools/testing/selftests/kvm/memslot_perf_test.c | 2 +- 5 files changed, 45 insertions(+), 15 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index b602552b1ed0..35739567189e 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -286,10 +286,11 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me uint32_t num_percpu_pages, void *guest_code, uint32_t vcpuids[]); -/* Like vm_create_default_with_vcpus, but accepts mode as a parameter */ +/* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t extra_mem_pages, uint32_t num_percpu_pages, - void *guest_code, uint32_t vcpuids[]); + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, + uint32_t num_percpu_pages, void *guest_code, + uint32_t vcpuids[]); /* * Adds a vCPU with reasonable defaults (e.g. a stack) diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 1c4753fff19e..82171f17c1d7 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -268,7 +268,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Create a VM with enough guest pages */ guest_num_pages = test_mem_size / guest_page_size; - vm = vm_create_with_vcpus(mode, nr_vcpus, + vm = vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, guest_num_pages, 0, guest_code, NULL); /* Align down GPA of the testing memslot */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index b126fab6c4e1..5c70596dd1b9 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -313,21 +313,50 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) return vm; } +/* + * VM Create with customized parameters + * + * Input Args: + * mode - VM Mode (e.g. VM_MODE_P52V48_4K) + * nr_vcpus - VCPU count + * slot0_mem_pages - Slot0 physical memory size + * extra_mem_pages - Non-slot0 physical memory total size + * num_percpu_pages - Per-cpu physical memory pages + * guest_code - Guest entry point + * vcpuids - VCPU IDs + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + * + * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K), + * with customized slot0 memory size, at least 512 pages currently. + * extra_mem_pages is only used to calculate the maximum page table size, + * no real memory allocation for non-slot0 memory in this function. + */ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, - uint64_t extra_mem_pages, uint32_t num_percpu_pages, - void *guest_code, uint32_t vcpuids[]) + uint64_t slot0_mem_pages, uint64_t extra_mem_pages, + uint32_t num_percpu_pages, void *guest_code, + uint32_t vcpuids[]) { + uint64_t vcpu_pages, extra_pg_pages, pages; + struct kvm_vm *vm; + int i; + + /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ + if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) + slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES; + /* The maximum page table size for a memory region will be when the * smallest pages are used. Considering each page contains x page * table descriptors, the total extra size for page tables (for extra * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller * than N/x*2. */ - uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; - uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; - uint64_t pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages; - struct kvm_vm *vm; - int i; + vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; + extra_pg_pages = (slot0_mem_pages + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; + pages = slot0_mem_pages + vcpu_pages + extra_pg_pages; TEST_ASSERT(nr_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), "nr_vcpus = %d too large for host, max-vcpus = %d", @@ -359,8 +388,8 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me uint32_t num_percpu_pages, void *guest_code, uint32_t vcpuids[]) { - return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, extra_mem_pages, - num_percpu_pages, guest_code, vcpuids); + return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES, + extra_mem_pages, num_percpu_pages, guest_code, vcpuids); } struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index abf381800a59..7397ca299835 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -69,7 +69,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, "Guest memory size is not guest page size aligned."); - vm = vm_create_with_vcpus(mode, vcpus, + vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, 0, guest_code, NULL); diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 9307f25d8130..11239652d805 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -267,7 +267,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); TEST_ASSERT(data->hva_slots, "malloc() fail"); - data->vm = vm_create_default(VCPU_ID, 1024, guest_code); + data->vm = vm_create_default(VCPU_ID, mempages, guest_code); pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", max_mem_slots - 1, data->pages_per_slot, rempages); -- cgit v1.2.3 From e8ba0b2b64126381643bb50df3556b139a60545a Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 8 May 2021 11:42:16 +0800 Subject: tools/bootconfig: Fix error return code in apply_xbc() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Link: https://lkml.kernel.org/r/20210508034216.2277-1-thunder.leizhen@huawei.com Fixes: a995e6bc0524 ("tools/bootconfig: Fix to check the write failure correctly") Reported-by: Hulk Robot Acked-by: Masami Hiramatsu Signed-off-by: Zhen Lei Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 7362bef1a368..6cd6080cac04 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -399,6 +399,7 @@ static int apply_xbc(const char *path, const char *xbc_path) } /* TODO: Ensure the @path is initramfs/initrd image */ if (fstat(fd, &stat) < 0) { + ret = -errno; pr_err("Failed to get the size of %s\n", path); goto out; } -- cgit v1.2.3 From 824afd55e95c3cb12c55d297a0ae408be1779cc8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 May 2021 12:06:33 +0900 Subject: tools/bootconfig: Fix a build error accroding to undefined fallthrough Since the "fallthrough" is defined only in the kernel, building lib/bootconfig.c as a part of user-space tools causes a build error. Add a dummy fallthrough to avoid the build error. Link: https://lkml.kernel.org/r/162087519356.442660.11385099982318160180.stgit@devnote2 Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 4c1ca831adb1 ("Revert "lib: Revert use of fallthrough pseudo-keyword in lib/"") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/include/linux/bootconfig.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/bootconfig/include/linux/bootconfig.h b/tools/bootconfig/include/linux/bootconfig.h index 078cbd2ba651..de7f30f99af3 100644 --- a/tools/bootconfig/include/linux/bootconfig.h +++ b/tools/bootconfig/include/linux/bootconfig.h @@ -4,4 +4,8 @@ #include "../../../../include/linux/bootconfig.h" +#ifndef fallthrough +# define fallthrough +#endif + #endif -- cgit v1.2.3 From 82944421243e5988898f54266687586ba07d889e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Jun 2021 13:48:17 +0200 Subject: selftests: netfilter: add fib test case There is a bug report on netfilter.org bugzilla pointing to fib expression dropping ipv6 DAD packets. Add a test case that demonstrates this problem. Next patch excludes icmpv6 packets coming from any to linklocal. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/Makefile | 2 +- tools/testing/selftests/netfilter/nft_fib.sh | 221 +++++++++++++++++++++++++++ 2 files changed, 222 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/netfilter/nft_fib.sh (limited to 'tools') diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 3171069a6b46..cd6430b39982 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for netfilter selftests -TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \ +TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \ conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ nft_concat_range.sh nft_conntrack_helper.sh \ nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ diff --git a/tools/testing/selftests/netfilter/nft_fib.sh b/tools/testing/selftests/netfilter/nft_fib.sh new file mode 100755 index 000000000000..6caf6ac8c285 --- /dev/null +++ b/tools/testing/selftests/netfilter/nft_fib.sh @@ -0,0 +1,221 @@ +#!/bin/bash +# +# This tests the fib expression. +# +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" +nsrouter="nsrouter-$sfx" +timeout=4 + +log_netns=$(sysctl -n net.netfilter.nf_log_all_netns) + +cleanup() +{ + ip netns del ${ns1} + ip netns del ${ns2} + ip netns del ${nsrouter} + + [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns +} + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add ${nsrouter} +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace" + exit $ksft_skip +fi + +trap cleanup EXIT + +dmesg | grep -q ' nft_rpfilter: ' +if [ $? -eq 0 ]; then + dmesg -c | grep ' nft_rpfilter: ' + echo "WARN: a previous test run has failed" 1>&2 +fi + +sysctl -q net.netfilter.nf_log_all_netns=1 +ip netns add ${ns1} +ip netns add ${ns2} + +load_ruleset() { + local netns=$1 + +ip netns exec ${netns} nft -f /dev/stdin <&2 + ip netns exec ${ns} nft list table inet filter + return 1 + fi + + if [ $want -gt 0 ]; then + echo "PASS: fib expression did drop packets for $address" + fi + + return 0 +} + +load_ruleset ${nsrouter} +load_ruleset ${ns1} +load_ruleset ${ns2} + +ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi +ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2} + +ip -net ${nsrouter} link set lo up +ip -net ${nsrouter} link set veth0 up +ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0 +ip -net ${nsrouter} addr add dead:1::1/64 dev veth0 + +ip -net ${nsrouter} link set veth1 up +ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1 +ip -net ${nsrouter} addr add dead:2::1/64 dev veth1 + +ip -net ${ns1} link set lo up +ip -net ${ns1} link set eth0 up + +ip -net ${ns2} link set lo up +ip -net ${ns2} link set eth0 up + +ip -net ${ns1} addr add 10.0.1.99/24 dev eth0 +ip -net ${ns1} addr add dead:1::99/64 dev eth0 +ip -net ${ns1} route add default via 10.0.1.1 +ip -net ${ns1} route add default via dead:1::1 + +ip -net ${ns2} addr add 10.0.2.99/24 dev eth0 +ip -net ${ns2} addr add dead:2::99/64 dev eth0 +ip -net ${ns2} route add default via 10.0.2.1 +ip -net ${ns2} route add default via dead:2::1 + +test_ping() { + local daddr4=$1 + local daddr6=$2 + + ip netns exec ${ns1} ping -c 1 -q $daddr4 > /dev/null + ret=$? + if [ $ret -ne 0 ];then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr4, ret $ret" 1>&2 + return 1 + fi + + ip netns exec ${ns1} ping -c 3 -q $daddr6 > /dev/null + ret=$? + if [ $ret -ne 0 ];then + check_drops + echo "FAIL: ${ns1} cannot reach $daddr6, ret $ret" 1>&2 + return 1 + fi + + return 0 +} + +ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null +ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null + +sleep 3 + +test_ping 10.0.2.1 dead:2::1 || exit 1 +check_drops || exit 1 + +test_ping 10.0.2.99 dead:2::99 || exit 1 +check_drops || exit 1 + +echo "PASS: fib expression did not cause unwanted packet drops" + +ip netns exec ${nsrouter} nft flush table inet filter + +ip -net ${ns1} route del default +ip -net ${ns1} -6 route del default + +ip -net ${ns1} addr del 10.0.1.99/24 dev eth0 +ip -net ${ns1} addr del dead:1::99/64 dev eth0 + +ip -net ${ns1} addr add 10.0.2.99/24 dev eth0 +ip -net ${ns1} addr add dead:2::99/64 dev eth0 + +ip -net ${ns1} route add default via 10.0.2.1 +ip -net ${ns1} -6 route add default via dead:2::1 + +ip -net ${nsrouter} addr add dead:2::1/64 dev veth0 + +# switch to ruleset that doesn't log, this time +# its expected that this does drop the packets. +load_ruleset_count ${nsrouter} + +# ns1 has a default route, but nsrouter does not. +# must not check return value, ping to 1.1.1.1 will +# fail. +check_fib_counter 0 ${nsrouter} 1.1.1.1 || exit 1 +check_fib_counter 0 ${nsrouter} 1c3::c01d || exit 1 + +ip netns exec ${ns1} ping -c 1 -W 1 -q 1.1.1.1 > /dev/null +check_fib_counter 1 ${nsrouter} 1.1.1.1 || exit 1 + +sleep 2 +ip netns exec ${ns1} ping -c 3 -q 1c3::c01d > /dev/null +check_fib_counter 3 ${nsrouter} 1c3::c01d || exit 1 + +exit 0 -- cgit v1.2.3 From 584fd3b31889852d0d6f3dd1e3d8e9619b660d2c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Jun 2021 11:45:58 +0200 Subject: objtool: Fix .symtab_shndx handling for elf_create_undef_symbol() When an ELF object uses extended symbol section indexes (IOW it has a .symtab_shndx section), these must be kept in sync with the regular symbol table (.symtab). So for every new symbol we emit, make sure to also emit a .symtab_shndx value to keep the arrays of equal size. Note: since we're writing an UNDEF symbol, most GElf_Sym fields will be 0 and we can repurpose one (st_size) to host the 0 for the xshndx value. Fixes: 2f2f7e47f052 ("objtool: Add elf_create_undef_symbol()") Reported-by: Nick Desaulniers Suggested-by: Fangrui Song Signed-off-by: Peter Zijlstra (Intel) Tested-by: Nick Desaulniers Link: https://lkml.kernel.org/r/YL3q1qFO9QIRL/BA@hirez.programming.kicks-ass.net --- tools/objtool/elf.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 743c2e9d0f56..41bca1d13d8e 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -717,7 +717,7 @@ static int elf_add_string(struct elf *elf, struct section *strtab, char *str) struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name) { - struct section *symtab; + struct section *symtab, *symtab_shndx; struct symbol *sym; Elf_Data *data; Elf_Scn *s; @@ -769,6 +769,29 @@ struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name) symtab->len += data->d_size; symtab->changed = true; + symtab_shndx = find_section_by_name(elf, ".symtab_shndx"); + if (symtab_shndx) { + s = elf_getscn(elf->elf, symtab_shndx->idx); + if (!s) { + WARN_ELF("elf_getscn"); + return NULL; + } + + data = elf_newdata(s); + if (!data) { + WARN_ELF("elf_newdata"); + return NULL; + } + + data->d_buf = &sym->sym.st_size; /* conveniently 0 */ + data->d_size = sizeof(Elf32_Word); + data->d_align = 4; + data->d_type = ELF_T_WORD; + + symtab_shndx->len += 4; + symtab_shndx->changed = true; + } + sym->sec = find_section_by_index(elf, 0); elf_add_symbol(elf, sym); -- cgit v1.2.3 From 95bf69b400f41fbba7a2dc49b0152dd7bdc9a508 Mon Sep 17 00:00:00 2001 From: Yanan Wang Date: Thu, 10 Jun 2021 16:54:18 +0800 Subject: KVM: selftests: Fix compiling errors when initializing the static structure Errors like below were produced from test_util.c when compiling the KVM selftests on my local platform. lib/test_util.c: In function 'vm_mem_backing_src_alias': lib/test_util.c:177:12: error: initializer element is not constant .flag = anon_flags, ^~~~~~~~~~ lib/test_util.c:177:12: note: (near initialization for 'aliases[0].flag') The reason is that we are using non-const expressions to initialize the static structure, which will probably trigger a compiling error/warning on stricter GCC versions. Fix it by converting the two const variables "anon_flags" and "anon_huge_flags" into more stable macros. Fixes: b3784bc28ccc0 ("KVM: selftests: refactor vm_mem_backing_src_type flags") Reported-by: Zenghui Yu Signed-off-by: Yanan Wang Message-Id: <20210610085418.35544-1-wangyanan55@huawei.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/test_util.c | 38 ++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 6ad6c8276b2e..af1031fed97f 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -166,75 +166,75 @@ size_t get_def_hugetlb_pagesz(void) return 0; } +#define ANON_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS) +#define ANON_HUGE_FLAGS (ANON_FLAGS | MAP_HUGETLB) + const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i) { - static const int anon_flags = MAP_PRIVATE | MAP_ANONYMOUS; - static const int anon_huge_flags = anon_flags | MAP_HUGETLB; - static const struct vm_mem_backing_src_alias aliases[] = { [VM_MEM_SRC_ANONYMOUS] = { .name = "anonymous", - .flag = anon_flags, + .flag = ANON_FLAGS, }, [VM_MEM_SRC_ANONYMOUS_THP] = { .name = "anonymous_thp", - .flag = anon_flags, + .flag = ANON_FLAGS, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB] = { .name = "anonymous_hugetlb", - .flag = anon_huge_flags, + .flag = ANON_HUGE_FLAGS, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = { .name = "anonymous_hugetlb_16kb", - .flag = anon_huge_flags | MAP_HUGE_16KB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_16KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = { .name = "anonymous_hugetlb_64kb", - .flag = anon_huge_flags | MAP_HUGE_64KB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_64KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = { .name = "anonymous_hugetlb_512kb", - .flag = anon_huge_flags | MAP_HUGE_512KB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_512KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = { .name = "anonymous_hugetlb_1mb", - .flag = anon_huge_flags | MAP_HUGE_1MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_1MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = { .name = "anonymous_hugetlb_2mb", - .flag = anon_huge_flags | MAP_HUGE_2MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_2MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = { .name = "anonymous_hugetlb_8mb", - .flag = anon_huge_flags | MAP_HUGE_8MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_8MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = { .name = "anonymous_hugetlb_16mb", - .flag = anon_huge_flags | MAP_HUGE_16MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_16MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = { .name = "anonymous_hugetlb_32mb", - .flag = anon_huge_flags | MAP_HUGE_32MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_32MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = { .name = "anonymous_hugetlb_256mb", - .flag = anon_huge_flags | MAP_HUGE_256MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_256MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = { .name = "anonymous_hugetlb_512mb", - .flag = anon_huge_flags | MAP_HUGE_512MB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_512MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = { .name = "anonymous_hugetlb_1gb", - .flag = anon_huge_flags | MAP_HUGE_1GB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_1GB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = { .name = "anonymous_hugetlb_2gb", - .flag = anon_huge_flags | MAP_HUGE_2GB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_2GB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = { .name = "anonymous_hugetlb_16gb", - .flag = anon_huge_flags | MAP_HUGE_16GB, + .flag = ANON_HUGE_FLAGS | MAP_HUGE_16GB, }, [VM_MEM_SRC_SHMEM] = { .name = "shmem", -- cgit v1.2.3 From 2395da0e17935ce9158cdfae433962bdb6cbfa67 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 10 Jun 2021 15:59:43 -0700 Subject: selftests: mptcp: enable syncookie only in absence of reorders Syncookie validation may fail for OoO packets, causing spurious resets and self-tests failures, so let's force syncookie only for tests iteration with no OoO. Fixes: fed61c4b584c ("selftests: mptcp: make 2nd net namespace use tcp syn cookies unconditionally") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/198 Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 9ca5f1ba461e..2b495dc8d78e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -197,9 +197,6 @@ ip -net "$ns4" link set ns4eth3 up ip -net "$ns4" route add default via 10.0.3.2 ip -net "$ns4" route add default via dead:beef:3::2 -# use TCP syn cookies, even if no flooding was detected. -ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2 - set_ethtool_flags() { local ns="$1" local dev="$2" @@ -737,6 +734,14 @@ for sender in $ns1 $ns2 $ns3 $ns4;do exit $ret fi + # ns1<->ns2 is not subject to reordering/tc delays. Use it to test + # mptcp syncookie support. + if [ $sender = $ns1 ]; then + ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2 + else + ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=1 + fi + run_tests "$ns2" $sender 10.0.1.2 run_tests "$ns2" $sender dead:beef:1::2 run_tests "$ns2" $sender 10.0.2.1 -- cgit v1.2.3 From 2d49b721dc18c113d5221f4cf5a6104eb66cb7f2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Jun 2021 09:04:29 +0200 Subject: objtool: Only rewrite unconditional retpoline thunk calls It turns out that the compilers generate conditional branches to the retpoline thunks like: 5d5: 0f 85 00 00 00 00 jne 5db 5d7: R_X86_64_PLT32 __x86_indirect_thunk_r11-0x4 while the rewrite can only handle JMP/CALL to the thunks. The result is the alternative wrecking the code. Make sure to skip writing the alternatives for conditional branches. Fixes: 9bc0bb50727c ("objtool/x86: Rewrite retpoline thunk calls") Reported-by: Lukasz Majczak Reported-by: Nathan Chancellor Signed-off-by: Peter Zijlstra (Intel) Tested-by: Nathan Chancellor --- tools/objtool/arch/x86/decode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'tools') diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 24295d39713b..523aa4157f80 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -747,6 +747,10 @@ int arch_rewrite_retpolines(struct objtool_file *file) list_for_each_entry(insn, &file->retpoline_call_list, call_node) { + if (insn->type != INSN_JUMP_DYNAMIC && + insn->type != INSN_CALL_DYNAMIC) + continue; + if (!strcmp(insn->sec->name, ".text.__x86.indirect_thunk")) continue; -- cgit v1.2.3 From 197eecb6ecae0b04bd694432f640ff75597fed9c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Sat, 5 Jun 2021 13:29:57 +0800 Subject: perf session: Correct buffer copying when peeking events When peeking an event, it has a short path and a long path. The short path uses the session pointer "one_mmap_addr" to directly fetch the event; and the long path needs to read out the event header and the following event data from file and fill into the buffer pointer passed through the argument "buf". The issue is in the long path that it copies the event header and event data into the same destination address which pointer "buf", this means the event header is overwritten. We are just lucky to run into the short path in most cases, so we don't hit the issue in the long path. This patch adds the offset "hdr_sz" to the pointer "buf" when copying the event data, so that it can reserve the event header which can be used properly by its caller. Fixes: 5a52f33adf02 ("perf session: Add perf_session__peek_event()") Signed-off-by: Leo Yan Acked-by: Adrian Hunter Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210605052957.1070720-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 106b3d60881a..e59242c361ce 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1723,6 +1723,7 @@ int perf_session__peek_event(struct perf_session *session, off_t file_offset, if (event->header.size < hdr_sz || event->header.size > buf_sz) return -1; + buf += hdr_sz; rest = event->header.size - hdr_sz; if (readn(fd, buf, rest) != (ssize_t)rest) -- cgit v1.2.3 From 36524112aba3246d1240c1791c72b26fa54008a3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 8 Jun 2021 13:46:18 -0300 Subject: tools headers cpufeatures: Sync with the kernel sources To pick the changes in: fb35d30fe5b06cc2 ("x86/cpufeatures: Assign dedicated feature word for CPUID_0x8000001F[EAX]") e7b6385b01d8e9fb ("x86/cpufeatures: Add Intel SGX hardware bits") 1478b99a76534b6c ("x86/cpufeatures: Mark ENQCMD as disabled when configured out") That don't cause any change in the tools, just silences this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/disabled-features.h' differs from latest version at 'arch/x86/include/asm/disabled-features.h' diff -u tools/arch/x86/include/asm/disabled-features.h arch/x86/include/asm/disabled-features.h Cc: Borislav Petkov Cc: Fenghua Yu Cc: Sean Christopherson Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/disabled-features.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'tools') diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index b7dd944dc867..8f28fafa98b3 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -56,11 +56,8 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif -#ifdef CONFIG_IOMMU_SUPPORT -# define DISABLE_ENQCMD 0 -#else -# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) -#endif +/* Force disable because it's broken beyond repair */ +#define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) #ifdef CONFIG_X86_SGX # define DISABLE_SGX 0 -- cgit v1.2.3 From b87b04f5019e821c8c6c7761f258402e43500a1f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 12 Jun 2021 18:24:59 -0600 Subject: ipv4: Fix device used for dst_alloc with local routes Oliver reported a use case where deleting a VRF device can hang waiting for the refcnt to drop to 0. The root cause is that the dst is allocated against the VRF device but cached on the loopback device. The use case (added to the selftests) has an implicit VRF crossing due to the ordering of the FIB rules (lookup local is before the l3mdev rule, but the problem occurs even if the FIB rules are re-ordered with local after l3mdev because the VRF table does not have a default route to terminate the lookup). The end result is is that the FIB lookup returns the loopback device as the nexthop, but the ingress device is in a VRF. The mismatch causes the dst alloc against the VRF device but then cached on the loopback. The fix is to bring the trick used for IPv6 (see ip6_rt_get_dev_rcu): pick the dst alloc device based the fib lookup result but with checks that the result has a nexthop device (e.g., not an unreachable or prohibit entry). Fixes: f5a0aab84b74 ("net: ipv4: dst for local input routes should use l3mdev if relevant") Reported-by: Oliver Herms Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 15 ++++++++++++++- tools/testing/selftests/net/fib_tests.sh | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f6787c55f6ab..6a36ac98476f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2056,6 +2056,19 @@ martian_source: return err; } +/* get device for dst_alloc with local routes */ +static struct net_device *ip_rt_get_dev(struct net *net, + const struct fib_result *res) +{ + struct fib_nh_common *nhc = res->fi ? res->nhc : NULL; + struct net_device *dev = NULL; + + if (nhc) + dev = l3mdev_master_dev_rcu(nhc->nhc_dev); + + return dev ? : net->loopback_dev; +} + /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet @@ -2212,7 +2225,7 @@ local_input: } } - rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev, + rth = rt_dst_alloc(ip_rt_get_dev(net, res), flags | RTCF_LOCAL, res->type, IN_DEV_ORCONF(in_dev, NOPOLICY), false); if (!rth) diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 76d9487fb03c..5abe92d55b69 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -1384,12 +1384,37 @@ ipv4_rt_replace() ipv4_rt_replace_mpath } +# checks that cached input route on VRF port is deleted +# when VRF is deleted +ipv4_local_rt_cache() +{ + run_cmd "ip addr add 10.0.0.1/32 dev lo" + run_cmd "ip netns add test-ns" + run_cmd "ip link add veth-outside type veth peer name veth-inside" + run_cmd "ip link add vrf-100 type vrf table 1100" + run_cmd "ip link set veth-outside master vrf-100" + run_cmd "ip link set veth-inside netns test-ns" + run_cmd "ip link set veth-outside up" + run_cmd "ip link set vrf-100 up" + run_cmd "ip route add 10.1.1.1/32 dev veth-outside table 1100" + run_cmd "ip netns exec test-ns ip link set veth-inside up" + run_cmd "ip netns exec test-ns ip addr add 10.1.1.1/32 dev veth-inside" + run_cmd "ip netns exec test-ns ip route add 10.0.0.1/32 dev veth-inside" + run_cmd "ip netns exec test-ns ip route add default via 10.0.0.1" + run_cmd "ip netns exec test-ns ping 10.0.0.1 -c 1 -i 1" + run_cmd "ip link delete vrf-100" + + # if we do not hang test is a success + log_test $? 0 "Cached route removed from VRF port device" +} + ipv4_route_test() { route_setup ipv4_rt_add ipv4_rt_replace + ipv4_local_rt_cache route_cleanup } -- cgit v1.2.3 From 973377ffe8148180b2651825b92ae91988141b05 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 May 2021 12:34:24 +0000 Subject: bpf, selftests: Adjust few selftest outcomes wrt unreachable code In almost all cases from test_verifier that have been changed in here, we've had an unreachable path with a load from a register which has an invalid address on purpose. This was basically to make sure that we never walk this path and to have the verifier complain if it would otherwise. Change it to match on the right error for unprivileged given we now test these paths under speculative execution. There's one case where we match on exact # of insns_processed. Due to the extra path, this will of course mismatch on unprivileged. Thus, restrict the test->insn_processed check to privileged-only. In one other case, we result in a 'pointer comparison prohibited' error. This is similarly due to verifying an 'invalid' branch where we end up with a value pointer on one side of the comparison. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 2 +- tools/testing/selftests/bpf/verifier/and.c | 2 ++ tools/testing/selftests/bpf/verifier/bounds.c | 14 ++++++++++++++ tools/testing/selftests/bpf/verifier/dead_code.c | 2 ++ tools/testing/selftests/bpf/verifier/jmp32.c | 22 ++++++++++++++++++++++ tools/testing/selftests/bpf/verifier/jset.c | 10 ++++++---- tools/testing/selftests/bpf/verifier/unpriv.c | 2 ++ .../selftests/bpf/verifier/value_ptr_arith.c | 7 ++++--- 8 files changed, 53 insertions(+), 8 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 1512092e1e68..3a9e332c5e36 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1147,7 +1147,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, } } - if (test->insn_processed) { + if (!unpriv && test->insn_processed) { uint32_t insn_processed; char *proc; diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c index ca8fdb1b3f01..7d7ebee5cc7a 100644 --- a/tools/testing/selftests/bpf/verifier/and.c +++ b/tools/testing/selftests/bpf/verifier/and.c @@ -61,6 +61,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 0 }, diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index 8a1caf46ffbc..e061e8799ce2 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -508,6 +508,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT }, { @@ -528,6 +530,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT }, { @@ -569,6 +573,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 min value is outside of the allowed memory range", + .result_unpriv = REJECT, .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, @@ -589,6 +595,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 min value is outside of the allowed memory range", + .result_unpriv = REJECT, .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, @@ -609,6 +617,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 min value is outside of the allowed memory range", + .result_unpriv = REJECT, .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, @@ -674,6 +684,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 min value is outside of the allowed memory range", + .result_unpriv = REJECT, .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, @@ -695,6 +707,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 min value is outside of the allowed memory range", + .result_unpriv = REJECT, .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 17fe33a75034..2c8935b3e65d 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -8,6 +8,8 @@ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, -4), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, }, diff --git a/tools/testing/selftests/bpf/verifier/jmp32.c b/tools/testing/selftests/bpf/verifier/jmp32.c index bd5cae4a7f73..1c857b2fbdf0 100644 --- a/tools/testing/selftests/bpf/verifier/jmp32.c +++ b/tools/testing/selftests/bpf/verifier/jmp32.c @@ -87,6 +87,8 @@ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -150,6 +152,8 @@ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -213,6 +217,8 @@ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -280,6 +286,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -348,6 +356,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -416,6 +426,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -484,6 +496,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -552,6 +566,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -620,6 +636,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -688,6 +706,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, @@ -756,6 +776,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, }, diff --git a/tools/testing/selftests/bpf/verifier/jset.c b/tools/testing/selftests/bpf/verifier/jset.c index 8dcd4e0383d5..11fc68da735e 100644 --- a/tools/testing/selftests/bpf/verifier/jset.c +++ b/tools/testing/selftests/bpf/verifier/jset.c @@ -82,8 +82,8 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .retval_unpriv = 1, - .result_unpriv = ACCEPT, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .retval = 1, .result = ACCEPT, }, @@ -141,7 +141,8 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .result_unpriv = ACCEPT, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -162,6 +163,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .result_unpriv = ACCEPT, + .errstr_unpriv = "R9 !read_ok", + .result_unpriv = REJECT, .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index bd436df5cc32..111801aea5e3 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -420,6 +420,8 @@ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R7 invalid mem access 'inv'", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 0, }, diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index 7ae2859d495c..a3e593ddfafc 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -120,7 +120,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different maps, paths or scalars", + .errstr_unpriv = "R2 pointer comparison prohibited", .retval = 0, }, { @@ -159,7 +159,8 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), // fake-dead code; targeted from branch A to - // prevent dead code sanitization + // prevent dead code sanitization, rejected + // via branch B however BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), @@ -167,7 +168,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different maps, paths or scalars", + .errstr_unpriv = "R0 invalid mem access 'inv'", .retval = 0, }, { -- cgit v1.2.3 From 0fd158b89b50b3a31c97a639ff496e1c59686e97 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 16 Jun 2021 16:03:21 +0200 Subject: selftests: net: veth: make test compatible with dash veth.sh is a shell script that uses /bin/sh; some distro (Ubuntu for example) use dash as /bin/sh and in this case the test reports the following error: # ./veth.sh: 21: local: -r: bad variable name # ./veth.sh: 21: local: -r: bad variable name This happens because dash doesn't support the option "-r" with local. Moreover, in case of missing bpf object, the script is exiting -1, that is an illegal number for dash: exit: Illegal number: -1 Change the script to be compatible both with bash and dash and prevent the errors above. Signed-off-by: Andrea Righi Signed-off-by: David S. Miller --- tools/testing/selftests/net/veth.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh index 2fedc0781ce8..11d7cdb898c0 100755 --- a/tools/testing/selftests/net/veth.sh +++ b/tools/testing/selftests/net/veth.sh @@ -18,7 +18,8 @@ ret=0 cleanup() { local ns - local -r jobs="$(jobs -p)" + local jobs + readonly jobs="$(jobs -p)" [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null rm -f $STATS @@ -108,7 +109,7 @@ chk_gro() { if [ ! -f ../bpf/xdp_dummy.o ]; then echo "Missing xdp_dummy helper. Build bpf selftest first" - exit -1 + exit 1 fi create_ns -- cgit v1.2.3 From 1b29df0e2e802cb15a5196c936f494161ec97502 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 16 Jun 2021 16:57:27 +0200 Subject: selftests: net: use bash to run udpgro_fwd test case udpgro_fwd.sh contains many bash specific operators ("[[", "local -r"), but it's using /bin/sh; in some distro /bin/sh is mapped to /bin/dash, that doesn't support such operators. Force the test to use /bin/bash explicitly and prevent false positive test failures. Signed-off-by: Andrea Righi Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgro_fwd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index a8fa64136282..7f26591f236b 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 readonly BASE="ns-$(mktemp -u XXXXXX)" -- cgit v1.2.3 From d8ac05ea13d789d5491a5920d70a05659015441d Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 15 Jun 2021 16:04:43 +0100 Subject: KVM: selftests: Fix kvm_check_cap() assertion KVM_CHECK_EXTENSION ioctl can return any negative value on error, and not necessarily -1. Change the assertion to reflect that. Signed-off-by: Fuad Tabba Message-Id: <20210615150443.1183365-1-tabba@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 5c70596dd1b9..a2b732cf96ea 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -82,7 +82,7 @@ int kvm_check_cap(long cap) kvm_fd = open_kvm_dev_path_or_exit(); ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); - TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n" + TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION IOCTL failed,\n" " rc: %i errno: %i", ret, errno); close(kvm_fd); -- cgit v1.2.3 From 7e9838b7915e29ae0dfe4a3e5f007c9dc6ab9b45 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 18 Jun 2021 13:04:36 +0200 Subject: selftests/net: Add icmp.sh for testing ICMP dummy address responses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a new icmp.sh selftest for testing that the kernel will respond correctly with an ICMP unreachable message with the dummy (192.0.0.8) source address when there are no IPv4 addresses configured to use as source addresses. Signed-off-by: Toke Høiland-Jørgensen Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/icmp.sh | 74 +++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100755 tools/testing/selftests/net/icmp.sh (limited to 'tools') diff --git a/tools/testing/selftests/net/icmp.sh b/tools/testing/selftests/net/icmp.sh new file mode 100755 index 000000000000..e4b04cd1644a --- /dev/null +++ b/tools/testing/selftests/net/icmp.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Test for checking ICMP response with dummy address instead of 0.0.0.0. +# Sets up two namespaces like: +# +----------------------+ +--------------------+ +# | ns1 | v4-via-v6 routes: | ns2 | +# | | ' | | +# | +--------+ -> 172.16.1.0/24 -> +--------+ | +# | | veth0 +--------------------------+ veth0 | | +# | +--------+ <- 172.16.0.0/24 <- +--------+ | +# | 172.16.0.1 | | 2001:db8:1::2/64 | +# | 2001:db8:1::2/64 | | | +# +----------------------+ +--------------------+ +# +# And then tries to ping 172.16.1.1 from ns1. This results in a "net +# unreachable" message being sent from ns2, but there is no IPv4 address set in +# that address space, so the kernel should substitute the dummy address +# 192.0.0.8 defined in RFC7600. + +NS1=ns1 +NS2=ns2 +H1_IP=172.16.0.1/32 +H1_IP6=2001:db8:1::1 +RT1=172.16.1.0/24 +PINGADDR=172.16.1.1 +RT2=172.16.0.0/24 +H2_IP6=2001:db8:1::2 + +TMPFILE=$(mktemp) + +cleanup() +{ + rm -f "$TMPFILE" + ip netns del $NS1 + ip netns del $NS2 +} + +trap cleanup EXIT + +# Namespaces +ip netns add $NS1 +ip netns add $NS2 + +# Connectivity +ip -netns $NS1 link add veth0 type veth peer name veth0 netns $NS2 +ip -netns $NS1 link set dev veth0 up +ip -netns $NS2 link set dev veth0 up +ip -netns $NS1 addr add $H1_IP dev veth0 +ip -netns $NS1 addr add $H1_IP6/64 dev veth0 nodad +ip -netns $NS2 addr add $H2_IP6/64 dev veth0 nodad +ip -netns $NS1 route add $RT1 via inet6 $H2_IP6 +ip -netns $NS2 route add $RT2 via inet6 $H1_IP6 + +# Make sure ns2 will respond with ICMP unreachable +ip netns exec $NS2 sysctl -qw net.ipv4.icmp_ratelimit=0 net.ipv4.ip_forward=1 + +# Run the test - a ping runs in the background, and we capture ICMP responses +# with tcpdump; -c 1 means it should exit on the first ping, but add a timeout +# in case something goes wrong +ip netns exec $NS1 ping -w 3 -i 0.5 $PINGADDR >/dev/null & +ip netns exec $NS1 timeout 10 tcpdump -tpni veth0 -c 1 'icmp and icmp[icmptype] != icmp-echo' > $TMPFILE 2>/dev/null + +# Parse response and check for dummy address +# tcpdump output looks like: +# IP 192.0.0.8 > 172.16.0.1: ICMP net 172.16.1.1 unreachable, length 92 +RESP_IP=$(awk '{print $2}' < $TMPFILE) +if [[ "$RESP_IP" != "192.0.0.8" ]]; then + echo "FAIL - got ICMP response from $RESP_IP, should be 192.0.0.8" + exit 1 +else + echo "OK" + exit 0 +fi -- cgit v1.2.3 From fc96ec4d5d4155c61cbafd49fb2dd403c899a9f4 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 10 Jun 2021 22:32:59 +0800 Subject: perf metricgroup: Fix find_evsel_group() event selector The following command segfaults on my x86 broadwell: $ ./perf stat -M frontend_bound,retiring,backend_bound,bad_speculation sleep 1 WARNING: grouped events cpus do not match, disabling group: anon group { raw 0x10e } anon group { raw 0x10e } perf: util/evsel.c:1596: get_group_fd: Assertion `!(!leader->core.fd)' failed. Aborted (core dumped) The issue shows itself as a use-after-free in evlist__check_cpu_maps(), whereby the leader of an event selector (evsel) has been deleted (yet we still attempt to verify for an evsel). Fundamentally the problem comes from metricgroup__setup_events() -> find_evsel_group(), and has developed from the previous fix attempt in commit 9c880c24cb0d ("perf metricgroup: Fix for metrics containing duration_time"). The problem now is that the logic in checking if an evsel is in the same group is subtly broken for the "cycles" event. For the "cycles" event, the pmu_name is NULL; however the logic in find_evsel_group() may set an event matched against "cycles" as used, when it should not be. This leads to a condition where an evsel is set, yet its leader is not. Fix the check for evsel pmu_name by not matching evsels when either has a NULL pmu_name. There is still a pre-existing metric issue whereby the ordering of the metrics may break the 'stat' function, as discussed at: https://lore.kernel.org/lkml/49c6fccb-b716-1bf0-18a6-cace1cdb66b9@huawei.com/ Fixes: 9c880c24cb0d ("perf metricgroup: Fix for metrics containing duration_time") Signed-off-by: John Garry Tested-by: Arnaldo Carvalho de Melo # On a Thinkpad T450S Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/1623335580-187317-2-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 8336dd8e8098..c456fdeae06a 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -162,10 +162,10 @@ static bool contains_event(struct evsel **metric_events, int num_events, return false; } -static bool evsel_same_pmu(struct evsel *ev1, struct evsel *ev2) +static bool evsel_same_pmu_or_none(struct evsel *ev1, struct evsel *ev2) { if (!ev1->pmu_name || !ev2->pmu_name) - return false; + return true; return !strcmp(ev1->pmu_name, ev2->pmu_name); } @@ -288,7 +288,7 @@ static struct evsel *find_evsel_group(struct evlist *perf_evlist, */ if (!has_constraint && ev->leader != metric_events[i]->leader && - evsel_same_pmu(ev->leader, metric_events[i]->leader)) + evsel_same_pmu_or_none(ev->leader, metric_events[i]->leader)) break; if (!strcmp(metric_events[i]->name, ev->name)) { set_bit(ev->idx, evlist_used); -- cgit v1.2.3 From fe7a98b9d9b36e5c8a22d76b67d29721f153f66e Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 10 Jun 2021 22:33:00 +0800 Subject: perf metricgroup: Return error code from metricgroup__add_metric_sys_event_iter() The error code is not set at all in the sys event iter function. This may lead to an uninitialized value of "ret" in metricgroup__add_metric() when no CPU metric is added. Fix by properly setting the error code. It is not necessary to init "ret" to 0 in metricgroup__add_metric(), as if we have no CPU or sys event metric matching, then "has_match" should be 0 and "ret" is set to -EINVAL. However gcc cannot detect that it may not have been set after the map_for_each_metric() loop for CPU metrics, which is strange. Fixes: be335ec28efa8 ("perf metricgroup: Support adding metrics for system PMUs") Signed-off-by: John Garry Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/1623335580-187317-3-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/metricgroup.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index c456fdeae06a..d3cf2dee36c8 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -1073,16 +1073,18 @@ static int metricgroup__add_metric_sys_event_iter(struct pmu_event *pe, ret = add_metric(d->metric_list, pe, d->metric_no_group, &m, NULL, d->ids); if (ret) - return ret; + goto out; ret = resolve_metric(d->metric_no_group, d->metric_list, NULL, d->ids); if (ret) - return ret; + goto out; *(d->has_match) = true; - return *d->ret; +out: + *(d->ret) = ret; + return ret; } static int metricgroup__add_metric(const char *metric, bool metric_no_group, -- cgit v1.2.3 From c087e9480cf33672ef2c6cce4348d754988b8437 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Sat, 12 Jun 2021 19:37:48 +0200 Subject: perf machine: Fix refcount usage when processing PERF_RECORD_KSYMBOL ASan reported a memory leak of BPF-related ksymbols map and dso. The leak is caused by refount never reaching 0, due to missing __put calls in the function machine__process_ksymbol_register. Once the dso is inserted in the map, dso__put() should be called (map__new2() increases the refcount to 2). The same thing applies for the map when it's inserted into maps (maps__insert() increases the refcount to 2). $ sudo ./perf record -- sleep 5 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.025 MB perf.data (8 samples) ] ================================================================= ==297735==ERROR: LeakSanitizer: detected memory leaks Direct leak of 6992 byte(s) in 19 object(s) allocated from: #0 0x4f43c7 in calloc (/home/user/linux/tools/perf/perf+0x4f43c7) #1 0x8e4e53 in map__new2 /home/user/linux/tools/perf/util/map.c:216:20 #2 0x8cf68c in machine__process_ksymbol_register /home/user/linux/tools/perf/util/machine.c:778:10 [...] Indirect leak of 8702 byte(s) in 19 object(s) allocated from: #0 0x4f43c7 in calloc (/home/user/linux/tools/perf/perf+0x4f43c7) #1 0x8728d7 in dso__new_id /home/user/linux/tools/perf/util/dso.c:1256:20 #2 0x872015 in dso__new /home/user/linux/tools/perf/util/dso.c:1295:9 #3 0x8cf623 in machine__process_ksymbol_register /home/user/linux/tools/perf/util/machine.c:774:21 [...] Indirect leak of 1520 byte(s) in 19 object(s) allocated from: #0 0x4f43c7 in calloc (/home/user/linux/tools/perf/perf+0x4f43c7) #1 0x87b3da in symbol__new /home/user/linux/tools/perf/util/symbol.c:269:23 #2 0x888954 in map__process_kallsym_symbol /home/user/linux/tools/perf/util/symbol.c:710:8 [...] Indirect leak of 1406 byte(s) in 19 object(s) allocated from: #0 0x4f43c7 in calloc (/home/user/linux/tools/perf/perf+0x4f43c7) #1 0x87b3da in symbol__new /home/user/linux/tools/perf/util/symbol.c:269:23 #2 0x8cfbd8 in machine__process_ksymbol_register /home/user/linux/tools/perf/util/machine.c:803:8 [...] Signed-off-by: Riccardo Mancini Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Jiapeng Chong Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Tommi Rantala Link: http://lore.kernel.org/lkml/20210612173751.188582-1-rickyman7@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 3ff4936a15a4..da19be7da284 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -776,10 +776,10 @@ static int machine__process_ksymbol_register(struct machine *machine, if (dso) { dso->kernel = DSO_SPACE__KERNEL; map = map__new2(0, dso); + dso__put(dso); } if (!dso || !map) { - dso__put(dso); return -ENOMEM; } @@ -792,6 +792,7 @@ static int machine__process_ksymbol_register(struct machine *machine, map->start = event->ksymbol.addr; map->end = map->start + event->ksymbol.len; maps__insert(&machine->kmaps, map); + map__put(map); dso__set_loaded(dso); if (is_bpf_image(event->ksymbol.name)) { -- cgit v1.2.3 From 482698c2f848f9dee1a5bd949793c2fe6a71adc5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 17 Jun 2021 11:42:13 -0700 Subject: perf test: Fix non-bash issue with stat bpf counters $(( .. )) is a bash feature but the test's interpreter is !/bin/sh, switch the code to use expr. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: bpf@vger.kernel.org Link: http://lore.kernel.org/lkml/20210617184216.2075588-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat_bpf_counters.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/tests/shell/stat_bpf_counters.sh b/tools/perf/tests/shell/stat_bpf_counters.sh index 22eb31e48ca7..2f9948b3d943 100755 --- a/tools/perf/tests/shell/stat_bpf_counters.sh +++ b/tools/perf/tests/shell/stat_bpf_counters.sh @@ -11,9 +11,9 @@ compare_number() second_num=$2 # upper bound is first_num * 110% - upper=$(( $first_num + $first_num / 10 )) + upper=$(expr $first_num + $first_num / 10 ) # lower bound is first_num * 90% - lower=$(( $first_num - $first_num / 10 )) + lower=$(expr $first_num - $first_num / 10 ) if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then echo "The difference between $first_num and $second_num are greater than 10%." -- cgit v1.2.3 From ef83f9efe8461b8fd71eb60b53dbb6a5dd7b39e9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 19 Jun 2021 10:09:08 -0300 Subject: perf beauty: Update copy of linux/socket.h with the kernel sources To pick the changes in: ea6932d70e223e02 ("net: make get_net_ns return error if NET_NS is disabled") That don't result in any changes in the tables generated from that header. This silences this perf build warning: Warning: Kernel ABI header at 'tools/perf/trace/beauty/include/linux/socket.h' differs from latest version at 'include/linux/socket.h' diff -u tools/perf/trace/beauty/include/linux/socket.h include/linux/socket.h Cc: Changbin Du Cc: David S. Miller Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/include/linux/socket.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index b8fc5c53ba6f..0d8e3dcb7f88 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -438,6 +438,4 @@ extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); extern int __sys_shutdown_sock(struct socket *sock, int how); extern int __sys_shutdown(int fd, int how); - -extern struct ns_common *get_net_ns(struct ns_common *ns); #endif /* _LINUX_SOCKET_H */ -- cgit v1.2.3 From 17d27fc314cba0205eec8966735a7a241cc8a5e0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 19 Jun 2021 10:11:46 -0300 Subject: tools headers UAPI: Sync asm-generic/unistd.h with the kernel original To pick the changes in: 8b1462b67f23da54 ("quota: finish disable quotactl_path syscall") Those headers are used in some arches to generate the syscall table used in 'perf trace' to translate syscall numbers into strings. This addresses this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Cc: Jan Kara Cc: Marcin Juszkiewicz Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/unistd.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 6de5a7fc066b..d2a942086fcb 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -863,8 +863,7 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise) __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) #define __NR_mount_setattr 442 __SYSCALL(__NR_mount_setattr, sys_mount_setattr) -#define __NR_quotactl_path 443 -__SYSCALL(__NR_quotactl_path, sys_quotactl_path) +/* 443 is reserved for quotactl_path */ #define __NR_landlock_create_ruleset 444 __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) -- cgit v1.2.3 From 1792a59eab9593de2eae36c40c5a22d70f52c026 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 19 Jun 2021 10:15:22 -0300 Subject: tools headers UAPI: Sync linux/in.h copy with the kernel sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To pick the changes in: 321827477360934d ("icmp: don't send out ICMP messages with a source address of 0.0.0.0") That don't result in any change in tooling, as INADDR_ are not used to generate id->string tables used by 'perf trace'. This addresses this build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/in.h' differs from latest version at 'include/uapi/linux/in.h' diff -u tools/include/uapi/linux/in.h include/uapi/linux/in.h Cc: David S. Miller Cc: Toke Høiland-Jørgensen Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/in.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'tools') diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h index 7d6687618d80..d1b327036ae4 100644 --- a/tools/include/uapi/linux/in.h +++ b/tools/include/uapi/linux/in.h @@ -289,6 +289,9 @@ struct sockaddr_in { /* Address indicating an error return. */ #define INADDR_NONE ((unsigned long int) 0xffffffff) +/* Dummy address for src of ICMP replies if no real address is set (RFC7600). */ +#define INADDR_DUMMY ((unsigned long int) 0xc0000008) + /* Network number for local host loopback. */ #define IN_LOOPBACKNET 127 -- cgit v1.2.3 From 309505dd56854c1f9744c9a2b8aa40d897002bca Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 24 Jun 2021 15:09:31 +0800 Subject: KVM: selftests: Fix mapping length truncation in m{,un}map() max_mem_slots is now declared as uint32_t. The result of (0x200000 * 32767) is unexpectedly truncated to be 0xffe00000, whilst we actually need to allocate about, 63GB. Cast max_mem_slots to size_t in both mmap() and munmap() to fix the length truncation. We'll otherwise see the failure on arm64 thanks to the access_ok() checking in __kvm_set_memory_region(), as the unmapped VA happen to go beyond the task's allowed address space. # ./set_memory_region_test Allowed number of memory slots: 32767 Adding slots 0..32766, each memory region with 2048K size ==== Test Assertion Failure ==== set_memory_region_test.c:391: ret == 0 pid=94861 tid=94861 errno=22 - Invalid argument 1 0x00000000004015a7: test_add_max_memory_regions at set_memory_region_test.c:389 2 (inlined by) main at set_memory_region_test.c:426 3 0x0000ffffb8e67bdf: ?? ??:0 4 0x00000000004016db: _start at :? KVM_SET_USER_MEMORY_REGION IOCTL failed, rc: -1 errno: 22 slot: 2615 Fixes: 3bf0fcd75434 ("KVM: selftests: Speed up set_memory_region_test") Signed-off-by: Zenghui Yu Message-Id: <20210624070931.565-1-yuzenghui@huawei.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/set_memory_region_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 978f5b5f4dc0..d8812f27648c 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -376,7 +376,7 @@ static void test_add_max_memory_regions(void) pr_info("Adding slots 0..%i, each memory region with %dK size\n", (max_mem_slots - 1), MEM_REGION_SIZE >> 10); - mem = mmap(NULL, MEM_REGION_SIZE * max_mem_slots + alignment, + mem = mmap(NULL, (size_t)max_mem_slots * MEM_REGION_SIZE + alignment, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host"); mem_aligned = (void *)(((size_t) mem + alignment - 1) & ~(alignment - 1)); @@ -401,7 +401,7 @@ static void test_add_max_memory_regions(void) TEST_ASSERT(ret == -1 && errno == EINVAL, "Adding one more memory slot should fail with EINVAL"); - munmap(mem, MEM_REGION_SIZE * max_mem_slots + alignment); + munmap(mem, (size_t)max_mem_slots * MEM_REGION_SIZE + alignment); munmap(mem_extra, MEM_REGION_SIZE); kvm_vm_free(vm); } -- cgit v1.2.3